Skip to content

Commit 6b65bdd

Browse files
switching to C++0X for unicode support, fleshing out skeleton
1 parent 2fc9a01 commit 6b65bdd

File tree

6 files changed

+179
-80
lines changed

6 files changed

+179
-80
lines changed

src/Makefile.am

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,11 @@ AUTOMAKE_OPTIONS = 1.4 foreign
22

33
WARNS = -W -Wall -Wshadow -Wpointer-arith
44
IFLAGS =
5-
FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE
5+
FLAGS = -g -O3 -funroll-loops -D_THREAD_SAFE -D_GNU_SOURCE -march=native
66
VERBOSE = -DVERBOSE
77
AM_CFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS)
8-
AM_LDFLAGS = -g
8+
AM_CXXFLAGS = $(IFLAGS) $(VERBOSE) $(WARNS) $(FLAGS) -std=c++0x
9+
AM_LDFLAGS = -g -licuuc
910

1011
noinst_HEADERS = \
1112
common_impl.h wg_mempool.h

src/my_createfp.cxx

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,20 @@
11
#include <iterator>
2+
#include <vector>
3+
#include <algorithm>
4+
#include <string>
25
#include <iostream>
36

47
#include "textcat++.hxx"
58

69
int main(int argc, char* argv[]){
7-
textcat::fingerprint fp(std::istream_iterator<char>(std::cin), 400);
8-
std::cout<< fp << std::endl;
10+
typedef std::tuple<std::string, std::string> lang_info_type;
11+
std::vector<lang_info_type> languages;
12+
languages.push_back(lang_info_type(argv[1], argv[2]));
13+
textcat::classifier<5> cf(languages.begin(), languages.end());
14+
std::vector<std::string> results;
15+
std::istream_iterator<char> b(std::cin), e;
16+
cf(b, e, std::back_inserter(results));
17+
std::copy(results.begin(), results.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
18+
std::cout << std::endl;
919
return 0;
1020
}

src/ranks.hxx

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#ifndef RANKS_HXX
2+
#define RANKS_HXX
3+
4+
#include <iterator>
5+
#include <functional>
6+
#include <algorithm>
7+
#include <vector>
8+
9+
/**
 * Adapts a comparator over values into a comparator over positions.
 *
 * The adapter stores an iterator `begin` and a comparator `comp`; calling it
 * with two offsets compares the elements found at `begin + i1` and
 * `begin + i2`. `In` therefore has to support `iterator + offset`.
 *
 * @tparam In        iterator type of the underlying sequence
 * @tparam Comp      callable taking two dereferenced `In` values
 * @tparam SizeType  type of the offsets passed to operator()
 */
template<typename In, typename Comp, typename SizeType= std::size_t>
struct indexed_binary_function {
    // Argument typedefs are spelled out here instead of inheriting from
    // std::binary_function: that base was given its template arguments in the
    // wrong order (result type first) and no longer exists in C++17.
    typedef SizeType first_argument_type;
    typedef SizeType second_argument_type;

    indexed_binary_function(In b, Comp c):begin(b), comp(c){}

    // Declared before operator() so that its trailing return type can name them.
    In begin;
    Comp comp;

    // The result type is deduced from the comparator call itself, so comparators
    // without a nested result_type (lambdas for instance) are accepted too.
    auto operator()(SizeType i1, SizeType i2) const
        -> decltype(comp(*(begin+i1), *(begin+i2)))
    { return comp(*(begin+i1), *(begin+i2)); }
};
18+
19+
/**
 * Writes to `o` the positions 0 .. (e-b)-1 of the range [b, e), ordered so that
 * the elements they designate are sorted according to `comp`.
 *
 * `In` must support `e - b` and `b + n`. The comparison is done through a local
 * lambda, so any callable comparator works (including lambdas, which have no
 * nested result_type).
 *
 * @param b,e   range whose elements are compared (the range itself is not modified)
 * @param o     output iterator receiving the std::size_t positions
 * @param comp  strict weak ordering on the elements of [b, e)
 * @return      the output iterator past the last position written
 */
template<typename In, typename Out, typename Comp>
Out indexes(In b, In e, Out o, Comp comp) {
    std::vector<std::size_t> idx(e-b);
    for(std::size_t i(0); i != idx.size(); ++i)
    { idx[i]= i;}
    // Sort the positions, not the values: compare what each position points at.
    std::sort(idx.begin(), idx.end()
              , [b, comp](std::size_t i1, std::size_t i2) -> bool
                { return comp(*(b+i1), *(b+i2)); });
    return std::copy(idx.begin(), idx.end(), o);
}

/**
 * Same as the four-argument overload, comparing elements with std::less on the
 * iterator's value_type (ascending order).
 */
template<typename In, typename Out> Out indexes(In b, In e, Out o) {
    return indexes(b, e, o, std::less<typename std::iterator_traits<In>::value_type>());
}
31+
32+
33+
/**
 * Writes to `o`, for every element of [b, e) in its original order, the rank
 * that element holds once the range is ordered by `comp` (0 for the first).
 *
 * The sorted positions are obtained from indexes() and then inverted:
 * if position p comes r-th in sorted order, the element at p has rank r.
 *
 * @return the output iterator past the last rank written
 */
template<typename In, typename Out, typename Comp>
Out ranks(In b, In e, Out o, Comp comp) {
    std::size_t const count(e-b);
    std::vector<std::size_t> order(count);
    indexes(b, e, order.begin(), comp);

    std::vector<std::size_t> rank_of(count);
    std::size_t rank(0);
    for(std::vector<std::size_t>::const_iterator it(order.begin())
        ; it != order.end(); ++it, ++rank)
    { rank_of[*it]= rank; }

    return std::copy(rank_of.begin(), rank_of.end(), o);
}

/**
 * Same as the four-argument overload, ordering elements with std::less on the
 * iterator's value_type.
 */
template<typename In, typename Out> Out ranks(In b, In e, Out o) {
    typedef typename std::iterator_traits<In>::value_type element_type;
    return ranks(b, e, o, std::less<element_type>());
}
45+
46+
#endif

src/textcat++.hxx

Lines changed: 111 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,35 @@
11
#ifndef TEXTCAT_HXX
22
#define TEXTCAT_HXX
33
#include <iterator>
4+
#include <iostream>
5+
#include <fstream>
6+
#include <tuple>
7+
#include <functional>
8+
#include <unordered_map>
9+
#include <vector>
10+
#include <algorithm>
11+
#include <limits>
412

13+
extern "C" {
14+
//:TODO: adjust configure
15+
#include <unicode/uchar.h>
16+
}
17+
#include "utf_codecs.hxx"
18+
#include "ranks.hxx"
19+
/*
20+
various data structures :
21+
ngrams : compact array of bitfields :TODO: make a generic reusable template: how to specialize std::rotate for it ?
22+
hash_table of ngrams and counts
23+
vector of ngrams and counts
24+
vector of ngrams and ranks : fingerprints
25+
vector of fingerprints and names : smart pointers ? iterators ? how to avoid copy ?
26+
only one needed per app, maybe no copies at all
27+
*/
528
namespace textcat {
29+
30+
char32_t constexpr placeholder_char= U'_';
31+
// Normalizes a code point: alphabetic characters (per ICU u_isalpha) are
// upper-cased with u_toupper, everything else becomes placeholder_char.
// Marked inline because this function is defined in a header; without it every
// translation unit including the header would emit its own definition and the
// link would fail with multiple definitions.
inline char32_t to_normal(char32_t c) { return u_isalpha(c) ? u_toupper(c) : placeholder_char; } //u_isUAlphabetic
32+
633
// storing ngrams for 1 <= n <= N
734
template<unsigned char N>
835
struct ngram {
@@ -18,18 +45,21 @@ namespace textcat {
1845
return static_cast<std::size_t>(data)
1946
^ static_cast<std::size_t>(data>>(8*sizeof(std::size_t)));
2047
}
21-
friend bool operator<(ngram<N> const& n1, ngram<N> const& n2);
22-
friend bool operator==(ngram<N> const& n1, ngram<N> const& n2);
23-
template<typename Os> Os& operator<<(Os& os, ngram<N> const& n);
48+
template<unsigned char NN> friend bool operator<(ngram<NN> const& n1, ngram<NN> const& n2);
49+
template<unsigned char NN> friend bool operator==(ngram<NN> const& n1, ngram<NN> const& n2);
50+
template<typename Os, unsigned char NN> friend Os& operator<<(Os& os, ngram<NN> const& n);
2451
typedef __int128 data_type;
2552
data_type data;
2653
};
54+
template<unsigned char N>
2755
bool operator<(ngram<N> const& n1, ngram<N> const& n2)
2856
{ return n1.data < n2.data; }
57+
template<unsigned char N>
2958
bool operator==(ngram<N> const& n1, ngram<N> const& n2)
3059
{ return n1.data == n2.data; }
3160

32-
template<typename Os> Os& operator<<(Os& os, ngram<N> const& n) {
61+
template<typename Os, unsigned char N>
62+
Os& operator<<(Os& os, ngram<N> const& n) {
3363
for(unsigned char i(0); i != N; ++i){
3464
char32_t const c (static_cast<char32_t>(n.data >> 21*i) & ((1 << 21)-1));
3565
if(c) { to_utf8(c, os);}
@@ -39,45 +69,47 @@ namespace textcat {
3969
}
4070
}
4171
return os;
72+
}
4273
}
74+
4375
namespace std {
44-
namespace tr1 {
45-
template<unsigned char N> hash<textcat::ngram<N> > {
46-
std::size_t operator()(textcat::ngram<N> const& ng) const
47-
{ return ng.hash() ; }
48-
};
49-
}
76+
template<unsigned char N> struct hash<textcat::ngram<N> > {
77+
std::size_t operator()(textcat::ngram<N> const& ng) const
78+
{ return ng.hash() ; }
79+
};
5080
}
81+
5182
namespace textcat {
5283

53-
template<unsigned char N, typedef SizeType= std::size_t>
84+
template<unsigned char N, typename SizeType= std::size_t>
5485
struct counts {
5586
typedef ngram<N> ngram_type;
5687
typedef SizeType size_type;
57-
typedef std::tr1::map<ngram_type, size_type> map_type;
58-
typedef typename map_type::value_type value_type;
88+
typedef std::unordered_map<ngram_type, size_type> map_type;
89+
typedef typename std::pair<ngram_type, size_type> value_type;
5990
typedef std::vector<value_type> fingerprint_type;
6091
};
6192

6293
template<unsigned char N, typename In>
63-
typename counts<N>::map_type make_counts(In b, In e, std::size_t max_n){
64-
typename counts<N>::map_type counts;
94+
typename counts<N>::map_type make_counts(In b, In e, std::size_t max_ngrams){
95+
typename counts<N>::map_type counters;
6596
typedef typename counts<N>::ngram_type ngram_type;
66-
ngram_type ng;
67-
std::size_t n;
68-
for( n=0, ng(U'_'), was_invalid=true; (b != e) && (n != max_ngrams)
97+
ngram_type ng;
98+
std::size_t n;
99+
bool was_invalid, invalid;
100+
for( n=0, ng(placeholder_char), was_invalid=true; (b != e) && (n != max_ngrams)
69101
; was_invalid= invalid) {
70102
char32_t const new_codepoint(to_normal(from_utf8(b, e)));
71-
invalid= (new_char == U'_');
103+
invalid= (new_codepoint == placeholder_char);
72104
if(! (invalid && was_invalid)) {
73105
ng(new_codepoint);
74-
++counts[ng];
106+
++counters[ng];
75107
++n;
76108
}
77109
if(invalid && !was_invalid)
78-
{ ng= ngram_type(U'_');}
110+
{ ng= ngram_type(placeholder_char);}
79111
}
80-
return counts;
112+
return counters;
81113
}
82114

83115
template<unsigned char N>
@@ -86,20 +118,16 @@ namespace textcat {
86118
typedef typename counts<N>::value_type value_type;
87119
typename counts<N>::fingerprint_type result(c.begin(), c.end());
88120
typedef typename value_type::second_type size_type;
89-
if(maxngrams > tmp.size()) { maxngrams= tmp.size(); }
90-
std::partial_sort(tmp.begin(), tmp.begin()+maxngrams, tmp.end()
91-
, std::tr1::bind(std::greater<size_type>()
92-
, std::tr1::bind<size_type>(&pair_type::second
93-
,std::tr1::placeholders::_1)
94-
, std::tr1::bind<size_type>(&pair_type::second
95-
,std::tr1::placeholders::_2)));
121+
if(maxngrams > result.size()) { maxngrams= result.size(); }
122+
std::partial_sort(result.begin(), result.begin()+maxngrams, result.end()
123+
,[](value_type const& v1, value_type const& v2){ return v1.second > v2.second; });
96124
result.resize(maxngrams);
97125
return result;
98126
}
99127

100128
template<typename InOut> InOut count_to_rank(InOut b, InOut e){
101129
if(b!= e){
102-
for(std::iterator_traits<InOut>::value_type last((*b).second), rnk(0)
130+
for(typename std::iterator_traits<InOut>::value_type::second_type last((*b).second), rnk(0)
103131
; b != e; ++b) {
104132
if((*b).second != last){ last= (*b).second; ++rnk;}
105133
(*b).second= rnk;
@@ -110,90 +138,98 @@ namespace textcat {
110138

111139
// Input Stream data is supposed to be already sorted
112140
// imbue maxngrams and >> ?
113-
template<unsigned char N, typename Is> typename counts<N>::fingerprint_type read(Is& is, std::size_t maxngrams) {
141+
template<unsigned char N, typename Is> typename counts<N>::fingerprint_type read(Is& is, std::size_t maxngrams= std::numeric_limits<std::size_t>::max()) {
114142
typename counts<N>::fingerprint_type result;
115143
std::string tmp;
116144
std::size_t count;
117145
while(std::getline(is, tmp, '\t') && (maxngrams--)){
118146
is >> count;
119147
if(!is) { break; }
120148
ngram<N> ng;
121-
result.push_back(std::for_each(tmp.begin(), tmp.end(), ng), count);
149+
result.push_back(std::make_pair(std::for_each(tmp.begin(), tmp.end(), ng)
150+
, count));
122151
}
123152
count_to_rank(result.begin(), result.end());
124153
return result;
125154
}
126155

127156
template<unsigned char N, typename Os> Os& operator<<(Os& os, typename counts<N>::fingerprint_type fp) {
128-
for(typename counts<N>::fingerprint_type::const_iterator it(fp.begin()); it != fp.end(); ++it){
129-
os<<(*it).first << '\t' << (*it).second << '\n';
130-
}
157+
std::for_each(fp.begin(), fp.end()
158+
, [&os](typename counts<N>::value_type const& v)
159+
{os << v.first << '\t' << v.second << '\n';});
160+
// for(typename counts<N>::fingerprint_type::const_iterator it(fp.begin()); it != fp.end(); ++it){
161+
// os<<(*it).first << '\t' << (*it).second << '\n';
162+
// }
131163
return os;
132164
}
133165

134-
template<unsigned char N >
135166
struct scorer {
136-
scorer(std::size_t out_of_place= 400
137-
, std::size_t max_score= std::numeric_limits<std::size_t>::max()
138-
, float treshold_ratio=1.03):
139-
out_of_place(out_of_place), max_score(max_score)
140-
, cutoff(max_score), treshold_ratio(treshold_ratio){}
141-
142-
std::size_t operator()(typename score<N>::fingerprint_type const& f1
143-
,typename score<N>::fingerprint_type const& f2){
144-
typedef typename score<N>::fingerprint_type fp_type;
167+
scorer(float ratio=1.03, std::size_t oop= 400
168+
, std::size_t max_s= std::numeric_limits<std::size_t>::max()
169+
):
170+
out_of_place(oop), max_score(max_s)
171+
, cutoff(max_s), treshold_ratio(ratio){}
172+
template<unsigned char N>
173+
std::size_t operator()(typename counts<N>::fingerprint_type const& f1
174+
, typename counts<N>::fingerprint_type const& f2){
175+
typedef typename counts<N>::fingerprint_type fp_type;
145176
std::size_t result(0);
146177
for(typename fp_type::const_iterator it1(f1.begin()), it2(f2.begin())
147178
; (it1 != f1.end()) && (it2 != f2.end()) && (result != max_score)
148179
; ) {
149180
if( (*it1).first < (*it2).first) { ++it1; }
150181
else {
151182
result += ((*it1).first == (*it2).first)
152-
? std::abs((*it1++).second - (*it2).second);
183+
? std::abs((*(it1++)).second - (*it2).second)
153184
: out_of_place;
154185
if(result > cutoff) { result= max_score; }
155186
++it2;
156187
}
157188
}
158-
cutoff= std::min(cutoff, result* treshold_ratio);// check no overflow
189+
cutoff= std::min(cutoff, static_cast<std::size_t>(result* treshold_ratio));// check no overflow
159190
return result;
160191
}
161192
std::size_t const out_of_place, max_score;
162193
std::size_t cutoff;
163-
float treshold_ratio;
194+
float const treshold_ratio;
164195
};
165196

197+
198+
166199
template<unsigned char N> struct classifier {
167-
template<typename In> classifier(In b, In e)
168-
std::size_t score( typename counts<N>::fingerprint_type const&
169-
std::array<CountsType::mapped_type::size * 4> string_type;
200+
typedef typename counts<N>::fingerprint_type fingerprint_type;
201+
typedef std::tuple<std::string, fingerprint_type> language_type;
202+
std::vector<language_type> languages;
170203

171-
struct fingerprint {
172-
template<typename In>
173-
fingerprint(In b, In e, std::size_t max_ngrams){
174-
bool invalid, was_invalid;
175-
typedef ngram<5> ngram_type;
176-
ngram_type ng;
177-
std::size_t n;
178-
for( n=0, ng(U'_'), was_invalid=true; (b != e) && (n != max_ngrams); was_invalid= invalid) {
179-
char32_t const new_codepoint(to_normal(from_utf8(b, e)));
180-
invalid= (new_char == U'_');
181-
if(! (invalid && was_invalid)) {
182-
ng(new_codepoint);
183-
++counts[ng];
184-
++n;
185-
}
186-
if(invalid && !was_invalid)
187-
{ ng= ngram_type(U'_');}
188-
}
204+
// for now, taking iterators to std::pairs of languagename, filename
205+
template<typename In> classifier(In b, In e) {
206+
std::transform(b, e, std::back_inserter(languages)
207+
, [](typename std::iterator_traits<In>::value_type const& v)
208+
-> language_type {
209+
std::ifstream ifs(std::get<1>(v));
210+
return language_type(std::get<0>(v)
211+
, read<N>(ifs)); });
189212
}
190-
template<typename Out> Out ranks(Out o ) const {
191-
std::vector<counts_type::value_type>
192-
std::copy(counts.begin(), counts.end(),
213+
214+
template<typename In, typename Out> Out operator()(In b, In e, Out o
215+
, std::size_t max_read=std::numeric_limits<std::size_t>::max()) const {
216+
float const treshold_ratio (1.03);
217+
//:TODO: LRU caching of detected languages
218+
typename counts<N>::fingerprint_type fp(to_vector<N>(make_counts<N>(b, e, max_read), max_read));
219+
count_to_rank(fp.begin(), fp.end());
220+
typename counts<N>::fingerprint_type const& cfp(fp);
221+
std::vector<std::size_t> scores(languages.size()), idx(languages.size());
222+
scorer s(treshold_ratio);
223+
std::transform(languages.begin(), languages.end(), scores.begin()
224+
,[&s, &fp](language_type const& lang){ return s.operator()<N>(fp, std::get<1>(lang));});
225+
indexes(scores.begin(), scores.end(), idx.begin());
226+
std::size_t const best(scores[idx.front()]);
227+
for(auto it(idx.begin()); (it != idx.end()) || (scores[*it] <= best*treshold_ratio); ++it, ++o)
228+
{ *o= std::get<0>(languages[*it]); }
229+
return o;
193230
}
194231
};
195-
template<typename Os> Os& operator<<(Os& os, fingerprint const& fp){
196-
return os;
197-
}
198232
}
233+
234+
199235
#endif

src/textcat.cxx

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#include "textcat.hxx"
2+
3+
namespace textcat {
    // Placeholder function; currently a no-op.
    void test() {
    }
}

0 commit comments

Comments
 (0)