11#ifndef TEXTCAT_HXX
22#define TEXTCAT_HXX
33#include < iterator>
4+ #include < iostream>
5+ #include < fstream>
6+ #include < tuple>
7+ #include < functional>
8+ #include < unordered_map>
9+ #include < vector>
10+ #include < algorithm>
11+ #include < limits>
412
13+ extern " C" {
14+ // :TODO: adjust configure
15+ #include < unicode/uchar.h>
16+ }
17+ #include " utf_codecs.hxx"
18+ #include " ranks.hxx"
/*
 various data structures :
 ngrams : compact array of bitfields :TODO: make a generic reusable template: how to specialize std::rotate for it ?
 hash_table of ngrams and counts
 vector of ngrams and counts
 vector of ngrams and ranks : fingerprints
 vector of fingerprints and names : smart pointers ? iterators ? how to avoid copy ?
 only one needed per app, maybe no copies at all
*/
528namespace textcat {
29+
30+ char32_t constexpr placeholder_char= U' _' ;
31+ char32_t to_normal (char32_t c) { return u_isalpha (c) ? u_toupper (c) : placeholder_char; } // u_isUAlphabetic
32+
633 // storing ngrams for 1 <= n <= N
734 template <unsigned char N>
835 struct ngram {
@@ -18,18 +45,21 @@ namespace textcat {
1845 return static_cast <std::size_t >(data)
1946 ^ static_cast <std::size_t >(data>>(8 *sizeof (std::size_t )));
2047 }
21- friend bool operator <(ngram<N > const & n1, ngram<N > const & n2);
22- friend bool operator ==(ngram<N > const & n1, ngram<N > const & n2);
23- template <typename Os> Os& operator <<(Os& os, ngram<N > const & n);
48+ template < unsigned char NN> friend bool operator <(ngram<NN > const & n1, ngram<NN > const & n2);
49+ template < unsigned char NN> friend bool operator ==(ngram<NN > const & n1, ngram<NN > const & n2);
50+ template <typename Os, unsigned char NN> friend Os& operator <<(Os& os, ngram<NN > const & n);
2451 typedef __int128 data_type;
2552 data_type data;
2653 };
54+ template <unsigned char N>
2755 bool operator <(ngram<N> const & n1, ngram<N> const & n2)
2856 { return n1.data < n2.data ; }
57+ template <unsigned char N>
2958 bool operator ==(ngram<N> const & n1, ngram<N> const & n2)
3059 { return n1.data == n2.data ; }
3160
32- template <typename Os> Os& operator <<(Os& os, ngram<N> const & n) {
61+ template <typename Os, unsigned char N>
62+ Os& operator <<(Os& os, ngram<N> const & n) {
3363 for (unsigned char i (0 ); i != N; ++i){
3464 char32_t const c (static_cast <char32_t >(n.data >> 21 *i) & ((1 << 21 )-1 ));
3565 if (c) { to_utf8 (c, os);}
@@ -39,45 +69,47 @@ namespace textcat {
3969 }
4070 }
4171 return os;
72+ }
4273}
74+
4375namespace std {
44- namespace tr1 {
45- template <unsigned char N> hash<textcat::ngram<N> > {
46- std::size_t operator ()(textcat::ngram<N> const & ng) const
47- { return ng.hash () ; }
48- };
49- }
76+ template <unsigned char N> struct hash <textcat::ngram<N> > {
77+ std::size_t operator ()(textcat::ngram<N> const & ng) const
78+ { return ng.hash () ; }
79+ };
5080}
81+
5182namespace textcat {
5283
53- template <unsigned char N, typedef SizeType= std::size_t >
84+ template <unsigned char N, typename SizeType= std::size_t >
5485 struct counts {
5586 typedef ngram<N> ngram_type;
5687 typedef SizeType size_type;
57- typedef std::tr1::map <ngram_type, size_type> map_type;
58- typedef typename map_type::value_type value_type;
88+ typedef std::unordered_map <ngram_type, size_type> map_type;
89+ typedef typename std::pair<ngram_type, size_type> value_type;
5990 typedef std::vector<value_type> fingerprint_type;
6091 };
6192
6293 template <unsigned char N, typename In>
63- typename counts<N>::map_type make_counts (In b, In e, std::size_t max_n ){
64- typename counts<N>::map_type counts ;
94+ typename counts<N>::map_type make_counts (In b, In e, std::size_t max_ngrams ){
95+ typename counts<N>::map_type counters ;
6596 typedef typename counts<N>::ngram_type ngram_type;
66- ngram_type ng;
67- std::size_t n;
68- for ( n=0 , ng (U' _' ), was_invalid=true ; (b != e) && (n != max_ngrams)
97+ ngram_type ng;
98+ std::size_t n;
99+ bool was_invalid, invalid;
100+ for ( n=0 , ng (placeholder_char), was_invalid=true ; (b != e) && (n != max_ngrams)
69101 ; was_invalid= invalid) {
70102 char32_t const new_codepoint (to_normal (from_utf8 (b, e)));
71- invalid= (new_char == U ' _ ' );
103+ invalid= (new_codepoint == placeholder_char );
72104 if (! (invalid && was_invalid)) {
73105 ng (new_codepoint);
74- ++counts [ng];
106+ ++counters [ng];
75107 ++n;
76108 }
77109 if (invalid && !was_invalid)
78- { ng= ngram_type (U ' _ ' );}
110+ { ng= ngram_type (placeholder_char );}
79111 }
80- return counts ;
112+ return counters ;
81113 }
82114
83115 template <unsigned char N>
@@ -86,20 +118,16 @@ namespace textcat {
86118 typedef typename counts<N>::value_type value_type;
87119 typename counts<N>::fingerprint_type result (c.begin (), c.end ());
88120 typedef typename value_type::second_type size_type;
89- if (maxngrams > tmp.size ()) { maxngrams= tmp.size (); }
90- std::partial_sort (tmp.begin (), tmp.begin ()+maxngrams, tmp.end ()
91- , std::tr1::bind (std::greater<size_type>()
92- , std::tr1::bind<size_type>(&pair_type::second
93- ,std::tr1::placeholders::_1)
94- , std::tr1::bind<size_type>(&pair_type::second
95- ,std::tr1::placeholders::_2)));
121+ if (maxngrams > result.size ()) { maxngrams= result.size (); }
122+ std::partial_sort (result.begin (), result.begin ()+maxngrams, result.end ()
123+ ,[](value_type const & v1, value_type const & v2){ return v1.second > v2.second ; });
96124 result.resize (maxngrams);
97125 return result;
98126 }
99127
100128 template <typename InOut> InOut count_to_rank (InOut b, InOut e){
101129 if (b!= e){
102- for (std::iterator_traits<InOut>::value_type last ((*b).second ), rnk (0 )
130+ for (typename std::iterator_traits<InOut>::value_type::second_type last ((*b).second ), rnk (0 )
103131 ; b != e; ++b) {
104132 if ((*b).second != last){ last= (*b).second ; ++rnk;}
105133 (*b).second = rnk;
@@ -110,90 +138,98 @@ namespace textcat {
110138
111139// Input Stream data is supposed to be already sorted
112140// imbue maxngrams and >> ?
113- template <unsigned char N, typename Is> typename counts<N>::fingerprint_type read (Is& is, std::size_t maxngrams) {
141+ template <unsigned char N, typename Is> typename counts<N>::fingerprint_type read (Is& is, std::size_t maxngrams= std::numeric_limits<std:: size_t >::max() ) {
114142 typename counts<N>::fingerprint_type result;
115143 std::string tmp;
116144 std::size_t count;
117145 while (std::getline (is, tmp, ' \t ' ) && (maxngrams--)){
118146 is >> count;
119147 if (!is) { break ; }
120148 ngram<N> ng;
121- result.push_back (std::for_each (tmp.begin (), tmp.end (), ng), count);
149+ result.push_back (std::make_pair (std::for_each (tmp.begin (), tmp.end (), ng)
150+ , count));
122151 }
123152 count_to_rank (result.begin (), result.end ());
124153 return result;
125154 }
126155
127156 template <unsigned char N, typename Os> Os& operator <<(Os& os, typename counts<N>::fingerprint_type fp) {
128- for (typename counts<N>::fingerprint_type::const_iterator it (fp.begin ()); it != fp.end (); ++it){
129- os<<(*it).first << ' \t ' << (*it).second << ' \n ' ;
130- }
157+ std::for_each (fp.begin (), fp.end ()
158+ , [&os](typename counts<N>::value_type const & v)
159+ {os << v.first << ' \t ' << v.second << ' \n ' ;});
160+ // for(typename counts<N>::fingerprint_type::const_iterator it(fp.begin()); it != fp.end(); ++it){
161+ // os<<(*it).first << '\t' << (*it).second << '\n';
162+ // }
131163 return os;
132164 }
133165
134- template <unsigned char N >
135166 struct scorer {
136- scorer (std::size_t out_of_place = 400
137- , std::size_t max_score = std::numeric_limits<std::size_t >::max()
138- , float treshold_ratio= 1.03 ):
139- out_of_place (out_of_place ), max_score(max_score )
140- , cutoff(max_score ), treshold_ratio(treshold_ratio ){}
141-
142- std::size_t operator ()(typename score <N>::fingerprint_type const & f1
143- ,typename score <N>::fingerprint_type const & f2){
144- typedef typename score <N>::fingerprint_type fp_type;
167+ scorer (float ratio= 1.03 , std::size_t oop = 400
168+ , std::size_t max_s = std::numeric_limits<std::size_t >::max()
169+ ):
170+ out_of_place (oop ), max_score(max_s )
171+ , cutoff(max_s ), treshold_ratio(ratio ){}
172+ template < unsigned char N>
173+ std::size_t operator ()(typename counts <N>::fingerprint_type const & f1
174+ , typename counts <N>::fingerprint_type const & f2){
175+ typedef typename counts <N>::fingerprint_type fp_type;
145176 std::size_t result (0 );
146177 for (typename fp_type::const_iterator it1 (f1.begin ()), it2 (f2.begin ())
147178 ; (it1 != f1.end ()) && (it2 != f2.end ()) && (result != max_score)
148179 ; ) {
149180 if ( (*it1).first < (*it2).first ) { ++it1; }
150181 else {
151182 result += ((*it1).first == (*it2).first )
152- ? std::abs ((*it1++).second - (*it2).second );
183+ ? std::abs ((*( it1++)) .second - (*it2).second )
153184 : out_of_place;
154185 if (result > cutoff) { result= max_score; }
155186 ++it2;
156187 }
157188 }
158- cutoff= std::min (cutoff, result* treshold_ratio);// check no overflow
189+ cutoff= std::min (cutoff, static_cast <std:: size_t >( result* treshold_ratio) );// check no overflow
159190 return result;
160191 }
161192 std::size_t const out_of_place, max_score;
162193 std::size_t cutoff;
163- float treshold_ratio;
194+ float const treshold_ratio;
164195 };
165196
197+
198+
166199 template <unsigned char N> struct classifier {
167- template < typename In> classifier (In b, In e)
168- std::size_t score( typename counts<N>:: fingerprint_type const &
169- std::array<CountsType::mapped_type::size * 4 > string_type ;
200+ typedef typename counts<N>::fingerprint_type fingerprint_type;
201+ typedef std::tuple<std::string, fingerprint_type> language_type;
202+ std::vector<language_type> languages ;
170203
171- struct fingerprint {
172- template <typename In>
173- fingerprint (In b, In e, std::size_t max_ngrams){
174- bool invalid, was_invalid;
175- typedef ngram<5 > ngram_type;
176- ngram_type ng;
177- std::size_t n;
178- for ( n=0 , ng (U' _' ), was_invalid=true ; (b != e) && (n != max_ngrams); was_invalid= invalid) {
179- char32_t const new_codepoint (to_normal (from_utf8 (b, e)));
180- invalid= (new_char == U' _' );
181- if (! (invalid && was_invalid)) {
182- ng (new_codepoint);
183- ++counts[ng];
184- ++n;
185- }
186- if (invalid && !was_invalid)
187- { ng= ngram_type (U' _' );}
188- }
204+ // for now, taking iterators to std::pairs of languagename, filename
205+ template <typename In> classifier (In b, In e) {
206+ std::transform (b, e, std::back_inserter (languages)
207+ , [](typename std::iterator_traits<In>::value_type const & v)
208+ -> language_type {
209+ std::ifstream ifs (std::get<1 >(v));
210+ return language_type (std::get<0 >(v)
211+ , read<N>(ifs)); });
189212 }
190- template <typename Out> Out ranks (Out o ) const {
191- std::vector<counts_type::value_type>
192- std::copy (counts.begin (), counts.end (),
213+
214+ template <typename In, typename Out> Out operator ()(In b, In e, Out o
215+ , std::size_t max_read=std::numeric_limits<std::size_t >::max()) const {
216+ float const treshold_ratio (1.03 );
217+ // :TODO: LRU caching of detected languages
218+ typename counts<N>::fingerprint_type fp (to_vector<N>(make_counts<N>(b, e, max_read), max_read));
219+ count_to_rank (fp.begin (), fp.end ());
220+ typename counts<N>::fingerprint_type const & cfp (fp);
221+ std::vector<std::size_t > scores (languages.size ()), idx (languages.size ());
222+ scorer s (treshold_ratio);
223+ std::transform (languages.begin (), languages.end (), scores.begin ()
224+ ,[&s, &fp](language_type const & lang){ return s.operator ()<N>(fp, std::get<1 >(lang));});
225+ indexes (scores.begin (), scores.end (), idx.begin ());
226+ std::size_t const best (scores[idx.front ()]);
227+ for (auto it (idx.begin ()); (it != idx.end ()) || (scores[*it] <= best*treshold_ratio); ++it, ++o)
228+ { *o= std::get<0 >(languages[*it]); }
229+ return o;
193230 }
194231 };
195- template <typename Os> Os& operator <<(Os& os, fingerprint const & fp){
196- return os;
197- }
198232}
233+
234+
199235#endif
0 commit comments