Skip to content

Commit 2fc9a01

Browse files
shaping up the C++ API
1 parent 7457dbc commit 2fc9a01

File tree

2 files changed

+325
-1
lines changed

2 files changed

+325
-1
lines changed

src/textcat++.hxx

Lines changed: 186 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,195 @@
11
#ifndef TEXTCAT_HXX
22
#define TEXTCAT_HXX
33
#include <algorithm>
#include <cstddef>
#include <functional>
#include <ios>
#include <iterator>
#include <limits>
#include <map>
#include <string>
#include <utility>
#include <vector>
4+
45
namespace textcat {
6+
/// Sliding window over the last N code points, packed 21 bits apiece into one
/// 128-bit integer. Slots that were never filled read as zero, so a single
/// object can hold any n-gram with 1 <= n <= N.
///
/// The most recently pushed code point occupies the low 21 bits; older ones
/// sit in progressively higher slots.
template<unsigned char N>
struct ngram {
  // Unsigned storage: left-shifting a signed __int128 out of range is undefined.
  typedef unsigned __int128 data_type;

  static_assert(N >= 1 && 21u * N <= 8u * sizeof(data_type),
                "ngram<N>: N code points of 21 bits must fit in 128 bits (1 <= N <= 6)");

  static unsigned char const size=N;

  /// Starts with `init` in the newest slot; the default (0) is the empty n-gram.
  ngram(char32_t init=0):data(static_cast<data_type>(init)){}

  /// Pushes `to_push` as the newest code point.
  /// @return the code point that fell out of the oldest slot (0 while the
  ///         window is not yet full).
  char32_t operator()(char32_t to_push) {
    char32_t const to_pop(static_cast<char32_t>(data >> (21u * (N - 1u))));
    // Mask the shifted window back to N slots, and the incoming value to one
    // slot so an out-of-range argument cannot corrupt its neighbour.
    data= ((data << 21) & window_mask())
      | (static_cast<data_type>(to_push) & slot_mask());
    return to_pop;
  }

  /// Folds the packed value into a std::size_t by xor-ing the low part with
  /// the part above the first sizeof(std::size_t) bytes.
  std::size_t hash() const {
    return static_cast<std::size_t>(data)
      ^ static_cast<std::size_t>(data >> (8 * sizeof(std::size_t)));
  }

  /// Orders n-grams by their packed integer value.
  friend bool operator<(ngram const& n1, ngram const& n2)
  { return n1.data < n2.data; }

  /// Two n-grams are equal when every slot matches.
  friend bool operator==(ngram const& n1, ngram const& n2)
  { return n1.data == n2.data; }

  /// Writes the code points as UTF-8, starting from the newest slot. The
  /// first empty slot ends the output and is written as a single '\0'.
  /// NOTE(review): this emits the most recently pushed code point first -
  /// confirm that is the order readers of the stream expect.
  /// Requires a `to_utf8(char32_t, OutputIterator)` declared before this
  /// header is included.
  template<typename Os> friend Os& operator<<(Os& os, ngram const& n) {
    for(unsigned i(0); i != N; ++i){
      char32_t const c(static_cast<char32_t>((n.data >> (21u * i)) & slot_mask()));
      if(c) {
        // to_utf8 wants an output iterator, not the stream itself.
        to_utf8(c, std::ostream_iterator<char, typename Os::char_type>(os));
      } else {
        os << '\0';
        break;
      }
    }
    return os;
  }

  data_type data;

private:
  /// Mask selecting one 21-bit slot.
  static data_type slot_mask()
  { return (static_cast<data_type>(1) << 21) - 1u; }

  /// Mask selecting the N slots actually in use.
  static data_type window_mask()
  { return (static_cast<data_type>(1) << (21u * N)) - 1u; }
};
43+
namespace std {
44+
namespace tr1 {
45+
template<unsigned char N> hash<textcat::ngram<N> > {
46+
std::size_t operator()(textcat::ngram<N> const& ng) const
47+
{ return ng.hash() ; }
48+
};
49+
}
50+
}
51+
namespace textcat {
52+
53+
template<unsigned char N, typedef SizeType= std::size_t>
54+
struct counts {
55+
typedef ngram<N> ngram_type;
56+
typedef SizeType size_type;
57+
typedef std::tr1::map<ngram_type, size_type> map_type;
58+
typedef typename map_type::value_type value_type;
59+
typedef std::vector<value_type> fingerprint_type;
60+
};
61+
62+
template<unsigned char N, typename In>
63+
typename counts<N>::map_type make_counts(In b, In e, std::size_t max_n){
64+
typename counts<N>::map_type counts;
65+
typedef typename counts<N>::ngram_type ngram_type;
66+
ngram_type ng;
67+
std::size_t n;
68+
for( n=0, ng(U'_'), was_invalid=true; (b != e) && (n != max_ngrams)
69+
; was_invalid= invalid) {
70+
char32_t const new_codepoint(to_normal(from_utf8(b, e)));
71+
invalid= (new_char == U'_');
72+
if(! (invalid && was_invalid)) {
73+
ng(new_codepoint);
74+
++counts[ng];
75+
++n;
76+
}
77+
if(invalid && !was_invalid)
78+
{ ng= ngram_type(U'_');}
79+
}
80+
return counts;
81+
}
82+
83+
template<unsigned char N>
84+
typename counts<N>::fingerprint_type
85+
to_vector(typename counts<N>::map_type const& c, std::size_t maxngrams){
86+
typedef typename counts<N>::value_type value_type;
87+
typename counts<N>::fingerprint_type result(c.begin(), c.end());
88+
typedef typename value_type::second_type size_type;
89+
if(maxngrams > tmp.size()) { maxngrams= tmp.size(); }
90+
std::partial_sort(tmp.begin(), tmp.begin()+maxngrams, tmp.end()
91+
, std::tr1::bind(std::greater<size_type>()
92+
, std::tr1::bind<size_type>(&pair_type::second
93+
,std::tr1::placeholders::_1)
94+
, std::tr1::bind<size_type>(&pair_type::second
95+
,std::tr1::placeholders::_2)));
96+
result.resize(maxngrams);
97+
return result;
98+
}
99+
100+
/// Replaces, in place, the `.second` of every element in [b, e) by its dense
/// rank: the first element gets 0 and the rank grows by one each time the
/// original `.second` differs from the previous element's.
///
/// Elements must be pair-like (expose `second` and a `second_type`). The
/// range is expected to be grouped by count already (e.g. sorted).
///
/// @return the end iterator `e`.
template<typename InOut> InOut count_to_rank(InOut b, InOut e){
  // The rank has the type of the counter, not of the whole pair.
  typedef typename std::iterator_traits<InOut>::value_type::second_type count_type;
  if(b != e){
    count_type last((*b).second);
    count_type rnk(0);
    for(; b != e; ++b) {
      if((*b).second != last){ last= (*b).second; ++rnk;}
      (*b).second= rnk;
    }
  }
  return b;
}
110+
111+
// Input Stream data is supposed to be already sorted
112+
// imbue maxngrams and >> ?
113+
template<unsigned char N, typename Is> typename counts<N>::fingerprint_type read(Is& is, std::size_t maxngrams) {
114+
typename counts<N>::fingerprint_type result;
115+
std::string tmp;
116+
std::size_t count;
117+
while(std::getline(is, tmp, '\t') && (maxngrams--)){
118+
is >> count;
119+
if(!is) { break; }
120+
ngram<N> ng;
121+
result.push_back(std::for_each(tmp.begin(), tmp.end(), ng), count);
122+
}
123+
count_to_rank(result.begin(), result.end());
124+
return result;
125+
}
126+
127+
/// Writes a fingerprint as one "<ngram>\t<value>\n" line per entry.
///
/// NOTE(review): N only appears inside `counts<N>::fingerprint_type`, a
/// non-deduced context, so `os << fp` cannot select this overload; it has to
/// be called with N spelled out, e.g. `operator<< <5>(os, fp)`.
template<unsigned char N, typename Os> Os& operator<<(Os& os, typename counts<N>::fingerprint_type const& fp) {
  // Taken by const reference: the fingerprint is only read.
  for(auto const& entry : fp){
    os << entry.first << '\t' << entry.second << '\n';
  }
  return os;
}
133+
134+
template<unsigned char N >
135+
struct scorer {
136+
scorer(std::size_t out_of_place= 400
137+
, std::size_t max_score= std::numeric_limits<std::size_t>::max()
138+
, float treshold_ratio=1.03):
139+
out_of_place(out_of_place), max_score(max_score)
140+
, cutoff(max_score), treshold_ratio(treshold_ratio){}
141+
142+
std::size_t operator()(typename score<N>::fingerprint_type const& f1
143+
,typename score<N>::fingerprint_type const& f2){
144+
typedef typename score<N>::fingerprint_type fp_type;
145+
std::size_t result(0);
146+
for(typename fp_type::const_iterator it1(f1.begin()), it2(f2.begin())
147+
; (it1 != f1.end()) && (it2 != f2.end()) && (result != max_score)
148+
; ) {
149+
if( (*it1).first < (*it2).first) { ++it1; }
150+
else {
151+
result += ((*it1).first == (*it2).first)
152+
? std::abs((*it1++).second - (*it2).second);
153+
: out_of_place;
154+
if(result > cutoff) { result= max_score; }
155+
++it2;
156+
}
157+
}
158+
cutoff= std::min(cutoff, result* treshold_ratio);// check no overflow
159+
return result;
160+
}
161+
std::size_t const out_of_place, max_score;
162+
std::size_t cutoff;
163+
float treshold_ratio;
164+
};
165+
166+
template<unsigned char N> struct classifier {
167+
template<typename In> classifier(In b, In e)
168+
std::size_t score( typename counts<N>::fingerprint_type const&
169+
std::array<CountsType::mapped_type::size * 4> string_type;
170+
5171
struct fingerprint {
6172
template<typename In>
7-
fingerprint(In src, std::size_t max_ngrams){
173+
fingerprint(In b, In e, std::size_t max_ngrams){
174+
bool invalid, was_invalid;
175+
typedef ngram<5> ngram_type;
176+
ngram_type ng;
177+
std::size_t n;
178+
for( n=0, ng(U'_'), was_invalid=true; (b != e) && (n != max_ngrams); was_invalid= invalid) {
179+
char32_t const new_codepoint(to_normal(from_utf8(b, e)));
180+
invalid= (new_char == U'_');
181+
if(! (invalid && was_invalid)) {
182+
ng(new_codepoint);
183+
++counts[ng];
184+
++n;
185+
}
186+
if(invalid && !was_invalid)
187+
{ ng= ngram_type(U'_');}
188+
}
189+
}
190+
template<typename Out> Out ranks(Out o ) const {
191+
std::vector<counts_type::value_type>
192+
std::copy(counts.begin(), counts.end(),
8193
}
9194
};
10195
template<typename Os> Os& operator<<(Os& os, fingerprint const& fp){

src/utf_codecs.hxx

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
#ifndef UTF_CODECS_HXX
2+
#define UTF_CODECS_HXX
3+
// good code (C) Mathias Gaunard to appear in Boost::Unicode
4+
// bugs (c) Bernard Hugueney, GPL v3+
5+
#include <stdexcept>
6+
#include <cstdlib>
7+
#include <cstdint>
8+
#include <boost/type_traits/make_unsigned.hpp>
9+
#include <boost/throw_exception.hpp>
10+
11+
#ifndef BOOST_NO_STD_LOCALE
12+
#include <sstream>
13+
#include <ios>
14+
#endif
15+
16+
char32_t constexpr invalid_codepoint= (0x10FFFFu +1);
17+
namespace {
18+
// (C) Mathias Gaunard in Boost::Unicode
19+
template<int UnitSize, typename Iterator>
20+
inline void invalid_utf_sequence(Iterator begin, Iterator end) {
21+
typedef typename std::iterator_traits<Iterator>::value_type ValueType;
22+
typedef typename boost::make_unsigned<ValueType>::type UnsignedType;
23+
24+
#ifndef BOOST_NO_STD_LOCALE
25+
std::stringstream ss;
26+
ss << "Invalid UTF-" << UnitSize << " sequence " << std::showbase << std::hex;
27+
for(Iterator it = begin; it != end; ++it) {
28+
// std::size_t const tmp(static_cast<std::size_t>(static_cast<UnsignedType>(*it)));
29+
// ss << tmp << " "; // bug in snapshot libstdc++ ???
30+
}
31+
ss << "encountered while trying to convert to UTF-32";
32+
std::out_of_range e(ss.str());
33+
#else
34+
std::out_of_range e("Invalid UTF sequence encountered while trying to convert to UTF-32");
35+
#endif
36+
boost::throw_exception(e);
37+
}
38+
template<typename In>
39+
void check(bool test, In begin, In end) {
40+
if(!test) { invalid_utf_sequence<8>(begin, end); }
41+
}
42+
}
43+
template<typename In> char32_t from_utf8(In& b, In e) {
44+
char32_t res(invalid_codepoint);
45+
In const backup(b);
46+
try {// silent exception : unchanged b signals an invalid utf-8 sequence
47+
if (b != e) {
48+
unsigned char const b0(*b++);
49+
// std::cerr<<std::hex<<"reading utf8 :"<< b0 <<std::endl;
50+
if (!(b0 & 0x80)) { res= b0; }
51+
else {
52+
check(b!=e, b, e);
53+
unsigned char const b1(*b++);
54+
if( (b0 & 0xE0) == 0xC0) {
55+
res= ((b0 & 0x1F)<<6) | (b1 & 0x3F);
56+
}
57+
else {
58+
check(b!=e, b, e);
59+
unsigned char const b2(*b++);
60+
if ( (b0 & 0xF0u) == 0xE0u) {
61+
res= ((b0 & 0x0Fu) << 12) | ( (b1 & 0x3Fu) << 6) | (b2 & 0x3Fu) ;
62+
} else {
63+
check(b!=e, b, e);
64+
unsigned char const b3(*b++);
65+
if( (b0 & 0xF8) == 0xF0) {
66+
res= ((b0 & 0x07u) << 18) | ((b1 & 0x3Fu) << 12) | ((b2 & 0x3Fu) << 6) | (b3 & 0x3Fu);
67+
}
68+
}
69+
}
70+
}
71+
}
72+
} catch (std::out_of_range const&e) { b= backup; throw; }
73+
return res;
74+
}
75+
76+
/// Encodes code point `c` as UTF-8 and writes the bytes through `out`.
///
/// @param c    code point to encode.
/// @param out  output iterator accepting bytes.
/// @return the iterator past the last byte written. Nothing is written when
///         `c` lies above U+10FFFF.
/// TODO (from the original author): should an invalid code point throw
/// instead of being skipped?
template<typename Out> Out to_utf8(char32_t c, Out out) {
  // The last Unicode code point is U+10FFFF (six hex digits).
  if (c > 0x10FFFFu) { return out; }

  if (c < 0x80u) {
    *out++= static_cast<unsigned char>(c);
  } else if (c < 0x800u) {
    *out++= static_cast<unsigned char>(0xC0u + (c >> 6));
    *out++= static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  } else if (c < 0x10000u) {
    *out++= static_cast<unsigned char>(0xE0u + (c >> 12));
    *out++= static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
    *out++= static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  } else {
    *out++= static_cast<unsigned char>(0xF0u + (c >> 18));
    *out++= static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
    *out++= static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
    *out++= static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  }
  return out;
}
99+
100+
/// Streams a single code point as its UTF-8 byte sequence.
/// NOTE(review): the left operand is an unconstrained template parameter, so
/// this overload is a candidate for any `x << value` whose right side
/// converts to char32_t - confirm it does not clash with the standard
/// stream inserters where this header is used.
template<typename Out> Out& operator<<(Out& o, char32_t c){
  std::ostream_iterator<char> sink(o, "");
  to_utf8(c, sink);
  return o;
}
104+
105+
/// Streams an array of code points as "[c0,c1,...,]": every element is
/// written as UTF-8 and followed by a comma, including the last one.
template<typename Out, std::size_t N> Out& operator<<(Out& o, std::array<char32_t, N> const& a){
  typedef typename std::array<char32_t, N>::const_iterator const_iterator;
  o<<"[";
  for(const_iterator it(a.begin()); it != a.end(); ++it) {
    to_utf8(*it, std::ostream_iterator<char>(o, ""));
    o<<",";
  }
  o<<"]";
  return o;
}
112+
113+
/*
114+
*/
115+
/// Compile-time unrolled lexicographical comparison of two std::array<T, N>,
/// examining elements K, K+1, ..., N-1 with the strict ordering `Op`.
///
/// The nested typedefs replace the former std::binary_function base, which
/// was removed from the standard library in C++17.
template<std::size_t K, std::size_t N, typename T, typename Op> struct array_lexicographical_helper {
  typedef std::array<T, N> first_argument_type;
  typedef std::array<T, N> second_argument_type;
  typedef bool result_type;

  array_lexicographical_helper(Op o=Op()) : op(o) {}

  /// @return true when a1 orders before a2 from position K onwards.
  bool operator()(std::array<T, N> const& a1, std::array<T, N> const& a2) const {
    if(op(a1[K], a2[K])) { return true; }
    // A strictly greater element settles the comparison; only a tie at
    // position K lets the following elements decide.
    if(op(a2[K], a1[K])) { return false; }
    return array_lexicographical_helper<K+1, N, T, Op>(op)(a1, a2);
  }
  Op op;
};

/// End of the recursion: every element compared equal, so a1 is not less.
template<std::size_t N, typename T, typename Op> struct array_lexicographical_helper<N, N, T, Op> {
  typedef std::array<T, N> first_argument_type;
  typedef std::array<T, N> second_argument_type;
  typedef bool result_type;

  array_lexicographical_helper(Op o=Op()) : op(o) {}
  bool operator()(std::array<T, N> const&, std::array<T, N> const&) const { return false;}
  Op op;
};
126+
// Providing an operator[] on ngram would allow direct use on ngrams, but
// access might be too slow anyway.
/// Strict weak ordering on std::array<T, N>: lexicographical comparison of
/// the elements with `Op` (std::less<T> by default).
///
/// Define TEMPLATED_UNROLL to use the compile-time unrolled
/// array_lexicographical_helper instead of std::lexicographical_compare.
///
/// The nested typedefs replace the former std::binary_function base, which
/// was removed from the standard library in C++17.
template<typename T, std::size_t N, typename Op= std::less<T> > // bool LittleEndian= true ? or "included" in char32_t
struct array_less {
  typedef std::array<T, N> first_argument_type;
  typedef std::array<T, N> second_argument_type;
  typedef bool result_type;

  array_less(Op o=Op()): op(o) {}

  /// @return true when a1 orders lexicographically before a2.
  // Takes arrays of T, the element type the template is instantiated for,
  // rather than a hard-coded char32_t.
  bool operator()(std::array<T, N> const& a1, std::array<T, N> const& a2) const
#ifdef TEMPLATED_UNROLL
  { return array_lexicographical_helper<0, N, T, Op>(op)(a1, a2); }
#else
  { return std::lexicographical_compare(a1.begin(), a1.end(), a2.begin(), a2.end(), op); }
#endif
  Op op;
};
138+
139+
#endif

0 commit comments

Comments
 (0)