… | |
… | |
2 | #include <stdio.h> |
2 | #include <stdio.h> |
3 | #include <glib.h> |
3 | #include <glib.h> |
4 | #include "debug.h" |
4 | #include "debug.h" |
5 | #include "linguistics.h" |
5 | #include "linguistics.h" |
6 | |
6 | |
/*
 * Expansion table for special (non-ASCII) letters.
 *
 * Each row is { letter, variant 1, variant 2 }:
 *   [0] the special letter, one UTF-8 encoded character (used as lookup key),
 *   [1] its first replacement,
 *   [2] an optional second replacement; rows that list only two strings
 *       leave this column NULL.
 *
 * linguistics_expand_special() substitutes in place inside a copy of its
 * input, so no replacement may be longer (in bytes) than the letter itself.
 */
static const char *special[][3] = {
	/* Capital Diacritics */
	/* ¨ Diaeresis */
	{"Ä", "A", "AE"},
	{"Ë", "E"},
	{"Ï", "I"},
	{"Ö", "O", "OE"},
	{"Ü", "U", "UE"},
	{"Ÿ", "Y"},
	/* ˝ Double Acute Accent */
	{"Ő", "O", "Ö"},
	{"Ű", "U", "Ü"},
	/* ´ Acute Accent */
	{"Á", "A"},
	{"Ć", "C"},
	{"É", "E"},
	{"Í", "I"},
	{"Ĺ", "L"},
	{"Ń", "N"},
	{"Ó", "O"},
	{"Ŕ", "R"},
	{"Ś", "S"},
	{"Ú", "U"},
	{"Ý", "Y"},
	{"Ź", "Z"},
	/* ˛ Ogonek (Lithuanian: nosinė) */
	{"Ą", "A"},
	{"Ę", "E"},
	{"Į", "I"},
	{"Ų", "U"},
	/* ˙ Dot */
	{"Ċ", "C"},
	{"Ė", "E"},
	{"Ġ", "G"},
	{"İ", "I"},
	{"Ŀ", "L"},
	{"Ż", "Z"},
	/* – Stroke */
	{"Đ", "D", "DJ"}, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
	{"Ħ", "H"},
	{"Ł", "L"},
	{"Ŧ", "T"},
	/* ˚ Ring */
	{"Å", "A", "AA"},
	{"Ů", "U"},
	/* ˇ Caron (haček; Lithuanian: paukščiukas) */
	{"Č", "C"},
	{"Ď", "D"},
	{"Ě", "E"},
	{"Ľ", "L"},
	{"Ň", "N"},
	{"Ř", "R"},
	{"Š", "S"},
	{"Ť", "T"},
	{"Ž", "Z"},
	/* / Slash */
	{"Ø", "O", "OE"},
	/* ¯ Macron */
	{"Ā", "A", "AA"},
	{"Ē", "E", "EE"},
	{"Ī", "I", "II"},
	{"Ō", "O", "OO"},
	{"Ū", "U", "UU"},
	/* ˘ Breve */
	{"Ă", "A"},
	{"Ĕ", "E"},
	{"Ğ", "G"},
	{"Ĭ", "I"},
	{"Ŏ", "O"},
	{"Ŭ", "U"},
	/* ^ Circumflex */
	{"Â", "A"},
	{"Ĉ", "C"},
	{"Ê", "E"},
	{"Ĝ", "G"},
	{"Ĥ", "H"},
	{"Î", "I"},
	{"Ĵ", "J"},
	{"Ô", "O"},
	{"Ŝ", "S"},
	{"Û", "U"},
	{"Ŵ", "W"},
	{"Ŷ", "Y"},
	/* ¸ Cedilla */
	{"Ç", "C"},
	{"Ģ", "G", "GJ"},
	{"Ķ", "K", "KJ"},
	{"Ļ", "L", "LJ"},
	{"Ņ", "N", "NJ"},
	{"Ŗ", "R"},
	{"Ş", "S"},
	{"Ţ", "T"},
	/* ~ Tilde */
	{"Ã", "A"},
	{"Ĩ", "I"},
	{"Ñ", "N"},
	{"Õ", "O"},
	{"Ũ", "U"},
	/* ` Grave */
	{"À", "A"},
	{"È", "E"},
	{"Ì", "I"},
	{"Ò", "O"},
	{"Ù", "U"},
	/* ligatures */
	{"Æ", "A", "AE"},
	/* The key must be the one-character ligature U+0132: lookups are done
	 * per multi-byte UTF-8 character, a plain ASCII "IJ" key is never hit. */
	{"Ĳ", "IJ"},
	{"Œ", "O", "OE"},
	/* special letters */
	{"Ð", "D", "DH"}, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
	{"Ŋ", "N", "NG"},
	{"Þ", "T", "TH"},

	/* Small Diacritics */
	/* ¨ Diaeresis */
	{"ä", "a", "ae"},
	{"ë", "e"},
	{"ï", "i"},
	{"ö", "o", "oe"},
	{"ü", "u", "ue"},
	{"ÿ", "y"},
	/* ˝ Double Acute Accent */
	{"ő", "o", "ö"},
	{"ű", "u", "ü"},
	/* ´ Acute Accent */
	{"á", "a"},
	{"ć", "c"},
	{"é", "e"},
	{"í", "i"},
	{"ĺ", "l"},
	{"ń", "n"},
	{"ó", "o"},
	{"ŕ", "r"},
	{"ś", "s"},
	{"ú", "u"},
	{"ý", "y"},
	{"ź", "z"},
	/* ˛ Ogonek (Lithuanian: nosinė) */
	{"ą", "a"},
	{"ę", "e"},
	{"į", "i"},
	{"ų", "u"},
	/* ˙ Dot (and dotless i) */
	{"ċ", "c"},
	{"ė", "e"},
	{"ġ", "g"},
	{"ı", "i"},
	{"ŀ", "l"},
	{"ż", "z"},
	/* – Stroke */
	{"đ", "d", "dj"},
	{"ħ", "h"},
	{"ł", "l"},
	{"ŧ", "t"},
	/* ˚ Ring */
	{"å", "a", "aa"},
	{"ů", "u"},
	/* ˇ Caron (haček; Lithuanian: paukščiukas) */
	{"č", "c"},
	{"ď", "d"},
	{"ě", "e"},
	{"ľ", "l"},
	{"ň", "n"},
	{"ř", "r"},
	{"š", "s"},
	{"ť", "t"},
	{"ž", "z"},
	/* / Slash */
	{"ø", "o", "oe"},
	/* ¯ Macron */
	{"ā", "a", "aa"},
	{"ē", "e", "ee"},
	{"ī", "i", "ii"},
	{"ō", "o", "oo"},
	{"ū", "u", "uu"},
	/* ˘ Breve */
	{"ă", "a"},
	{"ĕ", "e"},
	{"ğ", "g"},
	{"ĭ", "i"},
	{"ŏ", "o"},
	{"ŭ", "u"},
	/* ^ Circumflex */
	{"â", "a"},
	{"ĉ", "c"},
	{"ê", "e"},
	{"ĝ", "g"},
	{"ĥ", "h"},
	{"î", "i"},
	{"ĵ", "j"},
	{"ô", "o"},
	{"ŝ", "s"},
	{"û", "u"},
	{"ŵ", "w"},
	{"ŷ", "y"},
	/* ¸ Cedilla */
	{"ç", "c"},
	{"ģ", "g", "gj"},
	{"ķ", "k", "kj"},
	{"ļ", "l", "lj"},
	{"ņ", "n", "nj"},
	{"ŗ", "r"},
	{"ş", "s"},
	{"ţ", "t"},
	/* ~ Tilde */
	{"ã", "a"},
	{"ĩ", "i"},
	{"õ", "o"},
	{"ñ", "n"},
	{"ũ", "u"},
	/* ` Grave */
	{"à", "a"},
	{"è", "e"},
	{"ì", "i"},
	{"ò", "o"},
	{"ù", "u"},
	/* ligatures */
	{"æ", "a", "ae"},
	{"ĳ", "ij"}, /* one-character ligature U+0133 as key, see the capital row */
	{"œ", "o", "oe"},
	{"ß", "s", "ss"},
	/* special letters */
	{"ð", "d", "dh"},
	{"ŋ", "n", "ng"},
	{"þ", "t", "th"},

	/* Cyrillic capital */
	{"Ё", "Е"},
	{"Й", "И"},
	{"І", "I"},
	{"Ї", "I"},
	{"Ў", "У"},
	{"Є", "Е", "Э"},
	{"Ґ", "Г"},
	{"Ѓ", "Г"},
	{"Ђ", "Д"},
	{"Ќ", "К"},
	/* Disabled: the two-letter replacement is longer than the letter itself.
	 * {"Љ", "Л", "ЛЬ"},
	 * {"Њ", "Н", "НЬ"},
	 */
	{"Џ", "Ц"},

	/* Cyrillic small */
	{"ё", "е"},
	{"й", "и"},
	{"і", "i"},
	{"ї", "i"},
	{"ў", "у"},
	/* Disabled in the original (unlike the capital row); reason not evident.
	 * {"є", "е", "э"},
	 */
	{"ґ", "г"},
	{"ѓ", "г"},
	{"ђ", "д"},
	{"ќ", "к"},
	/* Disabled: the two-letter replacement is longer than the letter itself.
	 * {"љ", "л", "ль"},
	 * {"њ", "н", "нь"},
	 */
	{"џ", "ц"},
};
265 | |
266 | |
266 | static GHashTable *special_hash; |
267 | static GHashTable *special_hash; |
267 | |
268 | |
/*
 * Array of strings for case conversion.
 *
 * Elements come in pairs: an even element is a string of upper-case letters,
 * the odd element directly following it is the string of the matching
 * lower-case letters, in the same order (the n-th character of one string
 * corresponds to the n-th character of the other).
 * The last element of the array must be NULL.
 */
static const char *upperlower[] = {
	/* Latin diacritics */
	"ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆĲŒÐŊÞ",
	/* Order must follow the line above: "ñõ" (not "õñ"), otherwise Ñ folds to õ and Õ to ñ. */
	"äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩñõũàèìòùæĳœðŋþ",
	/* Cyrillic */
	"АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
	"абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",

	NULL
};
|
|
283 | |
285 | |
284 | static GHashTable *casefold_hash; |
286 | static GHashTable *casefold_hash; |
285 | |
287 | |
286 | |
|
|
/*
 * Backtracking record of linguistics_compare(): one special letter met in the
 * first string, together with the positions where the comparison resumes.
 */
struct special_pos {
	char **variants;	/* row of special[]: [0] the letter, [1] and [2] its replacements */
	int n;			/* index of the next replacement to try */
	char *s1;		/* position of the special letter in the first string */
	char *s2;		/* corresponding position in the second string */
};
292 | |
294 | |
293 | |
|
|
294 | |
|
|
295 | static char** |
295 | static char** |
296 | linguistics_get_special(char *str, char *end) |
296 | linguistics_get_special(char *str, char *end) |
297 | { |
297 | { |
298 | char buf[10]; |
298 | char buf[10]; |
299 | int len; |
299 | int len; |
300 | if(!end) |
300 | if (!end) |
301 | end=g_utf8_find_next_char(str,NULL); |
301 | end = g_utf8_find_next_char(str, NULL); |
302 | len=end-str+1; |
302 | len = end - str + 1; |
303 | g_strlcpy(buf,str,len>10?10:len); |
303 | g_strlcpy(buf, str, len > 10 ? 10 : len); |
304 | return g_hash_table_lookup(special_hash,buf); |
304 | return g_hash_table_lookup(special_hash, buf); |
305 | } |
305 | } |
306 | |
|
|
307 | |
306 | |
308 | /* |
307 | /* |
309 | * @brief Prepare an utf-8 string for case insensitive comparison. |
308 | * @brief Prepare an utf-8 string for case insensitive comparison. |
310 | * @param in String to prepeare. |
309 | * @param in String to prepeare. |
311 | * @return String prepared for case insensitive search. Result shoud be g_free()d after use. |
310 | * @return String prepared for case insensitive search. Result shoud be g_free()d after use. |
312 | */ |
311 | */ |
313 | char* |
312 | char* |
314 | linguistics_casefold(char *in) |
313 | linguistics_casefold(char *in) |
315 | { |
314 | { |
316 | int len=strlen(in); |
315 | int len = strlen(in); |
317 | char *src=in; |
316 | char *src = in; |
318 | char *ret=g_new(char,len+1); |
317 | char *ret=g_new(char,len+1); |
319 | char *dest=ret; |
318 | char *dest = ret; |
320 | char buf[10]; |
319 | char buf[10]; |
321 | while(*src && dest-ret<len){ |
320 | while (*src && dest - ret < len) |
|
|
321 | { |
322 | if(*src>='A' && *src<='Z') { |
322 | if (*src >= 'A' && *src <= 'Z') |
|
|
323 | { |
323 | *dest++=*src++ - 'A' + 'a'; |
324 | *dest++ = *src++ - 'A' + 'a'; |
|
|
325 | } |
324 | } else if (!(*src&128)) { |
326 | else if (!(*src & 128)) |
|
|
327 | { |
325 | *dest++=*src++; |
328 | *dest++ = *src++; |
|
|
329 | } |
326 | } else { |
330 | else |
|
|
331 | { |
327 | int charlen; |
332 | int charlen; |
328 | char *tmp, *folded; |
333 | char *tmp, *folded; |
329 | tmp=g_utf8_find_next_char(src,NULL); |
334 | tmp = g_utf8_find_next_char(src, NULL); |
330 | charlen=tmp-src+1; |
335 | charlen = tmp - src + 1; |
331 | g_strlcpy(buf,src,charlen>10?10:charlen); |
336 | g_strlcpy(buf, src, charlen > 10 ? 10 : charlen); |
332 | folded=g_hash_table_lookup(casefold_hash,buf); |
337 | folded = g_hash_table_lookup(casefold_hash, buf); |
333 | |
338 | |
334 | if(folded) |
339 | if (folded) |
335 | { |
340 | { |
336 | while(*folded && dest-ret<len) |
341 | while (*folded && dest - ret < len) |
337 | *dest++=*folded++; |
342 | *dest++ = *folded++; |
338 | src=tmp; |
343 | src = tmp; |
339 | } |
344 | } |
340 | else |
345 | else |
341 | { |
346 | { |
342 | while(src<tmp && dest-ret<len) |
347 | while (src < tmp && dest - ret < len) |
343 | *dest++=*src++; |
348 | *dest++ = *src++; |
344 | } |
349 | } |
345 | } |
350 | } |
346 | } |
351 | } |
347 | *dest=0; |
352 | *dest = 0; |
348 | if(*src) |
353 | if (*src) |
|
|
354 | dbg( |
|
|
355 | 0, |
349 | dbg(0,"Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",in,ret); |
356 | "Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n", |
|
|
357 | in, ret); |
350 | return ret; |
358 | return ret; |
351 | } |
359 | } |
352 | |
360 | |
353 | /** |
361 | /** |
354 | * @brief Compare two strings using special characters expansion. |
362 | * @brief Compare two strings using special characters expansion. |
… | |
… | |
357 | * @param match second string to compare, special characters are not expanded. |
365 | * @param match second string to compare, special characters are not expanded. |
358 | * @param partial if = 1 then str string may be shorter than match string, in which case the rest from str isn't analysed. |
366 | * @param partial if = 1 then str string may be shorter than match string, in which case the rest from str isn't analysed. |
359 | * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp(). |
367 | * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp(). |
360 | */ |
368 | */ |
361 | |
369 | |
362 | int |
|
|
363 | linguistics_compare(char *str, char *match, int partial) |
370 | int linguistics_compare(char *str, char *match, int partial) |
364 | { |
371 | { |
365 | char *s1=str, *s2=match; |
372 | char *s1 = str, *s2 = match; |
366 | char **sp; |
373 | char **sp; |
367 | int ret=0; |
374 | int ret = 0; |
368 | int got_match; |
375 | int got_match; |
369 | GList *l=NULL; |
376 | GList *l = NULL; |
370 | while (*s1 && *s2) { |
377 | while (*s1 && *s2) |
|
|
378 | { |
371 | int j; |
379 | int j; |
372 | struct special_pos *spp; |
380 | struct special_pos *spp; |
373 | char *utf_boundary, *tmp; |
381 | char *utf_boundary, *tmp; |
374 | /* Skip all matching chars */ |
382 | /* Skip all matching chars */ |
375 | for(j=0;s1[j] && s1[j]==s2[j];j++); |
383 | for (j = 0; s1[j] && s1[j] == s2[j]; j++) |
|
|
384 | ; |
376 | if(!s2[j] && (partial || !s1[j])) { |
385 | if (!s2[j] && (partial || !s1[j])) |
|
|
386 | { |
377 | /* MATCH! */ |
387 | /* MATCH! */ |
378 | ret=0; |
388 | ret = 0; |
379 | break; |
389 | break; |
380 | } |
390 | } |
381 | /* Find beginning of first mismatching utf-8 encoded char */ |
391 | /* Find beginning of first mismatching utf-8 encoded char */ |
382 | utf_boundary=s1; |
392 | utf_boundary = s1; |
383 | while(*(tmp=g_utf8_find_next_char(utf_boundary, NULL))) { |
393 | while (*(tmp = g_utf8_find_next_char(utf_boundary, NULL))) |
|
|
394 | { |
384 | if(tmp>s1+j) |
395 | if (tmp > s1 + j) |
385 | break; |
396 | break; |
386 | utf_boundary=tmp; |
397 | utf_boundary = tmp; |
387 | } |
398 | } |
388 | /* Push first mismatching char to the list if it's a special char */ |
399 | /* Push first mismatching char to the list if it's a special char */ |
389 | sp=linguistics_get_special(utf_boundary,tmp); |
400 | sp = linguistics_get_special(utf_boundary, tmp); |
390 | |
401 | |
391 | if(sp) |
402 | if (sp) |
392 | { |
403 | { |
393 | spp=g_new(struct special_pos,1); |
404 | spp=g_new(struct special_pos,1); |
394 | spp->variants=sp; |
405 | spp->variants = sp; |
395 | spp->n=1; |
406 | spp->n = 1; |
396 | spp->s1=utf_boundary; |
407 | spp->s1 = utf_boundary; |
397 | spp->s2=s2+(utf_boundary-s1); |
408 | spp->s2 = s2 + (utf_boundary - s1); |
398 | l=g_list_prepend(l,spp); |
409 | l = g_list_prepend(l, spp); |
399 | } |
410 | } |
400 | |
411 | |
401 | /* Try to find a match using special char variants from the list */ |
412 | /* Try to find a match using special char variants from the list */ |
402 | got_match=0; |
413 | got_match = 0; |
403 | while(l && !got_match) { |
414 | while (l && !got_match) |
|
|
415 | { |
404 | spp=l->data; |
416 | spp = l->data; |
405 | s1=spp->s1; |
417 | s1 = spp->s1; |
406 | s2=spp->s2; |
418 | s2 = spp->s2; |
407 | while(spp->n<3 && !got_match) { |
419 | while (spp->n < 3 && !got_match) |
|
|
420 | { |
408 | char *s=spp->variants[(spp->n)++]; |
421 | char *s = spp->variants[(spp->n)++]; |
409 | int len; |
422 | int len; |
410 | if(!s) |
423 | if (!s) |
411 | break; |
424 | break; |
412 | len=strlen(s); |
425 | len = strlen(s); |
413 | if(!strncmp(s,s2,len)) { |
426 | if (!strncmp(s, s2, len)) |
|
|
427 | { |
414 | s2+=len; |
428 | s2 += len; |
415 | s1+=strlen(spp->variants[0]); |
429 | s1 += strlen(spp->variants[0]); |
416 | got_match=1; |
430 | got_match = 1; |
417 | break; |
431 | break; |
418 | } |
432 | } |
419 | } |
433 | } |
420 | if(spp->n>=3 || !spp->variants[spp->n]) { |
434 | if (spp->n >= 3 || !spp->variants[spp->n]) |
|
|
435 | { |
421 | /* No matches for current top list element, go to the closest special char towards beginning of the string */ |
436 | /* No matches for current top list element, go to the closest special char towards beginning of the string */ |
422 | g_free(spp); |
437 | g_free(spp); |
423 | l=g_list_delete_link(l,l); |
438 | l = g_list_delete_link(l, l); |
424 | } |
439 | } |
425 | } |
440 | } |
426 | if(!got_match) { |
441 | if (!got_match) |
|
|
442 | { |
427 | /* NO MATCH |
443 | /* NO MATCH |
428 | * FIXME: If we're going to use this function to sort a string list alphabetically we should use |
444 | * FIXME: If we're going to use this function to sort a string list alphabetically we should use |
429 | * utf-aware comparison here. |
445 | * utf-aware comparison here. |
430 | */ |
446 | */ |
431 | ret=1; |
447 | ret = 1; |
432 | break; |
448 | break; |
433 | } |
449 | } |
434 | } |
450 | } |
435 | while(l) { |
451 | while (l) |
|
|
452 | { |
436 | g_free(l->data); |
453 | g_free(l->data); |
437 | l=g_list_delete_link(l,l); |
454 | l = g_list_delete_link(l, l); |
438 | } |
455 | } |
439 | return ret; |
456 | return ret; |
440 | } |
457 | } |
441 | |
|
|
442 | |
458 | |
443 | char * |
459 | char * |
444 | linguistics_expand_special(char *str, int mode) |
460 | linguistics_expand_special(char *str, int mode) |
445 | { |
461 | { |
446 | char *in=str; |
462 | char *in = str; |
447 | char *out,*ret; |
463 | char *out, *ret; |
448 | int found=0; |
464 | int found = 0; |
449 | |
465 | |
450 | if (!str) |
466 | if (!str) |
451 | { |
467 | { |
452 | return NULL; |
468 | return NULL; |
453 | } |
469 | } |
454 | |
470 | |
455 | ret=g_strdup(str); |
471 | ret = g_strdup(str); |
456 | out=ret; |
472 | out = ret; |
457 | |
473 | |
458 | if (!mode) |
474 | if (!mode) |
459 | { |
475 | { |
460 | return ret; |
476 | return ret; |
461 | } |
477 | } |
462 | |
478 | |
463 | while (*in) |
479 | while (*in) |
464 | { |
480 | { |
465 | char *next=g_utf8_find_next_char(in, NULL); |
481 | char *next = g_utf8_find_next_char(in, NULL); |
466 | int i,len=next-in; |
482 | int i, len = next - in; |
467 | int match=0; |
483 | int match = 0; |
468 | if (len > 1) |
484 | if (len > 1) |
469 | { |
485 | { |
470 | for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++) |
486 | for (i = 0; i < sizeof(special) / sizeof(special[0]); i++) |
471 | { |
487 | { |
472 | const char *search=special[i][0]; |
488 | const char *search = special[i][0]; |
473 | if (!strncmp(in,search,len)) |
489 | if (!strncmp(in, search, len)) |
474 | { |
490 | { |
475 | const char *replace=special[i][mode]; |
491 | const char *replace = special[i][mode]; |
476 | if (replace) |
492 | if (replace) |
477 | { |
493 | { |
478 | int replace_len=strlen(replace); |
494 | int replace_len = strlen(replace); |
479 | |
495 | |
480 | if (replace_len > len) |
496 | if (replace_len > len) |
481 | { |
497 | { |
482 | fprintf(stderr,"* ERROR !! ERROR !! found %s %s %d %s %d\n",in,search,len,replace,replace_len); |
498 | fprintf( |
|
|
499 | stderr, |
|
|
500 | "* ERROR !! ERROR !! found %s %s %d %s %d\n", |
|
|
501 | in, search, len, replace, replace_len); |
483 | } |
502 | } |
484 | dbg_assert(replace_len <= len); |
503 | dbg_assert(replace_len <= len); |
485 | if (replace_len > len) |
504 | if (replace_len > len) |
486 | { |
505 | { |
487 | out+=len; |
506 | out += len; |
488 | match=0; |
507 | match = 0; |
489 | break; |
508 | break; |
490 | } |
509 | } |
491 | else |
510 | else |
492 | { |
511 | { |
493 | // fprintf(stderr," GOOD !! GOOD !! found %s %s %d %s %d\n",in,search,len,replace,replace_len); |
512 | // fprintf(stderr," GOOD !! GOOD !! found %s %s %d %s %d\n",in,search,len,replace,replace_len); |
494 | strcpy(out, replace); |
513 | strcpy(out, replace); |
495 | out+=replace_len; |
514 | out += replace_len; |
496 | match=1; |
515 | match = 1; |
497 | break; |
516 | break; |
498 | } |
517 | } |
499 | } |
518 | } |
500 | } |
519 | } |
501 | } |
520 | } |
502 | } |
521 | } |
503 | |
522 | |
504 | if (match) |
523 | if (match) |
505 | { |
524 | { |
506 | found=1; |
525 | found = 1; |
507 | in=next; |
526 | in = next; |
508 | } |
527 | } |
509 | else |
528 | else |
510 | { |
529 | { |
511 | while (len-- > 0) |
530 | while (len-- > 0) |
512 | { |
531 | { |
513 | *out++=*in++; |
532 | *out++ = *in++; |
514 | } |
533 | } |
515 | } |
534 | } |
516 | } |
535 | } |
517 | *out++='\0'; |
536 | *out++ = '\0'; |
518 | if (!found) |
537 | if (!found) |
519 | { |
538 | { |
520 | if (ret) |
539 | if (ret) |
521 | { |
540 | { |
522 | g_free(ret); |
541 | g_free(ret); |
523 | } |
542 | } |
524 | ret=NULL; |
543 | ret = NULL; |
525 | } |
544 | } |
526 | return ret; |
545 | return ret; |
527 | } |
546 | } |
528 | |
547 | |
/*
 * Return the next word of a string, splitting at blanks and punctuation.
 *
 * Thin wrapper around strtok(): pass the string on the first call and NULL on
 * the following calls to continue with the same string. The string is
 * modified in place and the scanning position is kept in strtok()'s internal
 * state. Returns NULL when no word is left.
 */
char *
linguistics_next_word(char *str)
{
	static const char delimiters[] = " -/()\"\',.;_[]{}\\";

	return strtok(str, delimiters);
}
541 | |
560 | |
542 | int |
|
|
543 | linguistics_search(char *str) |
561 | int linguistics_search(char *str) |
544 | { |
562 | { |
545 | if (!g_strcasecmp(str,"str")) |
563 | if (!g_strcasecmp(str, "str")) |
546 | return 0; |
564 | return 0; |
547 | if (!g_strcasecmp(str,"str.")) |
565 | if (!g_strcasecmp(str, "str.")) |
548 | return 0; |
566 | return 0; |
549 | if (!g_strcasecmp(str,"strasse")) |
567 | if (!g_strcasecmp(str, "strasse")) |
550 | return 0; |
568 | return 0; |
551 | if (!g_strcasecmp(str,"weg")) |
569 | if (!g_strcasecmp(str, "weg")) |
552 | return 0; |
570 | return 0; |
553 | return 1; |
571 | return 1; |
554 | } |
572 | } |
555 | |
573 | |
556 | /** |
574 | /** |
557 | * @brief Copy one utf8 encoded char to newly allocated buffer. |
575 | * @brief Copy one utf8 encoded char to newly allocated buffer. |
558 | * |
576 | * |
559 | * @param s pointer to the beginning of the char. |
577 | * @param s pointer to the beginning of the char. |
560 | * @return newly allocated nul-terminated string containing one utf8 encoded character. |
578 | * @return newly allocated nul-terminated string containing one utf8 encoded character. |
561 | */ |
579 | */ |
562 | static char |
|
|
563 | *linguistics_dup_utf8_char(const char *s) |
580 | static char *linguistics_dup_utf8_char(const char *s) |
564 | { |
581 | { |
565 | char *ret, *next; |
582 | char *ret, *next; |
566 | next=g_utf8_find_next_char(s,NULL); |
583 | next = g_utf8_find_next_char(s, NULL); |
567 | ret=g_new(char, next-s+1); |
584 | ret=g_new(char, next-s+1); |
568 | g_strlcpy(ret,s,next-s+1); |
585 | g_strlcpy(ret, s, next - s + 1); |
569 | return ret; |
586 | return ret; |
570 | } |
587 | } |
571 | |
588 | |
572 | void |
|
|
573 | linguistics_init(void) |
589 | void linguistics_init(void) |
574 | { |
590 | { |
575 | int i; |
591 | int i; |
576 | special_hash=g_hash_table_new(g_str_hash, g_str_equal); |
592 | special_hash = g_hash_table_new(g_str_hash, g_str_equal); |
577 | casefold_hash=g_hash_table_new(g_str_hash, g_str_equal); |
593 | casefold_hash = g_hash_table_new(g_str_hash, g_str_equal); |
578 | |
594 | |
579 | for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++) |
595 | for (i = 0; i < sizeof(special) / sizeof(special[0]); i++) |
580 | { |
596 | { |
581 | g_hash_table_insert(special_hash,(gpointer)special[i][0],special[i]); |
597 | g_hash_table_insert(special_hash, (gpointer) special[i][0], special[i]); |
582 | } |
598 | } |
583 | |
599 | |
584 | for (i = 0 ; upperlower[i]; i+=2) |
600 | for (i = 0; upperlower[i]; i += 2) |
585 | { |
601 | { |
586 | int j,k; |
602 | int j, k; |
587 | for(j=0,k=0;upperlower[i][j] && upperlower[i+1][k];) |
603 | for (j = 0, k = 0; upperlower[i][j] && upperlower[i + 1][k];) |
588 | { |
604 | { |
589 | char *s1=linguistics_dup_utf8_char(upperlower[i]+j); |
605 | char *s1 = linguistics_dup_utf8_char(upperlower[i] + j); |
590 | char *s2=linguistics_dup_utf8_char(upperlower[i+1]+k); |
606 | char *s2 = linguistics_dup_utf8_char(upperlower[i + 1] + k); |
591 | g_hash_table_insert(casefold_hash,s1,s2); |
607 | g_hash_table_insert(casefold_hash, s1, s2); |
592 | j+=strlen(s1); |
608 | j += strlen(s1); |
593 | k+=strlen(s2); |
609 | k += strlen(s2); |
594 | } |
|
|
595 | } |
610 | } |
|
|
611 | } |
596 | } |
612 | } |
597 | |
613 | |