/[zanavi_public1]/navit/navit/linguistics.c
ZANavi

Diff of /navit/navit/linguistics.c

Parent Directory | Revision Log | View Patch

Revision 26 Revision 27
2#include <stdio.h> 2#include <stdio.h>
3#include <glib.h> 3#include <glib.h>
4#include "debug.h" 4#include "debug.h"
5#include "linguistics.h" 5#include "linguistics.h"
6 6
7static const char *special[][3]={ 7static const char *special[][3] =
8{
8/* Capital Diacritics */ 9/* Capital Diacritics */
9/* ¨ Diaeresis */ 10/* ¨ Diaeresis */
10{"Ä","A","AE"}, 11{ "Ä", "A", "AE" },
11{"Ë","E"}, 12{ "Ë", "E" },
12{"Ï","I"}, 13{ "Ï", "I" },
13{"Ö","O","OE"}, 14{ "Ö", "O", "OE" },
14{"Ü","U","UE"}, 15{ "Ü", "U", "UE" },
15{"Ÿ","Y"}, 16{ "Ÿ", "Y" },
16/* ˝ Double Acute Accent */ 17/* ˝ Double Acute Accent */
17{"Ő","O","Ö"}, 18{ "Ő", "O", "Ö" },
18{"Ű","U","Ü"}, 19{ "Ű", "U", "Ü" },
19/* ´ Acute Accent */ 20/* ´ Acute Accent */
20{"Á","A"}, 21{ "Á", "A" },
21{"Ć","C"}, 22{ "Ć", "C" },
22{"É","E"}, 23{ "É", "E" },
23{"Í","I"}, 24{ "Í", "I" },
24{"Ĺ","L"}, 25{ "Ĺ", "L" },
25{"Ń","N"}, 26{ "Ń", "N" },
26{"Ó","O"}, 27{ "Ó", "O" },
27{"Ŕ","R"}, 28{ "Ŕ", "R" },
28{"Ś","S"}, 29{ "Ś", "S" },
29{"Ú","U"}, 30{ "Ú", "U" },
30{"Ý","Y"}, 31{ "Ý", "Y" },
31{"Ź","Z"}, 32{ "Ź", "Z" },
32/* ˛ Ogonek (nosinė) */ 33/* ˛ Ogonek (nosinė) */
33{"Ą","A"}, 34{ "Ą", "A" },
34{"Ę","E"}, 35{ "Ę", "E" },
35{"Į","I"}, 36{ "Į", "I" },
36{"Ų","U"}, 37{ "Ų", "U" },
37/* ˙ Dot */ 38/* ˙ Dot */
38{"Ċ","C"}, 39{ "Ċ", "C" },
39{"Ė","E"}, 40{ "Ė", "E" },
40{"Ġ","G"}, 41{ "Ġ", "G" },
41{"İ","I"}, 42{ "İ", "I" },
42{"Ŀ","L"}, 43{ "Ŀ", "L" },
43{"Ż","Z"}, 44{ "Ż", "Z" },
44/* – Stroke */ 45/* – Stroke */
45{"Đ","D","DJ"}, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */ 46{ "Đ", "D", "DJ" }, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
46{"Ħ","H"}, 47{ "Ħ", "H" },
47{"Ł","L"}, 48{ "Ł", "L" },
48{"Ŧ","T"}, 49{ "Ŧ", "T" },
49/* ˚ Ring */ 50/* ˚ Ring */
50{"Å","A","AA"}, 51{ "Å", "A", "AA" },
51{"Ů","U"}, 52{ "Ů", "U" },
52/* ˇ Caron (haček, paukščiukas) */ 53/* ˇ Caron (haček, paukščiukas) */
53{"Č","C"}, 54{ "Č", "C" },
54{"Ď","D"}, 55{ "Ď", "D" },
55{"Ě","E"}, 56{ "Ě", "E" },
56{"Ľ","L"}, 57{ "Ľ", "L" },
57{"Ň","N"}, 58{ "Ň", "N" },
58{"Ř","R"}, 59{ "Ř", "R" },
59{"Š","S"}, 60{ "Š", "S" },
60{"Ť","T"}, 61{ "Ť", "T" },
61{"Ž","Z"}, 62{ "Ž", "Z" },
62/* / Slash */ 63/* / Slash */
63{"Ø","O","OE"}, 64{ "Ø", "O", "OE" },
64/* ¯ Macron */ 65/* ¯ Macron */
65{"Ā","A","AA"}, 66{ "Ā", "A", "AA" },
66{"Ē","E","EE"}, 67{ "Ē", "E", "EE" },
67{"Ī","I","II"}, 68{ "Ī", "I", "II" },
68{"Ō","O","OO"}, 69{ "Ō", "O", "OO" },
69{"Ū","U","UU"}, 70{ "Ū", "U", "UU" },
70/* ˘ Brevis */ 71/* ˘ Brevis */
71{"Ă","A"}, 72{ "Ă", "A" },
72{"Ĕ","E"}, 73{ "Ĕ", "E" },
73{"Ğ","G"}, 74{ "Ğ", "G" },
74{"Ĭ","I"}, 75{ "Ĭ", "I" },
75{"Ŏ","O"}, 76{ "Ŏ", "O" },
76{"Ŭ","U"}, 77{ "Ŭ", "U" },
77/* ^ Circumflex */ 78/* ^ Circumflex */
78{"Â","A"}, 79{ "Â", "A" },
79{"Ĉ","C"}, 80{ "Ĉ", "C" },
80{"Ê","E"}, 81{ "Ê", "E" },
81{"Ĝ","G"}, 82{ "Ĝ", "G" },
82{"Ĥ","H"}, 83{ "Ĥ", "H" },
83{"Î","I"}, 84{ "Î", "I" },
84{"Ĵ","J"}, 85{ "Ĵ", "J" },
85{"Ô","O"}, 86{ "Ô", "O" },
86{"Ŝ","S"}, 87{ "Ŝ", "S" },
87{"Û","U"}, 88{ "Û", "U" },
88{"Ŵ","W"}, 89{ "Ŵ", "W" },
89{"Ŷ","Y"}, 90{ "Ŷ", "Y" },
90/* ¸ Cedilla */ 91/* ¸ Cedilla */
91{"Ç","C"}, 92{ "Ç", "C" },
92{"Ģ","G","GJ"}, 93{ "Ģ", "G", "GJ" },
93{"Ķ","K","KJ"}, 94{ "Ķ", "K", "KJ" },
94{"Ļ","L","LJ"}, 95{ "Ļ", "L", "LJ" },
95{"Ņ","N","NJ"}, 96{ "Ņ", "N", "NJ" },
96{"Ŗ","R"}, 97{ "Ŗ", "R" },
97{"Ş","S"}, 98{ "Ş", "S" },
98{"Ţ","T"}, 99{ "Ţ", "T" },
99/* ~ Tilde */ 100/* ~ Tilde */
100{"Ã","A"}, 101{ "Ã", "A" },
101{"Ĩ","I"}, 102{ "Ĩ", "I" },
102{"Ñ","N"}, 103{ "Ñ", "N" },
103{"Õ","O"}, 104{ "Õ", "O" },
104{"Ũ","U"}, 105{ "Ũ", "U" },
105/* ` Grave */ 106/* ` Grave */
106{"À","A"}, 107{ "À", "A" },
107{"È","E"}, 108{ "È", "E" },
108{"Ì","I"}, 109{ "Ì", "I" },
109{"Ò","O"}, 110{ "Ò", "O" },
110{"Ù","U"}, 111{ "Ù", "U" },
111/* ligatures */ 112/* ligatures */
112{"Æ","A","AE"}, 113{ "Æ", "A", "AE" },
113{"IJ","IJ"}, 114{ "IJ", "IJ" },
114{"Œ","O","OE"}, 115{ "Œ", "O", "OE" },
115/* special letters */ 116/* special letters */
116{"Ð","D","DH"}, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */ 117{ "Ð", "D", "DH" }, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
117{"Ŋ","N","NG"}, 118{ "Ŋ", "N", "NG" },
118{"Þ","T","TH"}, 119{ "Þ", "T", "TH" },
119/* Small Diacritics */ 120/* Small Diacritics */
120/* ¨ Diaeresis */ 121/* ¨ Diaeresis */
121{"ä","a","ae"}, 122{ "ä", "a", "ae" },
122{"ë","e"}, 123{ "ë", "e" },
123{"ï","i"}, 124{ "ï", "i" },
124{"ö","o","oe"}, 125{ "ö", "o", "oe" },
125{"ü","u","ue"}, 126{ "ü", "u", "ue" },
126{"ÿ","y"}, 127{ "ÿ", "y" },
127/* ˝ Double Acute Accent */ 128/* ˝ Double Acute Accent */
128{"ő","o","ö"}, 129{ "ő", "o", "ö" },
129{"ű","u","ü"}, 130{ "ű", "u", "ü" },
130/* ´ Acute Accent */ 131/* ´ Acute Accent */
131{"á","a"}, 132{ "á", "a" },
132{"ć","c"}, 133{ "ć", "c" },
133{"é","e"}, 134{ "é", "e" },
134{"í","i"}, 135{ "í", "i" },
135{"ĺ","l"}, 136{ "ĺ", "l" },
136{"ń","n"}, 137{ "ń", "n" },
137{"ó","o"}, 138{ "ó", "o" },
138{"ŕ","r"}, 139{ "ŕ", "r" },
139{"ś","s"}, 140{ "ś", "s" },
140{"ú","u"}, 141{ "ú", "u" },
141{"ý","y"}, 142{ "ý", "y" },
142{"ź","z"}, 143{ "ź", "z" },
143/* ˛ Ogonek (nosinė) */ 144/* ˛ Ogonek (nosinė) */
144{"ą","a"}, 145{ "ą", "a" },
145{"ę","e"}, 146{ "ę", "e" },
146{"į","i"}, 147{ "į", "i" },
147{"ų","u"}, 148{ "ų", "u" },
148/* ˙ Dot (and dotless i) */ 149/* ˙ Dot (and dotless i) */
149{"ċ","c"}, 150{ "ċ", "c" },
150{"ė","e"}, 151{ "ė", "e" },
151{"ġ","g"}, 152{ "ġ", "g" },
152{"ı","i"}, 153{ "ı", "i" },
153{"ŀ","l"}, 154{ "ŀ", "l" },
154{"ż","z"}, 155{ "ż", "z" },
155/* – Stroke */ 156/* – Stroke */
156{"đ","d","dj"}, 157{ "đ", "d", "dj" },
157{"ħ","h"}, 158{ "ħ", "h" },
158{"ł","l"}, 159{ "ł", "l" },
159{"ŧ","t"}, 160{ "ŧ", "t" },
160/* ˚ Ring */ 161/* ˚ Ring */
161{"å","a", "aa"}, 162{ "å", "a", "aa" },
162{"ů","u"}, 163{ "ů", "u" },
163/* ˇ Caron (haček, paukščiukas) */ 164/* ˇ Caron (haček, paukščiukas) */
164{"č","c"}, 165{ "č", "c" },
165{"ď","d"}, 166{ "ď", "d" },
166{"ě","e"}, 167{ "ě", "e" },
167{"ľ","l"}, 168{ "ľ", "l" },
168{"ň","n"}, 169{ "ň", "n" },
169{"ř","r"}, 170{ "ř", "r" },
170{"š","s"}, 171{ "š", "s" },
171{"ť","t"}, 172{ "ť", "t" },
172{"ž","z"}, 173{ "ž", "z" },
173/* / Slash */ 174/* / Slash */
174{"ø","o", "oe"}, 175{ "ø", "o", "oe" },
175/* Macron */ 176/* Macron */
176{"ā","a","aa"}, 177{ "ā", "a", "aa" },
177{"ē","e","ee"}, 178{ "ē", "e", "ee" },
178{"ī","i","ii"}, 179{ "ī", "i", "ii" },
179{"ō","o","oo"}, 180{ "ō", "o", "oo" },
180{"ū","u","uu"}, 181{ "ū", "u", "uu" },
181/* ˘ Brevis */ 182/* ˘ Brevis */
182{"ă","a"}, 183{ "ă", "a" },
183{"ĕ","e"}, 184{ "ĕ", "e" },
184{"ğ","g"}, 185{ "ğ", "g" },
185{"ĭ","i"}, 186{ "ĭ", "i" },
186{"ŏ","o"}, 187{ "ŏ", "o" },
187{"ŭ","u"}, 188{ "ŭ", "u" },
188/* ^ Circumflex */ 189/* ^ Circumflex */
189{"â","a"}, 190{ "â", "a" },
190{"ĉ","c"}, 191{ "ĉ", "c" },
191{"ê","e"}, 192{ "ê", "e" },
192{"ĝ","g"}, 193{ "ĝ", "g" },
193{"ĥ","h"}, 194{ "ĥ", "h" },
194{"î","i"}, 195{ "î", "i" },
195{"ĵ","j"}, 196{ "ĵ", "j" },
196{"ô","o"}, 197{ "ô", "o" },
197{"ŝ","s"}, 198{ "ŝ", "s" },
198{"û","u"}, 199{ "û", "u" },
199{"ŵ","w"}, 200{ "ŵ", "w" },
200{"ŷ","y"}, 201{ "ŷ", "y" },
201/* ¸ Cedilla */ 202/* ¸ Cedilla */
202{"ç","c"}, 203{ "ç", "c" },
203{"ģ","g","gj"}, 204{ "ģ", "g", "gj" },
204{"ķ","k","kj"}, 205{ "ķ", "k", "kj" },
205{"ļ","l","lj"}, 206{ "ļ", "l", "lj" },
206{"ņ","n","nj"}, 207{ "ņ", "n", "nj" },
207{"ŗ","r"}, 208{ "ŗ", "r" },
208{"ş","s"}, 209{ "ş", "s" },
209{"ţ","t"}, 210{ "ţ", "t" },
210/* ~ Tilde */ 211/* ~ Tilde */
211{"ã","a"}, 212{ "ã", "a" },
212{"ĩ","i"}, 213{ "ĩ", "i" },
213{"õ","o"}, 214{ "õ", "o" },
214{"ñ","n"}, 215{ "ñ", "n" },
215{"ũ","u"}, 216{ "ũ", "u" },
216/* ` Grave */ 217/* ` Grave */
217{"à","a"}, 218{ "à", "a" },
218{"è","e"}, 219{ "è", "e" },
219{"ì","i"}, 220{ "ì", "i" },
220{"ò","o"}, 221{ "ò", "o" },
221{"ù","u"}, 222{ "ù", "u" },
222/* ligatures */ 223/* ligatures */
223{"æ","a","ae"}, 224{ "æ", "a", "ae" },
224{"ij","ij"}, 225{ "ij", "ij" },
225{"œ","o","oe"}, 226{ "œ", "o", "oe" },
226{"ß","s","ss"}, 227{ "ß", "s", "ss" },
227/* special letters */ 228/* special letters */
228{"ð","d","dh"}, 229{ "ð", "d", "dh" },
229{"ŋ","n","ng"}, 230{ "ŋ", "n", "ng" },
230{"þ","t","th"}, 231{ "þ", "t", "th" },
231 232
232/* Cyrillic capital */ 233/* Cyrillic capital */
233 234
234{"Ё","Е"}, 235{ "Ё", "Е" },
235{"Й","И"}, 236{ "Й", "И" },
236{"І","I"}, 237{ "І", "I" },
237{"Ї","I"}, 238{ "Ї", "I" },
238{"Ў","У"}, 239{ "Ў", "У" },
239{"Є","Е","Э"}, 240{ "Є", "Е", "Э" },
240{"Ґ","Г"}, 241{ "Ґ", "Г" },
241{"Ѓ","Г"}, 242{ "Ѓ", "Г" },
242{"Ђ","Д"}, 243{ "Ђ", "Д" },
243{"Ќ","К"}, 244{ "Ќ", "К" },
244//{"Љ","Л","ЛЬ"}, 245//{"Љ","Л","ЛЬ"},
245//{"Њ","Н","НЬ"}, 246 //{"Њ","Н","НЬ"},
246{"Џ","Ц"}, 247 { "Џ", "Ц" },
247 248
248/* Cyrillic small */ 249 /* Cyrillic small */
249 250
250{"ё","е"}, 251 { "ё", "е" },
251{"й","и"}, 252 { "й", "и" },
252{"і","i"}, 253 { "і", "i" },
253{"ї","i"}, 254 { "ї", "i" },
254{"ў","у"}, 255 { "ў", "у" },
255//{"є","е","э"}, 256 //{"є","е","э"},
256{"ґ","г"}, 257 { "ґ", "г" },
257{"ѓ","г"}, 258 { "ѓ", "г" },
258{"ђ","д"}, 259 { "ђ", "д" },
259{"ќ","к"}, 260 { "ќ", "к" },
260//{"љ","л","ль"}, 261 //{"љ","л","ль"},
261//{"њ","н","нь"}, 262 //{"њ","н","нь"},
262{"џ","ц"}, 263 { "џ", "ц" },
263 264
264}; 265};
265 266
266static GHashTable *special_hash; 267static GHashTable *special_hash;
267 268
268/* Array of strings for case conversion 269/* Array of strings for case conversion
269 * Even elements of array are strings of upper-case letters 270 * Even elements of array are strings of upper-case letters
270 * Odd elements of array are strings of lower-case letters, in the order corresponding to directly preceding even element. 271 * Odd elements of array are strings of lower-case letters, in the order corresponding to directly preceding even element.
271 * Last element of array should be NULL. 272 * Last element of array should be NULL.
272 */ 273 */
273static const char *upperlower[]={ 274static const char
275 *upperlower[] =
276 {
274/*Latin diacritics*/ 277 /*Latin diacritics*/
275"ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆIJŒÐŊÞ", 278 "ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆIJŒÐŊÞ",
276"äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩõñũàèìòùæijœðŋþ", 279 "äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩõñũàèìòùæijœðŋþ",
277/*Cyrillic*/ 280 /*Cyrillic*/
278"АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ", 281 "АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
279"абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў", 282 "абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",
280 283
281NULL 284 NULL };
282};
283 285
284static GHashTable *casefold_hash; 286static GHashTable *casefold_hash;
285 287
286
287struct special_pos { 288struct special_pos
289{
288 char **variants; 290 char **variants;
289 int n; 291 int n;
290 char *s1, *s2; 292 char *s1, *s2;
291}; 293};
292 294
293
294
295static char** 295static char**
296linguistics_get_special(char *str, char *end) 296linguistics_get_special(char *str, char *end)
297{ 297{
298 char buf[10]; 298 char buf[10];
299 int len; 299 int len;
300 if(!end) 300 if (!end)
301 end=g_utf8_find_next_char(str,NULL); 301 end = g_utf8_find_next_char(str, NULL);
302 len=end-str+1; 302 len = end - str + 1;
303 g_strlcpy(buf,str,len>10?10:len); 303 g_strlcpy(buf, str, len > 10 ? 10 : len);
304 return g_hash_table_lookup(special_hash,buf); 304 return g_hash_table_lookup(special_hash, buf);
305} 305}
306
307 306
308/* 307/*
309 * @brief Prepare an utf-8 string for case insensitive comparison. 308 * @brief Prepare an utf-8 string for case insensitive comparison.
310 * @param in String to prepare. 309 * @param in String to prepare.
311 * @return String prepared for case insensitive search. Result should be g_free()d after use. 310 * @return String prepared for case insensitive search. Result should be g_free()d after use.
312 */ 311 */
313char* 312char*
314linguistics_casefold(char *in) 313linguistics_casefold(char *in)
315{ 314{
316 int len=strlen(in); 315 int len = strlen(in);
317 char *src=in; 316 char *src = in;
318 char *ret=g_new(char,len+1); 317 char *ret=g_new(char,len+1);
319 char *dest=ret; 318 char *dest = ret;
320 char buf[10]; 319 char buf[10];
321 while(*src && dest-ret<len){ 320 while (*src && dest - ret < len)
321 {
322 if(*src>='A' && *src<='Z') { 322 if (*src >= 'A' && *src <= 'Z')
323 {
323 *dest++=*src++ - 'A' + 'a'; 324 *dest++ = *src++ - 'A' + 'a';
325 }
324 } else if (!(*src&128)) { 326 else if (!(*src & 128))
327 {
325 *dest++=*src++; 328 *dest++ = *src++;
329 }
326 } else { 330 else
331 {
327 int charlen; 332 int charlen;
328 char *tmp, *folded; 333 char *tmp, *folded;
329 tmp=g_utf8_find_next_char(src,NULL); 334 tmp = g_utf8_find_next_char(src, NULL);
330 charlen=tmp-src+1; 335 charlen = tmp - src + 1;
331 g_strlcpy(buf,src,charlen>10?10:charlen); 336 g_strlcpy(buf, src, charlen > 10 ? 10 : charlen);
332 folded=g_hash_table_lookup(casefold_hash,buf); 337 folded = g_hash_table_lookup(casefold_hash, buf);
333 338
334 if(folded) 339 if (folded)
335 { 340 {
336 while(*folded && dest-ret<len) 341 while (*folded && dest - ret < len)
337 *dest++=*folded++; 342 *dest++ = *folded++;
338 src=tmp; 343 src = tmp;
339 } 344 }
340 else 345 else
341 { 346 {
342 while(src<tmp && dest-ret<len) 347 while (src < tmp && dest - ret < len)
343 *dest++=*src++; 348 *dest++ = *src++;
344 } 349 }
345 } 350 }
346 } 351 }
347 *dest=0; 352 *dest = 0;
348 if(*src) 353 if (*src)
354 dbg(
355 0,
349 dbg(0,"Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",in,ret); 356 "Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",
357 in, ret);
350 return ret; 358 return ret;
351} 359}
352 360
353/** 361/**
354 * @brief Compare two strings using special characters expansion. 362 * @brief Compare two strings using special characters expansion.
357 * @param match second string to compare, special characters are not expanded. 365 * @param match second string to compare, special characters are not expanded.
358 * @param partial if = 1 then the str string may be longer than the match string, in which case the rest of str isn't analysed. 366 * @param partial if = 1 then the str string may be longer than the match string, in which case the rest of str isn't analysed.
359 * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp(). 367 * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp().
360 */ 368 */
361 369
362int
363linguistics_compare(char *str, char *match, int partial) 370int linguistics_compare(char *str, char *match, int partial)
364{ 371{
365 char *s1=str, *s2=match; 372 char *s1 = str, *s2 = match;
366 char **sp; 373 char **sp;
367 int ret=0; 374 int ret = 0;
368 int got_match; 375 int got_match;
369 GList *l=NULL; 376 GList *l = NULL;
370 while (*s1 && *s2) { 377 while (*s1 && *s2)
378 {
371 int j; 379 int j;
372 struct special_pos *spp; 380 struct special_pos *spp;
373 char *utf_boundary, *tmp; 381 char *utf_boundary, *tmp;
374 /* Skip all matching chars */ 382 /* Skip all matching chars */
375 for(j=0;s1[j] && s1[j]==s2[j];j++); 383 for (j = 0; s1[j] && s1[j] == s2[j]; j++)
384 ;
376 if(!s2[j] && (partial || !s1[j])) { 385 if (!s2[j] && (partial || !s1[j]))
386 {
377 /* MATCH! */ 387 /* MATCH! */
378 ret=0; 388 ret = 0;
379 break; 389 break;
380 } 390 }
381 /* Find beginning of first mismatching utf-8 encoded char */ 391 /* Find beginning of first mismatching utf-8 encoded char */
382 utf_boundary=s1; 392 utf_boundary = s1;
383 while(*(tmp=g_utf8_find_next_char(utf_boundary, NULL))) { 393 while (*(tmp = g_utf8_find_next_char(utf_boundary, NULL)))
394 {
384 if(tmp>s1+j) 395 if (tmp > s1 + j)
385 break; 396 break;
386 utf_boundary=tmp; 397 utf_boundary = tmp;
387 } 398 }
388 /* Push first mismatching char to the list if it's a special char */ 399 /* Push first mismatching char to the list if it's a special char */
389 sp=linguistics_get_special(utf_boundary,tmp); 400 sp = linguistics_get_special(utf_boundary, tmp);
390 401
391 if(sp) 402 if (sp)
392 { 403 {
393 spp=g_new(struct special_pos,1); 404 spp=g_new(struct special_pos,1);
394 spp->variants=sp; 405 spp->variants = sp;
395 spp->n=1; 406 spp->n = 1;
396 spp->s1=utf_boundary; 407 spp->s1 = utf_boundary;
397 spp->s2=s2+(utf_boundary-s1); 408 spp->s2 = s2 + (utf_boundary - s1);
398 l=g_list_prepend(l,spp); 409 l = g_list_prepend(l, spp);
399 } 410 }
400 411
401 /* Try to find a match using special char variants from the list */ 412 /* Try to find a match using special char variants from the list */
402 got_match=0; 413 got_match = 0;
403 while(l && !got_match) { 414 while (l && !got_match)
415 {
404 spp=l->data; 416 spp = l->data;
405 s1=spp->s1; 417 s1 = spp->s1;
406 s2=spp->s2; 418 s2 = spp->s2;
407 while(spp->n<3 && !got_match) { 419 while (spp->n < 3 && !got_match)
420 {
408 char *s=spp->variants[(spp->n)++]; 421 char *s = spp->variants[(spp->n)++];
409 int len; 422 int len;
410 if(!s) 423 if (!s)
411 break; 424 break;
412 len=strlen(s); 425 len = strlen(s);
413 if(!strncmp(s,s2,len)) { 426 if (!strncmp(s, s2, len))
427 {
414 s2+=len; 428 s2 += len;
415 s1+=strlen(spp->variants[0]); 429 s1 += strlen(spp->variants[0]);
416 got_match=1; 430 got_match = 1;
417 break; 431 break;
418 } 432 }
419 } 433 }
420 if(spp->n>=3 || !spp->variants[spp->n]) { 434 if (spp->n >= 3 || !spp->variants[spp->n])
435 {
421 /* No matches for current top list element, go to the closest special char towards beginning of the string */ 436 /* No matches for current top list element, go to the closest special char towards beginning of the string */
422 g_free(spp); 437 g_free(spp);
423 l=g_list_delete_link(l,l); 438 l = g_list_delete_link(l, l);
424 } 439 }
425 } 440 }
426 if(!got_match) { 441 if (!got_match)
442 {
427 /* NO MATCH 443 /* NO MATCH
428 * FIXME: If we're going to use this function to sort a string list alphabetically we should use 444 * FIXME: If we're going to use this function to sort a string list alphabetically we should use
429 * utf-aware comparison here. 445 * utf-aware comparison here.
430 */ 446 */
431 ret=1; 447 ret = 1;
432 break; 448 break;
433 } 449 }
434 } 450 }
435 while(l) { 451 while (l)
452 {
436 g_free(l->data); 453 g_free(l->data);
437 l=g_list_delete_link(l,l); 454 l = g_list_delete_link(l, l);
438 } 455 }
439 return ret; 456 return ret;
440} 457}
441
442 458
443char * 459char *
444linguistics_expand_special(char *str, int mode) 460linguistics_expand_special(char *str, int mode)
445{ 461{
446 char *in=str; 462 char *in = str;
447 char *out,*ret; 463 char *out, *ret;
448 int found=0; 464 int found = 0;
449 465
450 if (!str) 466 if (!str)
451 { 467 {
452 return NULL; 468 return NULL;
453 } 469 }
454 470
455 ret=g_strdup(str); 471 ret = g_strdup(str);
456 out=ret; 472 out = ret;
457 473
458 if (!mode) 474 if (!mode)
459 { 475 {
460 return ret; 476 return ret;
461 } 477 }
462 478
463 while (*in) 479 while (*in)
464 { 480 {
465 char *next=g_utf8_find_next_char(in, NULL); 481 char *next = g_utf8_find_next_char(in, NULL);
466 int i,len=next-in; 482 int i, len = next - in;
467 int match=0; 483 int match = 0;
468 if (len > 1) 484 if (len > 1)
469 { 485 {
470 for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++) 486 for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
471 { 487 {
472 const char *search=special[i][0]; 488 const char *search = special[i][0];
473 if (!strncmp(in,search,len)) 489 if (!strncmp(in, search, len))
474 { 490 {
475 const char *replace=special[i][mode]; 491 const char *replace = special[i][mode];
476 if (replace) 492 if (replace)
477 { 493 {
478 int replace_len=strlen(replace); 494 int replace_len = strlen(replace);
479 495
480 if (replace_len > len) 496 if (replace_len > len)
481 { 497 {
482 fprintf(stderr,"* ERROR !! ERROR !! found %s %s %d %s %d\n",in,search,len,replace,replace_len); 498 fprintf(
499 stderr,
500 "* ERROR !! ERROR !! found %s %s %d %s %d\n",
501 in, search, len, replace, replace_len);
483 } 502 }
484 dbg_assert(replace_len <= len); 503 dbg_assert(replace_len <= len);
485 if (replace_len > len) 504 if (replace_len > len)
486 { 505 {
487 out+=len; 506 out += len;
488 match=0; 507 match = 0;
489 break; 508 break;
490 } 509 }
491 else 510 else
492 { 511 {
493 // fprintf(stderr," GOOD !! GOOD !! found %s %s %d %s %d\n",in,search,len,replace,replace_len); 512 // fprintf(stderr," GOOD !! GOOD !! found %s %s %d %s %d\n",in,search,len,replace,replace_len);
494 strcpy(out, replace); 513 strcpy(out, replace);
495 out+=replace_len; 514 out += replace_len;
496 match=1; 515 match = 1;
497 break; 516 break;
498 } 517 }
499 } 518 }
500 } 519 }
501 } 520 }
502 } 521 }
503 522
504 if (match) 523 if (match)
505 { 524 {
506 found=1; 525 found = 1;
507 in=next; 526 in = next;
508 } 527 }
509 else 528 else
510 { 529 {
511 while (len-- > 0) 530 while (len-- > 0)
512 { 531 {
513 *out++=*in++; 532 *out++ = *in++;
514 } 533 }
515 } 534 }
516 } 535 }
517 *out++='\0'; 536 *out++ = '\0';
518 if (!found) 537 if (!found)
519 { 538 {
520 if (ret) 539 if (ret)
521 { 540 {
522 g_free(ret); 541 g_free(ret);
523 } 542 }
524 ret=NULL; 543 ret = NULL;
525 } 544 }
526 return ret; 545 return ret;
527} 546}
528 547
529char * 548char *
530linguistics_next_word(char *str) 549linguistics_next_word(char *str)
531{ 550{
532 char* ret=strtok(str, " -/()\"\',.;_[]{}\\"); 551 char* ret = strtok(str, " -/()\"\',.;_[]{}\\");
533 return ret; 552 return ret;
534 553
535// int len=strcspn(str, " -/()"); 554 // int len=strcspn(str, " -/()");
536// if (!str[len] || !str[len+1]) 555 // if (!str[len] || !str[len+1])
537// return NULL; 556 // return NULL;
538// return str+len+1; 557 // return str+len+1;
539 558
540} 559}
541 560
542int
543linguistics_search(char *str) 561int linguistics_search(char *str)
544{ 562{
545 if (!g_strcasecmp(str,"str")) 563 if (!g_strcasecmp(str, "str"))
546 return 0; 564 return 0;
547 if (!g_strcasecmp(str,"str.")) 565 if (!g_strcasecmp(str, "str."))
548 return 0; 566 return 0;
549 if (!g_strcasecmp(str,"strasse")) 567 if (!g_strcasecmp(str, "strasse"))
550 return 0; 568 return 0;
551 if (!g_strcasecmp(str,"weg")) 569 if (!g_strcasecmp(str, "weg"))
552 return 0; 570 return 0;
553 return 1; 571 return 1;
554} 572}
555 573
556/** 574/**
557 * @brief Copy one utf8 encoded char to newly allocated buffer. 575 * @brief Copy one utf8 encoded char to newly allocated buffer.
558 * 576 *
559 * @param s pointer to the beginning of the char. 577 * @param s pointer to the beginning of the char.
560 * @return newly allocated nul-terminated string containing one utf8 encoded character. 578 * @return newly allocated nul-terminated string containing one utf8 encoded character.
561 */ 579 */
562static char
563*linguistics_dup_utf8_char(const char *s) 580static char *linguistics_dup_utf8_char(const char *s)
564{ 581{
565 char *ret, *next; 582 char *ret, *next;
566 next=g_utf8_find_next_char(s,NULL); 583 next = g_utf8_find_next_char(s, NULL);
567 ret=g_new(char, next-s+1); 584 ret=g_new(char, next-s+1);
568 g_strlcpy(ret,s,next-s+1); 585 g_strlcpy(ret, s, next - s + 1);
569 return ret; 586 return ret;
570} 587}
571 588
572void
573linguistics_init(void) 589void linguistics_init(void)
574{ 590{
575 int i; 591 int i;
576 special_hash=g_hash_table_new(g_str_hash, g_str_equal); 592 special_hash = g_hash_table_new(g_str_hash, g_str_equal);
577 casefold_hash=g_hash_table_new(g_str_hash, g_str_equal); 593 casefold_hash = g_hash_table_new(g_str_hash, g_str_equal);
578 594
579 for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++) 595 for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
580 { 596 {
581 g_hash_table_insert(special_hash,(gpointer)special[i][0],special[i]); 597 g_hash_table_insert(special_hash, (gpointer) special[i][0], special[i]);
582 } 598 }
583 599
584 for (i = 0 ; upperlower[i]; i+=2) 600 for (i = 0; upperlower[i]; i += 2)
585 { 601 {
586 int j,k; 602 int j, k;
587 for(j=0,k=0;upperlower[i][j] && upperlower[i+1][k];) 603 for (j = 0, k = 0; upperlower[i][j] && upperlower[i + 1][k];)
588 { 604 {
589 char *s1=linguistics_dup_utf8_char(upperlower[i]+j); 605 char *s1 = linguistics_dup_utf8_char(upperlower[i] + j);
590 char *s2=linguistics_dup_utf8_char(upperlower[i+1]+k); 606 char *s2 = linguistics_dup_utf8_char(upperlower[i + 1] + k);
591 g_hash_table_insert(casefold_hash,s1,s2); 607 g_hash_table_insert(casefold_hash, s1, s2);
592 j+=strlen(s1); 608 j += strlen(s1);
593 k+=strlen(s2); 609 k += strlen(s2);
594 }
595 } 610 }
611 }
596} 612}
597 613

Legend:
Removed from v.26  
changed lines
  Added in v.27

   
Visit the ZANavi Wiki