/[zanavi_public1]/navit/navit/linguistics.c
ZANavi

Contents of /navit/navit/linguistics.c

Parent Directory | Revision Log


Revision 15 - (show annotations) (download)
Mon Nov 21 20:54:48 2011 UTC (12 years, 4 months ago) by zoff99
File MIME type: text/plain
File size: 11173 byte(s)
better search, make search interruptible, new translations
1 #include <string.h>
2 #include <stdio.h>
3 #include <glib.h>
4 #include "debug.h"
5 #include "linguistics.h"
6
/*
 * Table of "special" (accented or otherwise non-basic) letters.
 *
 * Each row is { special, base [, alternative] }:
 *   [0] the special letter itself, always ONE multi-byte UTF-8 character;
 *       it is the lookup key used by the rest of this file,
 *   [1] the replacement letter(s),
 *   [2] an optional second replacement; rows that omit it get NULL there.
 *
 * No replacement may be longer (in bytes) than the letter it replaces:
 * linguistics_expand_special() rewrites into a buffer of the original size.
 */
static const char *special[][3]={
/* Capital Diacritics */
/* ¨ Diaeresis */
{"Ä","A","AE"},
{"Ë","E"},
{"Ï","I"},
{"Ö","O","OE"},
{"Ü","U","UE"},
{"Ÿ","Y"},
/* ˝ Double Acute Accent */
{"Ő","O","Ö"},
{"Ű","U","Ü"},
/* ´ Acute Accent */
{"Á","A"},
{"Ć","C"},
{"É","E"},
{"Í","I"},
{"Ĺ","L"},
{"Ń","N"},
{"Ó","O"},
{"Ŕ","R"},
{"Ś","S"},
{"Ú","U"},
{"Ý","Y"},
{"Ź","Z"},
/* ˛ Ogonek (nosinė) */
{"Ą","A"},
{"Ę","E"},
{"Į","I"},
{"Ų","U"},
/* ˙ Dot */
{"Ċ","C"},
{"Ė","E"},
{"Ġ","G"},
{"İ","I"},
{"Ŀ","L"},
{"Ż","Z"},
/* – Stroke */
{"Đ","D","DJ"}, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
{"Ħ","H"},
{"Ł","L"},
{"Ŧ","T"},
/* ˚ Ring */
{"Å","A","AA"},
{"Ů","U"},
/* ˇ Caron (haček, paukščiukas) */
{"Č","C"},
{"Ď","D"},
{"Ě","E"},
{"Ľ","L"},
{"Ň","N"},
{"Ř","R"},
{"Š","S"},
{"Ť","T"},
{"Ž","Z"},
/* / Slash */
{"Ø","O","OE"},
/* ¯ Macron */
{"Ā","A","AA"},
{"Ē","E","EE"},
{"Ī","I","II"},
{"Ō","O","OO"},
{"Ū","U","UU"},
/* ˘ Breve */
{"Ă","A"},
{"Ĕ","E"},
{"Ğ","G"},
{"Ĭ","I"},
{"Ŏ","O"},
{"Ŭ","U"},
/* ^ Circumflex */
{"Â","A"},
{"Ĉ","C"},
{"Ê","E"},
{"Ĝ","G"},
{"Ĥ","H"},
{"Î","I"},
{"Ĵ","J"},
{"Ô","O"},
{"Ŝ","S"},
{"Û","U"},
{"Ŵ","W"},
{"Ŷ","Y"},
/* ¸ Cedilla */
{"Ç","C"},
{"Ģ","G","GJ"},
{"Ķ","K","KJ"},
{"Ļ","L","LJ"},
{"Ņ","N","NJ"},
{"Ŗ","R"},
{"Ş","S"},
{"Ţ","T"},
/* ~ Tilde */
{"Ã","A"},
{"Ĩ","I"},
{"Ñ","N"},
{"Õ","O"},
{"Ũ","U"},
/* ` Grave */
{"À","A"},
{"È","E"},
{"Ì","I"},
{"Ò","O"},
{"Ù","U"},
/* ligatures */
{"Æ","A","AE"},
/* Capital IJ ligature, U+0132. The key is spelled as UTF-8 byte escapes so
 * that text tooling cannot fold it into the two ASCII letters "IJ"; a
 * two-letter key can never match, because lookups are per character. */
{"\xC4\xB2","IJ"},
{"Œ","O","OE"},
/* special letters */
{"Ð","D","DH"}, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
{"Ŋ","N","NG"},
{"Þ","T","TH"},
/* Small Diacritics */
/* ¨ Diaeresis */
{"ä","a","ae"},
{"ë","e"},
{"ï","i"},
{"ö","o","oe"},
{"ü","u","ue"},
{"ÿ","y"},
/* ˝ Double Acute Accent */
{"ő","o","ö"},
{"ű","u","ü"},
/* ´ Acute Accent */
{"á","a"},
{"ć","c"},
{"é","e"},
{"í","i"},
{"ĺ","l"},
{"ń","n"},
{"ó","o"},
{"ŕ","r"},
{"ś","s"},
{"ú","u"},
{"ý","y"},
{"ź","z"},
/* ˛ Ogonek (nosinė) */
{"ą","a"},
{"ę","e"},
{"į","i"},
{"ų","u"},
/* ˙ Dot (and dotless i) */
{"ċ","c"},
{"ė","e"},
{"ġ","g"},
{"ı","i"},
{"ŀ","l"},
{"ż","z"},
/* – Stroke */
{"đ","d","dj"},
{"ħ","h"},
{"ł","l"},
{"ŧ","t"},
/* ˚ Ring */
{"å","a", "aa"},
{"ů","u"},
/* ˇ Caron (haček, paukščiukas) */
{"č","c"},
{"ď","d"},
{"ě","e"},
{"ľ","l"},
{"ň","n"},
{"ř","r"},
{"š","s"},
{"ť","t"},
{"ž","z"},
/* / Slash */
{"ø","o", "oe"},
/* ¯ Macron */
{"ā","a","aa"},
{"ē","e","ee"},
{"ī","i","ii"},
{"ō","o","oo"},
{"ū","u","uu"},
/* ˘ Breve */
{"ă","a"},
{"ĕ","e"},
{"ğ","g"},
{"ĭ","i"},
{"ŏ","o"},
{"ŭ","u"},
/* ^ Circumflex */
{"â","a"},
{"ĉ","c"},
{"ê","e"},
{"ĝ","g"},
{"ĥ","h"},
{"î","i"},
{"ĵ","j"},
{"ô","o"},
{"ŝ","s"},
{"û","u"},
{"ŵ","w"},
{"ŷ","y"},
/* ¸ Cedilla */
{"ç","c"},
{"ģ","g","gj"},
{"ķ","k","kj"},
{"ļ","l","lj"},
{"ņ","n","nj"},
{"ŗ","r"},
{"ş","s"},
{"ţ","t"},
/* ~ Tilde */
{"ã","a"},
{"ĩ","i"},
{"õ","o"},
{"ñ","n"},
{"ũ","u"},
/* ` Grave */
{"à","a"},
{"è","e"},
{"ì","i"},
{"ò","o"},
{"ù","u"},
/* ligatures */
{"æ","a","ae"},
/* Small ij ligature, U+0133 (byte escapes for the same reason as U+0132). */
{"\xC4\xB3","ij"},
{"œ","o","oe"},
{"ß","s","ss"},
/* special letters */
{"ð","d","dh"},
{"ŋ","n","ng"},
{"þ","t","th"},

/* Cyrillic capital */

{"Ё","Е"},
{"Й","И"},
{"І","I"},
{"Ї","I"},
{"Ў","У"},
{"Є","Е","Э"},
{"Ґ","Г"},
{"Ѓ","Г"},
{"Ђ","Д"},
{"Ќ","К"},
/* Disabled: the two-letter alternative is 4 bytes, longer than the 2-byte
 * letter it would replace (presumably why these rows were switched off). */
//{"Љ","Л","ЛЬ"},
//{"Њ","Н","НЬ"},
{"Џ","Ц"},

/* Cyrillic small */

{"ё","е"},
{"й","и"},
{"і","i"},
{"ї","i"},
{"ў","у"},
/* NOTE(review): disabled although the capital counterpart is active;
 * the reason is not recorded - confirm before re-enabling. */
//{"є","е","э"},
{"ґ","г"},
{"ѓ","г"},
{"ђ","д"},
{"ќ","к"},
/* Disabled for the same length reason as the capital rows. */
//{"љ","л","ль"},
//{"њ","н","нь"},
{"џ","ц"},

};
265
266 static GHashTable *special_hash;
267
/* Array of strings for case conversion.
 * Even elements of the array are strings of upper-case letters.
 * Odd elements are strings of lower-case letters, in the order corresponding
 * to the directly preceding even element: the n-th character of the upper
 * string is paired with the n-th character of the lower string, so both
 * strings of a pair MUST list their letters in exactly the same order.
 * The last element of the array must be NULL.
 */
static const char *upperlower[]={
/*Latin diacritics*/
/* The IJ ligature (U+0132 / U+0133) is spelled as UTF-8 byte escapes so that
 * text tooling cannot fold it into the plain ASCII letters "IJ" / "ij". */
"ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆ" "\xC4\xB2" "ŒÐŊÞ",
/* Tilde group order is ã ĩ ñ õ ũ to line up with Ã Ĩ Ñ Õ Ũ above. */
"äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩñõũàèìòùæ" "\xC4\xB3" "œðŋþ",
/*Cyrillic*/
"АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
"абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",

NULL
};
283
284 static GHashTable *casefold_hash;
285
286
/*
 * Backtracking record used by linguistics_compare(): remembers one special
 * character met in the first string, and how far its variants were tried.
 */
struct special_pos {
	/* Row of the special-character table for this character:
	 * variants[0] is the character itself, variants[1..2] its replacements. */
	char **variants;
	/* Index into variants[] of the next replacement to try (starts at 1). */
	int n;
	/* Position of the special character in the first string. */
	char *s1;
	/* Position in the second string that lines up with s1. */
	char *s2;
};
292
293
294
295 static char**
296 linguistics_get_special(char *str, char *end)
297 {
298 char buf[10];
299 int len;
300 if(!end)
301 end=g_utf8_find_next_char(str,NULL);
302 len=end-str+1;
303 g_strlcpy(buf,str,len>10?10:len);
304 return g_hash_table_lookup(special_hash,buf);
305 }
306
307
308 /*
309 * @brief Prepare an utf-8 string for case insensitive comparison.
310 * @param in String to prepeare.
311 * @return String prepared for case insensitive search. Result shoud be g_free()d after use.
312 */
313 char*
314 linguistics_casefold(char *in)
315 {
316 int len=strlen(in);
317 char *src=in;
318 char *ret=g_new(char,len+1);
319 char *dest=ret;
320 char buf[10];
321 while(*src && dest-ret<len){
322 if(*src>='A' && *src<='Z') {
323 *dest++=*src++ - 'A' + 'a';
324 } else if (!(*src&128)) {
325 *dest++=*src++;
326 } else {
327 int charlen;
328 char *tmp, *folded;
329 tmp=g_utf8_find_next_char(src,NULL);
330 charlen=tmp-src+1;
331 g_strlcpy(buf,src,charlen>10?10:charlen);
332 folded=g_hash_table_lookup(casefold_hash,buf);
333
334 if(folded)
335 {
336 while(*folded && dest-ret<len)
337 *dest++=*folded++;
338 src=tmp;
339 }
340 else
341 {
342 while(src<tmp && dest-ret<len)
343 *dest++=*src++;
344 }
345 }
346 }
347 *dest=0;
348 if(*src)
349 dbg(0,"Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",in,ret);
350 return ret;
351 }
352
353 /**
354 * @brief Compare two strings using special characters expansion.
355 *
356 * @param str first string to compare, special characters are expanded.
357 * @param match second string to compare, special characters are not expanded.
358 * @param partial if = 1 then str string may be shorter than match string, in which case the rest from str isn't analysed.
359 * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp().
360 */
361
362 int
363 linguistics_compare(char *str, char *match, int partial)
364 {
365 char *s1=str, *s2=match;
366 char **sp;
367 int ret=0;
368 int got_match;
369 GList *l=NULL;
370 while (*s1 && *s2) {
371 int j;
372 struct special_pos *spp;
373 char *utf_boundary, *tmp;
374 /* Skip all matching chars */
375 for(j=0;s1[j] && s1[j]==s2[j];j++);
376 if(!s2[j] && (partial || !s1[j])) {
377 /* MATCH! */
378 ret=0;
379 break;
380 }
381 /* Find beginning of first mismatching utf-8 encoded char */
382 utf_boundary=s1;
383 while(*(tmp=g_utf8_find_next_char(utf_boundary, NULL))) {
384 if(tmp>s1+j)
385 break;
386 utf_boundary=tmp;
387 }
388 /* Push first mismatching char to the list if it's a special char */
389 sp=linguistics_get_special(utf_boundary,tmp);
390
391 if(sp)
392 {
393 spp=g_new(struct special_pos,1);
394 spp->variants=sp;
395 spp->n=1;
396 spp->s1=utf_boundary;
397 spp->s2=s2+(utf_boundary-s1);
398 l=g_list_prepend(l,spp);
399 }
400
401 /* Try to find a match using special char variants from the list */
402 got_match=0;
403 while(l && !got_match) {
404 spp=l->data;
405 s1=spp->s1;
406 s2=spp->s2;
407 while(spp->n<3 && !got_match) {
408 char *s=spp->variants[(spp->n)++];
409 int len;
410 if(!s)
411 break;
412 len=strlen(s);
413 if(!strncmp(s,s2,len)) {
414 s2+=len;
415 s1+=strlen(spp->variants[0]);
416 got_match=1;
417 break;
418 }
419 }
420 if(spp->n>=3 || !spp->variants[spp->n]) {
421 /* No matches for current top list element, go to the closest special char towards beginning of the string */
422 g_free(spp);
423 l=g_list_delete_link(l,l);
424 }
425 }
426 if(!got_match) {
427 /* NO MATCH
428 * FIXME: If we're going to use this function to sort a string list alphabetically we should use
429 * utf-aware comparison here.
430 */
431 ret=1;
432 break;
433 }
434 }
435 while(l) {
436 g_free(l->data);
437 l=g_list_delete_link(l,l);
438 }
439 return ret;
440 }
441
442
443 char *
444 linguistics_expand_special(char *str, int mode)
445 {
446 char *in=str;
447 char *out,*ret;
448 int found=0;
449
450 if (!str)
451 {
452 return NULL;
453 }
454
455 ret=g_strdup(str);
456 out=ret;
457
458 if (!mode)
459 {
460 return ret;
461 }
462
463 while (*in)
464 {
465 char *next=g_utf8_find_next_char(in, NULL);
466 int i,len=next-in;
467 int match=0;
468 if (len > 1)
469 {
470 for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++)
471 {
472 const char *search=special[i][0];
473 if (!strncmp(in,search,len))
474 {
475 const char *replace=special[i][mode];
476 if (replace)
477 {
478 int replace_len=strlen(replace);
479
480 if (replace_len > len)
481 {
482 fprintf(stderr,"* ERROR !! ERROR !! found %s %s %d %s %d\n",in,search,len,replace,replace_len);
483 }
484 dbg_assert(replace_len <= len);
485 if (replace_len > len)
486 {
487 out+=len;
488 match=0;
489 break;
490 }
491 else
492 {
493 // fprintf(stderr," GOOD !! GOOD !! found %s %s %d %s %d\n",in,search,len,replace,replace_len);
494 strcpy(out, replace);
495 out+=replace_len;
496 match=1;
497 break;
498 }
499 }
500 }
501 }
502 }
503
504 if (match)
505 {
506 found=1;
507 in=next;
508 }
509 else
510 {
511 while (len-- > 0)
512 {
513 *out++=*in++;
514 }
515 }
516 }
517 *out++='\0';
518 if (!found)
519 {
520 if (ret)
521 {
522 g_free(ret);
523 }
524 ret=NULL;
525 }
526 return ret;
527 }
528
/**
 * @brief Return the next word of a string.
 *
 * Thin wrapper around strtok(): pass the string on the first call and NULL on
 * the following calls to walk through its words. The string is modified in
 * place and the position is kept in strtok()'s internal state.
 *
 * @param str string to split, or NULL to continue with the previous string.
 * @return pointer to the next word, or NULL if there are no more words.
 */
char *
linguistics_next_word(char *str)
{
	/* characters that separate words */
	static const char separators[]=" -/()\"\',.;_[]{}\\";

	return strtok(str, separators);
}
541
/**
 * @brief Tell whether a word is one of a few fixed street-type words.
 *
 * The comparison is case insensitive (g_strcasecmp).
 * NOTE(review): presumably callers use the result to skip generic words when
 * searching - confirm against the callers.
 *
 * @param str word to check.
 * @return 0 if str is "str", "str.", "strasse" or "weg", 1 otherwise.
 */
int
linguistics_search(char *str)
{
	static const char *const generic_words[]={
		"str",
		"str.",
		"strasse",
		"weg",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(generic_words)/sizeof(generic_words[0]); idx++)
	{
		if (g_strcasecmp(str,generic_words[idx]) == 0)
			return 0;
	}
	return 1;
}
555
556 /**
557 * @brief Copy one utf8 encoded char to newly allocated buffer.
558 *
559 * @param s pointer to the beginning of the char.
560 * @return newly allocated nul-terminated string containing one utf8 encoded character.
561 */
562 static char
563 *linguistics_dup_utf8_char(const char *s)
564 {
565 char *ret, *next;
566 next=g_utf8_find_next_char(s,NULL);
567 ret=g_new(char, next-s+1);
568 g_strlcpy(ret,s,next-s+1);
569 return ret;
570 }
571
572 void
573 linguistics_init(void)
574 {
575 int i;
576 special_hash=g_hash_table_new(g_str_hash, g_str_equal);
577 casefold_hash=g_hash_table_new(g_str_hash, g_str_equal);
578
579 for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++)
580 {
581 g_hash_table_insert(special_hash,(gpointer)special[i][0],special[i]);
582 }
583
584 for (i = 0 ; upperlower[i]; i+=2)
585 {
586 int j,k;
587 for(j=0,k=0;upperlower[i][j] && upperlower[i+1][k];)
588 {
589 char *s1=linguistics_dup_utf8_char(upperlower[i]+j);
590 char *s2=linguistics_dup_utf8_char(upperlower[i+1]+k);
591 g_hash_table_insert(casefold_hash,s1,s2);
592 j+=strlen(s1);
593 k+=strlen(s2);
594 }
595 }
596 }
597

   
Visit the ZANavi Wiki