/[zanavi_public1]/navit/navit/linguistics.c
ZANavi

Contents of /navit/navit/linguistics.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 27 - (hide annotations) (download)
Mon Apr 9 21:27:36 2012 UTC (11 years, 11 months ago) by zoff99
File MIME type: text/plain
File size: 12273 byte(s)
lots of new stuff, translations, bug fixes ...
1 zoff99 2 #include <string.h>
2     #include <stdio.h>
3     #include <glib.h>
4     #include "debug.h"
5     #include "linguistics.h"
6    
/*
 * Transliteration table for accented and other special letters.
 *
 * Each row is { letter, replacement, alternative }:
 *   [0] the UTF-8 encoded special letter (used as lookup key),
 *   [1] its plain replacement,
 *   [2] an optional alternative replacement; rows that list only two
 *       strings leave this slot NULL.
 *
 * linguistics_expand_special() overwrites the letter with a replacement
 * inside a copy of the input, so no replacement may be longer (in bytes)
 * than its key.
 */
static const char *special[][3] = {
    /* Capital Diacritics */
    /* ¨ Diaeresis */
    { "Ä", "A", "AE" },
    { "Ë", "E" },
    { "Ï", "I" },
    { "Ö", "O", "OE" },
    { "Ü", "U", "UE" },
    { "Ÿ", "Y" },
    /* ˝ Double Acute Accent */
    { "Ő", "O", "Ö" },
    { "Ű", "U", "Ü" },
    /* ´ Acute Accent */
    { "Á", "A" },
    { "Ć", "C" },
    { "É", "E" },
    { "Í", "I" },
    { "Ĺ", "L" },
    { "Ń", "N" },
    { "Ó", "O" },
    { "Ŕ", "R" },
    { "Ś", "S" },
    { "Ú", "U" },
    { "Ý", "Y" },
    { "Ź", "Z" },
    /* ˛ Ogonek (nosinė) */
    { "Ą", "A" },
    { "Ę", "E" },
    { "Į", "I" },
    { "Ų", "U" },
    /* ˙ Dot */
    { "Ċ", "C" },
    { "Ė", "E" },
    { "Ġ", "G" },
    { "İ", "I" },
    { "Ŀ", "L" },
    { "Ż", "Z" },
    /* – Stroke */
    { "Đ", "D", "DJ" }, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
    { "Ħ", "H" },
    { "Ł", "L" },
    { "Ŧ", "T" },
    /* ˚ Ring */
    { "Å", "A", "AA" },
    { "Ů", "U" },
    /* ˇ Caron (haček, paukščiukas) */
    { "Č", "C" },
    { "Ď", "D" },
    { "Ě", "E" },
    { "Ľ", "L" },
    { "Ň", "N" },
    { "Ř", "R" },
    { "Š", "S" },
    { "Ť", "T" },
    { "Ž", "Z" },
    /* / Slash */
    { "Ø", "O", "OE" },
    /* ¯ Macron */
    { "Ā", "A", "AA" },
    { "Ē", "E", "EE" },
    { "Ī", "I", "II" },
    { "Ō", "O", "OO" },
    { "Ū", "U", "UU" },
    /* ˘ Breve */
    { "Ă", "A" },
    { "Ĕ", "E" },
    { "Ğ", "G" },
    { "Ĭ", "I" },
    { "Ŏ", "O" },
    { "Ŭ", "U" },
    /* ^ Circumflex */
    { "Â", "A" },
    { "Ĉ", "C" },
    { "Ê", "E" },
    { "Ĝ", "G" },
    { "Ĥ", "H" },
    { "Î", "I" },
    { "Ĵ", "J" },
    { "Ô", "O" },
    { "Ŝ", "S" },
    { "Û", "U" },
    { "Ŵ", "W" },
    { "Ŷ", "Y" },
    /* ¸ Cedilla */
    { "Ç", "C" },
    { "Ģ", "G", "GJ" },
    { "Ķ", "K", "KJ" },
    { "Ļ", "L", "LJ" },
    { "Ņ", "N", "NJ" },
    { "Ŗ", "R" },
    { "Ş", "S" },
    { "Ţ", "T" },
    /* ~ Tilde */
    { "Ã", "A" },
    { "Ĩ", "I" },
    { "Ñ", "N" },
    { "Õ", "O" },
    { "Ũ", "U" },
    /* ` Grave */
    { "À", "A" },
    { "È", "E" },
    { "Ì", "I" },
    { "Ò", "O" },
    { "Ù", "U" },
    /* ligatures */
    { "Æ", "A", "AE" },
    { "Ĳ", "IJ" }, /* key is the single ligature code point U+0132, not the two letters I and J */
    { "Œ", "O", "OE" },
    /* special letters */
    { "Ð", "D", "DH" }, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
    { "Ŋ", "N", "NG" },
    { "Þ", "T", "TH" },

    /* Small Diacritics */
    /* ¨ Diaeresis */
    { "ä", "a", "ae" },
    { "ë", "e" },
    { "ï", "i" },
    { "ö", "o", "oe" },
    { "ü", "u", "ue" },
    { "ÿ", "y" },
    /* ˝ Double Acute Accent */
    { "ő", "o", "ö" },
    { "ű", "u", "ü" },
    /* ´ Acute Accent */
    { "á", "a" },
    { "ć", "c" },
    { "é", "e" },
    { "í", "i" },
    { "ĺ", "l" },
    { "ń", "n" },
    { "ó", "o" },
    { "ŕ", "r" },
    { "ś", "s" },
    { "ú", "u" },
    { "ý", "y" },
    { "ź", "z" },
    /* ˛ Ogonek (nosinė) */
    { "ą", "a" },
    { "ę", "e" },
    { "į", "i" },
    { "ų", "u" },
    /* ˙ Dot (and dotless i) */
    { "ċ", "c" },
    { "ė", "e" },
    { "ġ", "g" },
    { "ı", "i" },
    { "ŀ", "l" },
    { "ż", "z" },
    /* – Stroke */
    { "đ", "d", "dj" },
    { "ħ", "h" },
    { "ł", "l" },
    { "ŧ", "t" },
    /* ˚ Ring */
    { "å", "a", "aa" },
    { "ů", "u" },
    /* ˇ Caron (haček, paukščiukas) */
    { "č", "c" },
    { "ď", "d" },
    { "ě", "e" },
    { "ľ", "l" },
    { "ň", "n" },
    { "ř", "r" },
    { "š", "s" },
    { "ť", "t" },
    { "ž", "z" },
    /* / Slash */
    { "ø", "o", "oe" },
    /* ¯ Macron */
    { "ā", "a", "aa" },
    { "ē", "e", "ee" },
    { "ī", "i", "ii" },
    { "ō", "o", "oo" },
    { "ū", "u", "uu" },
    /* ˘ Breve */
    { "ă", "a" },
    { "ĕ", "e" },
    { "ğ", "g" },
    { "ĭ", "i" },
    { "ŏ", "o" },
    { "ŭ", "u" },
    /* ^ Circumflex */
    { "â", "a" },
    { "ĉ", "c" },
    { "ê", "e" },
    { "ĝ", "g" },
    { "ĥ", "h" },
    { "î", "i" },
    { "ĵ", "j" },
    { "ô", "o" },
    { "ŝ", "s" },
    { "û", "u" },
    { "ŵ", "w" },
    { "ŷ", "y" },
    /* ¸ Cedilla */
    { "ç", "c" },
    { "ģ", "g", "gj" },
    { "ķ", "k", "kj" },
    { "ļ", "l", "lj" },
    { "ņ", "n", "nj" },
    { "ŗ", "r" },
    { "ş", "s" },
    { "ţ", "t" },
    /* ~ Tilde */
    { "ã", "a" },
    { "ĩ", "i" },
    { "õ", "o" },
    { "ñ", "n" },
    { "ũ", "u" },
    /* ` Grave */
    { "à", "a" },
    { "è", "e" },
    { "ì", "i" },
    { "ò", "o" },
    { "ù", "u" },
    /* ligatures */
    { "æ", "a", "ae" },
    { "ĳ", "ij" }, /* key is the single ligature code point U+0133, not the two letters i and j */
    { "œ", "o", "oe" },
    { "ß", "s", "ss" },
    /* special letters */
    { "ð", "d", "dh" },
    { "ŋ", "n", "ng" },
    { "þ", "t", "th" },

    /* Cyrillic capital */
    { "Ё", "Е" },
    { "Й", "И" },
    { "І", "I" },
    { "Ї", "I" },
    { "Ў", "У" },
    { "Є", "Е", "Э" },
    { "Ґ", "Г" },
    { "Ѓ", "Г" },
    { "Ђ", "Д" },
    { "Ќ", "К" },
    //{"Љ","Л","ЛЬ"},
    //{"Њ","Н","НЬ"},
    { "Џ", "Ц" },

    /* Cyrillic small */
    { "ё", "е" },
    { "й", "и" },
    { "і", "i" },
    { "ї", "i" },
    { "ў", "у" },
    //{"є","е","э"},
    { "ґ", "г" },
    { "ѓ", "г" },
    { "ђ", "д" },
    { "ќ", "к" },
    //{"љ","л","ль"},
    //{"њ","н","нь"},
    { "џ", "ц" },
};
266 zoff99 15
267 zoff99 2 static GHashTable *special_hash;
268    
/*
 * Array of strings for case conversion.
 * Even elements of the array are strings of upper-case letters.
 * Odd elements are strings of lower-case letters, in the order corresponding
 * to the directly preceding even element (the pairing is purely positional,
 * so both strings of a pair must list their letters in the same order).
 * The last element of the array must be NULL.
 */
static const char *upperlower[] = {
    /* Latin diacritics */
    "ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆĲŒÐŊÞ",
    "äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩñõũàèìòùæĳœðŋþ",
    /* Cyrillic */
    "АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
    "абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",

    NULL
};
285 zoff99 2
286     static GHashTable *casefold_hash;
287    
/*
 * Backtracking record used by linguistics_compare(): remembers where a
 * special letter was met so that its remaining variants can be tried later.
 */
struct special_pos
{
    char **variants; /* row of special[] for the letter: [0] letter, [1]/[2] replacements */
    int n;           /* index of the next variant to try (starts at 1) */
    char *s1, *s2;   /* positions in the first / second string where the letter was met */
};
294    
295 zoff99 27 static char**
296 zoff99 2 linguistics_get_special(char *str, char *end)
297     {
298     char buf[10];
299     int len;
300 zoff99 27 if (!end)
301     end = g_utf8_find_next_char(str, NULL);
302     len = end - str + 1;
303     g_strlcpy(buf, str, len > 10 ? 10 : len);
304     return g_hash_table_lookup(special_hash, buf);
305 zoff99 2 }
306    
307     /*
308     * @brief Prepare an utf-8 string for case insensitive comparison.
309     * @param in String to prepeare.
310     * @return String prepared for case insensitive search. Result shoud be g_free()d after use.
311     */
312     char*
313     linguistics_casefold(char *in)
314     {
315 zoff99 27 int len = strlen(in);
316     char *src = in;
317 zoff99 2 char *ret=g_new(char,len+1);
318 zoff99 27 char *dest = ret;
319 zoff99 2 char buf[10];
320 zoff99 27 while (*src && dest - ret < len)
321     {
322     if (*src >= 'A' && *src <= 'Z')
323     {
324     *dest++ = *src++ - 'A' + 'a';
325     }
326     else if (!(*src & 128))
327     {
328     *dest++ = *src++;
329     }
330     else
331     {
332 zoff99 2 int charlen;
333     char *tmp, *folded;
334 zoff99 27 tmp = g_utf8_find_next_char(src, NULL);
335     charlen = tmp - src + 1;
336     g_strlcpy(buf, src, charlen > 10 ? 10 : charlen);
337     folded = g_hash_table_lookup(casefold_hash, buf);
338 zoff99 15
339 zoff99 27 if (folded)
340 zoff99 15 {
341 zoff99 27 while (*folded && dest - ret < len)
342     *dest++ = *folded++;
343     src = tmp;
344 zoff99 15 }
345     else
346     {
347 zoff99 27 while (src < tmp && dest - ret < len)
348     *dest++ = *src++;
349 zoff99 2 }
350     }
351     }
352 zoff99 27 *dest = 0;
353     if (*src)
354     dbg(
355     0,
356     "Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",
357     in, ret);
358 zoff99 2 return ret;
359     }
360    
361     /**
362     * @brief Compare two strings using special characters expansion.
363     *
364     * @param str first string to compare, special characters are expanded.
365     * @param match second string to compare, special characters are not expanded.
366     * @param partial if = 1 then str string may be shorter than match string, in which case the rest from str isn't analysed.
367     * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp().
368     */
369    
370 zoff99 27 int linguistics_compare(char *str, char *match, int partial)
371 zoff99 2 {
372 zoff99 27 char *s1 = str, *s2 = match;
373 zoff99 2 char **sp;
374 zoff99 27 int ret = 0;
375 zoff99 2 int got_match;
376 zoff99 27 GList *l = NULL;
377     while (*s1 && *s2)
378     {
379 zoff99 2 int j;
380     struct special_pos *spp;
381     char *utf_boundary, *tmp;
382     /* Skip all matching chars */
383 zoff99 27 for (j = 0; s1[j] && s1[j] == s2[j]; j++)
384     ;
385     if (!s2[j] && (partial || !s1[j]))
386     {
387 zoff99 2 /* MATCH! */
388 zoff99 27 ret = 0;
389 zoff99 2 break;
390     }
391     /* Find beginning of first mismatching utf-8 encoded char */
392 zoff99 27 utf_boundary = s1;
393     while (*(tmp = g_utf8_find_next_char(utf_boundary, NULL)))
394     {
395     if (tmp > s1 + j)
396 zoff99 2 break;
397 zoff99 27 utf_boundary = tmp;
398 zoff99 2 }
399     /* Push first mismatching char to the list if it's a special char */
400 zoff99 27 sp = linguistics_get_special(utf_boundary, tmp);
401 zoff99 15
402 zoff99 27 if (sp)
403 zoff99 15 {
404 zoff99 2 spp=g_new(struct special_pos,1);
405 zoff99 27 spp->variants = sp;
406     spp->n = 1;
407     spp->s1 = utf_boundary;
408     spp->s2 = s2 + (utf_boundary - s1);
409     l = g_list_prepend(l, spp);
410     }
411 zoff99 2
412     /* Try to find a match using special char variants from the list */
413 zoff99 27 got_match = 0;
414     while (l && !got_match)
415     {
416     spp = l->data;
417     s1 = spp->s1;
418     s2 = spp->s2;
419     while (spp->n < 3 && !got_match)
420     {
421     char *s = spp->variants[(spp->n)++];
422 zoff99 2 int len;
423 zoff99 27 if (!s)
424 zoff99 2 break;
425 zoff99 27 len = strlen(s);
426     if (!strncmp(s, s2, len))
427     {
428     s2 += len;
429     s1 += strlen(spp->variants[0]);
430     got_match = 1;
431 zoff99 2 break;
432     }
433     }
434 zoff99 27 if (spp->n >= 3 || !spp->variants[spp->n])
435     {
436 zoff99 2 /* No matches for current top list element, go to the closest special char towards beginning of the string */
437     g_free(spp);
438 zoff99 27 l = g_list_delete_link(l, l);
439 zoff99 2 }
440     }
441 zoff99 27 if (!got_match)
442     {
443 zoff99 2 /* NO MATCH
444     * FIXME: If we're going to use this function to sort a string list alphabetically we should use
445     * utf-aware comparison here.
446     */
447 zoff99 27 ret = 1;
448 zoff99 2 break;
449     }
450     }
451 zoff99 27 while (l)
452     {
453 zoff99 2 g_free(l->data);
454 zoff99 27 l = g_list_delete_link(l, l);
455 zoff99 2 }
456     return ret;
457     }
458    
459     char *
460     linguistics_expand_special(char *str, int mode)
461     {
462 zoff99 27 char *in = str;
463     char *out, *ret;
464     int found = 0;
465 zoff99 2
466 zoff99 15 if (!str)
467     {
468     return NULL;
469     }
470    
471 zoff99 27 ret = g_strdup(str);
472     out = ret;
473 zoff99 15
474     if (!mode)
475     {
476 zoff99 2 return ret;
477 zoff99 15 }
478 zoff99 2
479     while (*in)
480     {
481 zoff99 27 char *next = g_utf8_find_next_char(in, NULL);
482     int i, len = next - in;
483     int match = 0;
484 zoff99 2 if (len > 1)
485     {
486 zoff99 27 for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
487 zoff99 2 {
488 zoff99 27 const char *search = special[i][0];
489     if (!strncmp(in, search, len))
490 zoff99 2 {
491 zoff99 27 const char *replace = special[i][mode];
492 zoff99 2 if (replace)
493     {
494 zoff99 27 int replace_len = strlen(replace);
495 zoff99 2
496 zoff99 15 if (replace_len > len)
497 zoff99 2 {
498 zoff99 27 fprintf(
499     stderr,
500     "* ERROR !! ERROR !! found %s %s %d %s %d\n",
501     in, search, len, replace, replace_len);
502 zoff99 2 }
503 zoff99 15 dbg_assert(replace_len <= len);
504     if (replace_len > len)
505     {
506 zoff99 27 out += len;
507     match = 0;
508 zoff99 15 break;
509     }
510 zoff99 2 else
511     {
512 zoff99 15 // fprintf(stderr," GOOD !! GOOD !! found %s %s %d %s %d\n",in,search,len,replace,replace_len);
513 zoff99 2 strcpy(out, replace);
514 zoff99 27 out += replace_len;
515     match = 1;
516 zoff99 2 break;
517     }
518     }
519     }
520     }
521     }
522    
523     if (match)
524     {
525 zoff99 27 found = 1;
526     in = next;
527 zoff99 2 }
528     else
529     {
530     while (len-- > 0)
531     {
532 zoff99 27 *out++ = *in++;
533 zoff99 2 }
534     }
535     }
536 zoff99 27 *out++ = '\0';
537 zoff99 2 if (!found)
538     {
539 zoff99 15 if (ret)
540     {
541     g_free(ret);
542     }
543 zoff99 27 ret = NULL;
544 zoff99 2 }
545     return ret;
546     }
547    
/**
 * @brief Return the next word of a string.
 *
 * Thin wrapper around strtok(): pass the string on the first call and NULL
 * on the following calls to continue with the same string. The string is
 * modified in place (separators are overwritten with terminators), and the
 * function shares strtok()'s internal state, so it is not reentrant.
 *
 * @param str string to split, or NULL to continue with the previous one.
 * @return pointer to the next word inside the original buffer, or NULL if
 *         there are no more words.
 */
char *
linguistics_next_word(char *str)
{
    return strtok(str, " -/()\"\',.;_[]{}\\");
}
560    
/**
 * @brief Check a word against a fixed list of German street name parts.
 *
 * The comparison ignores ASCII case and uses g_ascii_strcasecmp(), so it
 * does not depend on the current locale.
 *
 * @param str word to check.
 * @return 0 if str is one of "str", "str.", "strasse" or "weg", 1 otherwise.
 *         NOTE(review): presumably 0 marks words that are not worth
 *         searching for on their own - confirm against the callers.
 */
int linguistics_search(char *str)
{
    static const char *const generic_words[] = { "str", "str.", "strasse", "weg" };
    size_t i;

    for (i = 0; i < sizeof(generic_words) / sizeof(generic_words[0]); i++)
    {
        if (!g_ascii_strcasecmp(str, generic_words[i]))
            return 0;
    }
    return 1;
}
573    
574     /**
575     * @brief Copy one utf8 encoded char to newly allocated buffer.
576     *
577     * @param s pointer to the beginning of the char.
578     * @return newly allocated nul-terminated string containing one utf8 encoded character.
579     */
580 zoff99 27 static char *linguistics_dup_utf8_char(const char *s)
581 zoff99 2 {
582     char *ret, *next;
583 zoff99 27 next = g_utf8_find_next_char(s, NULL);
584 zoff99 2 ret=g_new(char, next-s+1);
585 zoff99 27 g_strlcpy(ret, s, next - s + 1);
586 zoff99 2 return ret;
587     }
588    
589 zoff99 27 void linguistics_init(void)
590 zoff99 2 {
591     int i;
592 zoff99 27 special_hash = g_hash_table_new(g_str_hash, g_str_equal);
593     casefold_hash = g_hash_table_new(g_str_hash, g_str_equal);
594 zoff99 2
595 zoff99 27 for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
596 zoff99 15 {
597 zoff99 27 g_hash_table_insert(special_hash, (gpointer) special[i][0], special[i]);
598 zoff99 15 }
599 zoff99 2
600 zoff99 27 for (i = 0; upperlower[i]; i += 2)
601 zoff99 15 {
602 zoff99 27 int j, k;
603     for (j = 0, k = 0; upperlower[i][j] && upperlower[i + 1][k];)
604 zoff99 15 {
605 zoff99 27 char *s1 = linguistics_dup_utf8_char(upperlower[i] + j);
606     char *s2 = linguistics_dup_utf8_char(upperlower[i + 1] + k);
607     g_hash_table_insert(casefold_hash, s1, s2);
608     j += strlen(s1);
609     k += strlen(s2);
610 zoff99 2 }
611     }
612     }
613    

   
Visit the ZANavi Wiki