/[zanavi_public1]/navit/navit/linguistics.c
ZANavi

Contents of /navit/navit/linguistics.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 27 - (show annotations) (download)
Mon Apr 9 21:27:36 2012 UTC (12 years ago) by zoff99
File MIME type: text/plain
File size: 12273 byte(s)
lots of new stuff, translations, bug fixes ...
1 #include <string.h>
2 #include <stdio.h>
3 #include <glib.h>
4 #include "debug.h"
5 #include "linguistics.h"
6
/*
 * Table of special (non-ASCII) characters and their replacements.
 *
 * Each row is { character, replacement 1, replacement 2 }:
 *   [0] the special character, exactly one UTF-8 encoded character;
 *   [1] its first replacement (used by linguistics_expand_special() mode 1);
 *   [2] an optional second replacement (mode 2); NULL when the row has
 *       only two initialisers.
 *
 * linguistics_expand_special() rewrites strings in a buffer of the original
 * size, so a replacement must never be longer (in bytes) than the character
 * it replaces.
 */
static const char *special[][3] =
{
	/* Capital Diacritics */
	/* ¨ Diaeresis */
	{ "Ä", "A", "AE" },
	{ "Ë", "E" },
	{ "Ï", "I" },
	{ "Ö", "O", "OE" },
	{ "Ü", "U", "UE" },
	{ "Ÿ", "Y" },
	/* ˝ Double Acute Accent */
	{ "Ő", "O", "Ö" },
	{ "Ű", "U", "Ü" },
	/* ´ Acute Accent */
	{ "Á", "A" },
	{ "Ć", "C" },
	{ "É", "E" },
	{ "Í", "I" },
	{ "Ĺ", "L" },
	{ "Ń", "N" },
	{ "Ó", "O" },
	{ "Ŕ", "R" },
	{ "Ś", "S" },
	{ "Ú", "U" },
	{ "Ý", "Y" },
	{ "Ź", "Z" },
	/* ˛ Ogonek (nosinė) */
	{ "Ą", "A" },
	{ "Ę", "E" },
	{ "Į", "I" },
	{ "Ų", "U" },
	/* ˙ Dot */
	{ "Ċ", "C" },
	{ "Ė", "E" },
	{ "Ġ", "G" },
	{ "İ", "I" },
	{ "Ŀ", "L" },
	{ "Ż", "Z" },
	/* – Stroke */
	{ "Đ", "D", "DJ" }, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
	{ "Ħ", "H" },
	{ "Ł", "L" },
	{ "Ŧ", "T" },
	/* ˚ Ring */
	{ "Å", "A", "AA" },
	{ "Ů", "U" },
	/* ˇ Caron (haček, paukščiukas) */
	{ "Č", "C" },
	{ "Ď", "D" },
	{ "Ě", "E" },
	{ "Ľ", "L" },
	{ "Ň", "N" },
	{ "Ř", "R" },
	{ "Š", "S" },
	{ "Ť", "T" },
	{ "Ž", "Z" },
	/* / Slash */
	{ "Ø", "O", "OE" },
	/* ¯ Macron */
	{ "Ā", "A", "AA" },
	{ "Ē", "E", "EE" },
	{ "Ī", "I", "II" },
	{ "Ō", "O", "OO" },
	{ "Ū", "U", "UU" },
	/* ˘ Breve */
	{ "Ă", "A" },
	{ "Ĕ", "E" },
	{ "Ğ", "G" },
	{ "Ĭ", "I" },
	{ "Ŏ", "O" },
	{ "Ŭ", "U" },
	/* ^ Circumflex */
	{ "Â", "A" },
	{ "Ĉ", "C" },
	{ "Ê", "E" },
	{ "Ĝ", "G" },
	{ "Ĥ", "H" },
	{ "Î", "I" },
	{ "Ĵ", "J" },
	{ "Ô", "O" },
	{ "Ŝ", "S" },
	{ "Û", "U" },
	{ "Ŵ", "W" },
	{ "Ŷ", "Y" },
	/* ¸ Cedilla */
	{ "Ç", "C" },
	{ "Ģ", "G", "GJ" },
	{ "Ķ", "K", "KJ" },
	{ "Ļ", "L", "LJ" },
	{ "Ņ", "N", "NJ" },
	{ "Ŗ", "R" },
	{ "Ş", "S" },
	{ "Ţ", "T" },
	/* ~ Tilde */
	{ "Ã", "A" },
	{ "Ĩ", "I" },
	{ "Ñ", "N" },
	{ "Õ", "O" },
	{ "Ũ", "U" },
	/* ` Grave */
	{ "À", "A" },
	{ "È", "E" },
	{ "Ì", "I" },
	{ "Ò", "O" },
	{ "Ù", "U" },
	/* ligatures */
	{ "Æ", "A", "AE" },
	{ "Ĳ", "IJ" }, /* single code point U+0132, not the two letters I and J */
	{ "Œ", "O", "OE" },
	/* special letters */
	{ "Ð", "D", "DH" }, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
	{ "Ŋ", "N", "NG" },
	{ "Þ", "T", "TH" },
	/* Small Diacritics */
	/* ¨ Diaeresis */
	{ "ä", "a", "ae" },
	{ "ë", "e" },
	{ "ï", "i" },
	{ "ö", "o", "oe" },
	{ "ü", "u", "ue" },
	{ "ÿ", "y" },
	/* ˝ Double Acute Accent */
	{ "ő", "o", "ö" },
	{ "ű", "u", "ü" },
	/* ´ Acute Accent */
	{ "á", "a" },
	{ "ć", "c" },
	{ "é", "e" },
	{ "í", "i" },
	{ "ĺ", "l" },
	{ "ń", "n" },
	{ "ó", "o" },
	{ "ŕ", "r" },
	{ "ś", "s" },
	{ "ú", "u" },
	{ "ý", "y" },
	{ "ź", "z" },
	/* ˛ Ogonek (nosinė) */
	{ "ą", "a" },
	{ "ę", "e" },
	{ "į", "i" },
	{ "ų", "u" },
	/* ˙ Dot (and dotless i) */
	{ "ċ", "c" },
	{ "ė", "e" },
	{ "ġ", "g" },
	{ "ı", "i" },
	{ "ŀ", "l" },
	{ "ż", "z" },
	/* – Stroke */
	{ "đ", "d", "dj" },
	{ "ħ", "h" },
	{ "ł", "l" },
	{ "ŧ", "t" },
	/* ˚ Ring */
	{ "å", "a", "aa" },
	{ "ů", "u" },
	/* ˇ Caron (haček, paukščiukas) */
	{ "č", "c" },
	{ "ď", "d" },
	{ "ě", "e" },
	{ "ľ", "l" },
	{ "ň", "n" },
	{ "ř", "r" },
	{ "š", "s" },
	{ "ť", "t" },
	{ "ž", "z" },
	/* / Slash */
	{ "ø", "o", "oe" },
	/* ¯ Macron */
	{ "ā", "a", "aa" },
	{ "ē", "e", "ee" },
	{ "ī", "i", "ii" },
	{ "ō", "o", "oo" },
	{ "ū", "u", "uu" },
	/* ˘ Breve */
	{ "ă", "a" },
	{ "ĕ", "e" },
	{ "ğ", "g" },
	{ "ĭ", "i" },
	{ "ŏ", "o" },
	{ "ŭ", "u" },
	/* ^ Circumflex */
	{ "â", "a" },
	{ "ĉ", "c" },
	{ "ê", "e" },
	{ "ĝ", "g" },
	{ "ĥ", "h" },
	{ "î", "i" },
	{ "ĵ", "j" },
	{ "ô", "o" },
	{ "ŝ", "s" },
	{ "û", "u" },
	{ "ŵ", "w" },
	{ "ŷ", "y" },
	/* ¸ Cedilla */
	{ "ç", "c" },
	{ "ģ", "g", "gj" },
	{ "ķ", "k", "kj" },
	{ "ļ", "l", "lj" },
	{ "ņ", "n", "nj" },
	{ "ŗ", "r" },
	{ "ş", "s" },
	{ "ţ", "t" },
	/* ~ Tilde */
	{ "ã", "a" },
	{ "ĩ", "i" },
	{ "õ", "o" },
	{ "ñ", "n" },
	{ "ũ", "u" },
	/* ` Grave */
	{ "à", "a" },
	{ "è", "e" },
	{ "ì", "i" },
	{ "ò", "o" },
	{ "ù", "u" },
	/* ligatures */
	{ "æ", "a", "ae" },
	{ "ĳ", "ij" }, /* single code point U+0133, not the two letters i and j */
	{ "œ", "o", "oe" },
	{ "ß", "s", "ss" },
	/* special letters */
	{ "ð", "d", "dh" },
	{ "ŋ", "n", "ng" },
	{ "þ", "t", "th" },

	/* Cyrillic capital */

	{ "Ё", "Е" },
	{ "Й", "И" },
	{ "І", "I" },
	{ "Ї", "I" },
	{ "Ў", "У" },
	{ "Є", "Е", "Э" },
	{ "Ґ", "Г" },
	{ "Ѓ", "Г" },
	{ "Ђ", "Д" },
	{ "Ќ", "К" },
	/* Disabled: the two-letter replacement is 4 bytes, longer than the 2-byte character. */
	//{"Љ","Л","ЛЬ"},
	//{"Њ","Н","НЬ"},
	{ "Џ", "Ц" },

	/* Cyrillic small */

	{ "ё", "е" },
	{ "й", "и" },
	{ "і", "i" },
	{ "ї", "i" },
	{ "ў", "у" },
	/* NOTE(review): disabled although the capital counterpart is enabled - reason not visible here. */
	//{"є","е","э"},
	{ "ґ", "г" },
	{ "ѓ", "г" },
	{ "ђ", "д" },
	{ "ќ", "к" },
	/* Disabled: the two-letter replacement is 4 bytes, longer than the 2-byte character. */
	//{"љ","л","ль"},
	//{"њ","н","нь"},
	{ "џ", "ц" },

};
266
/* Lookup table filled by linguistics_init(): key is the special character
 * (column 0 of special[]), value is the whole special[] row. */
static GHashTable *special_hash;
268
/* Array of strings for case conversion.
 * Even elements of the array are strings of upper-case letters.
 * Odd elements of the array are strings of lower-case letters, in the order
 * corresponding to the directly preceding even element: the n-th character
 * of an odd element is the lower-case form of the n-th character of the
 * even element before it, so both strings of a pair must list their letters
 * in exactly the same order.
 * Last element of the array should be NULL.
 */
static const char
*upperlower[] =
{
	/* Latin diacritics */
	"ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆĲŒÐŊÞ",
	"äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩñõũàèìòùæĳœðŋþ",
	/* Cyrillic */
	"АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
	"абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",

	NULL
};
285
/* Lookup table filled by linguistics_init() from upperlower[]: key is one
 * upper-case UTF-8 character, value is its lower-case counterpart, both as
 * separately allocated NUL-terminated strings. */
static GHashTable *casefold_hash;
287
/* Backtracking record kept by linguistics_compare() for one special
 * character encountered in its first string. */
struct special_pos
{
	char **variants; /* row of special[]: [0] is the character, [1] and [2] its replacements */
	int n; /* index of the next variant to try */
	char *s1, *s2; /* start of the character in the first string / matching position in the second */
};
294
295 static char**
296 linguistics_get_special(char *str, char *end)
297 {
298 char buf[10];
299 int len;
300 if (!end)
301 end = g_utf8_find_next_char(str, NULL);
302 len = end - str + 1;
303 g_strlcpy(buf, str, len > 10 ? 10 : len);
304 return g_hash_table_lookup(special_hash, buf);
305 }
306
307 /*
308 * @brief Prepare an utf-8 string for case insensitive comparison.
309 * @param in String to prepeare.
310 * @return String prepared for case insensitive search. Result shoud be g_free()d after use.
311 */
312 char*
313 linguistics_casefold(char *in)
314 {
315 int len = strlen(in);
316 char *src = in;
317 char *ret=g_new(char,len+1);
318 char *dest = ret;
319 char buf[10];
320 while (*src && dest - ret < len)
321 {
322 if (*src >= 'A' && *src <= 'Z')
323 {
324 *dest++ = *src++ - 'A' + 'a';
325 }
326 else if (!(*src & 128))
327 {
328 *dest++ = *src++;
329 }
330 else
331 {
332 int charlen;
333 char *tmp, *folded;
334 tmp = g_utf8_find_next_char(src, NULL);
335 charlen = tmp - src + 1;
336 g_strlcpy(buf, src, charlen > 10 ? 10 : charlen);
337 folded = g_hash_table_lookup(casefold_hash, buf);
338
339 if (folded)
340 {
341 while (*folded && dest - ret < len)
342 *dest++ = *folded++;
343 src = tmp;
344 }
345 else
346 {
347 while (src < tmp && dest - ret < len)
348 *dest++ = *src++;
349 }
350 }
351 }
352 *dest = 0;
353 if (*src)
354 dbg(
355 0,
356 "Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",
357 in, ret);
358 return ret;
359 }
360
361 /**
362 * @brief Compare two strings using special characters expansion.
363 *
364 * @param str first string to compare, special characters are expanded.
365 * @param match second string to compare, special characters are not expanded.
366 * @param partial if = 1 then str string may be shorter than match string, in which case the rest from str isn't analysed.
367 * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp().
368 */
369
370 int linguistics_compare(char *str, char *match, int partial)
371 {
372 char *s1 = str, *s2 = match;
373 char **sp;
374 int ret = 0;
375 int got_match;
376 GList *l = NULL;
377 while (*s1 && *s2)
378 {
379 int j;
380 struct special_pos *spp;
381 char *utf_boundary, *tmp;
382 /* Skip all matching chars */
383 for (j = 0; s1[j] && s1[j] == s2[j]; j++)
384 ;
385 if (!s2[j] && (partial || !s1[j]))
386 {
387 /* MATCH! */
388 ret = 0;
389 break;
390 }
391 /* Find beginning of first mismatching utf-8 encoded char */
392 utf_boundary = s1;
393 while (*(tmp = g_utf8_find_next_char(utf_boundary, NULL)))
394 {
395 if (tmp > s1 + j)
396 break;
397 utf_boundary = tmp;
398 }
399 /* Push first mismatching char to the list if it's a special char */
400 sp = linguistics_get_special(utf_boundary, tmp);
401
402 if (sp)
403 {
404 spp=g_new(struct special_pos,1);
405 spp->variants = sp;
406 spp->n = 1;
407 spp->s1 = utf_boundary;
408 spp->s2 = s2 + (utf_boundary - s1);
409 l = g_list_prepend(l, spp);
410 }
411
412 /* Try to find a match using special char variants from the list */
413 got_match = 0;
414 while (l && !got_match)
415 {
416 spp = l->data;
417 s1 = spp->s1;
418 s2 = spp->s2;
419 while (spp->n < 3 && !got_match)
420 {
421 char *s = spp->variants[(spp->n)++];
422 int len;
423 if (!s)
424 break;
425 len = strlen(s);
426 if (!strncmp(s, s2, len))
427 {
428 s2 += len;
429 s1 += strlen(spp->variants[0]);
430 got_match = 1;
431 break;
432 }
433 }
434 if (spp->n >= 3 || !spp->variants[spp->n])
435 {
436 /* No matches for current top list element, go to the closest special char towards beginning of the string */
437 g_free(spp);
438 l = g_list_delete_link(l, l);
439 }
440 }
441 if (!got_match)
442 {
443 /* NO MATCH
444 * FIXME: If we're going to use this function to sort a string list alphabetically we should use
445 * utf-aware comparison here.
446 */
447 ret = 1;
448 break;
449 }
450 }
451 while (l)
452 {
453 g_free(l->data);
454 l = g_list_delete_link(l, l);
455 }
456 return ret;
457 }
458
459 char *
460 linguistics_expand_special(char *str, int mode)
461 {
462 char *in = str;
463 char *out, *ret;
464 int found = 0;
465
466 if (!str)
467 {
468 return NULL;
469 }
470
471 ret = g_strdup(str);
472 out = ret;
473
474 if (!mode)
475 {
476 return ret;
477 }
478
479 while (*in)
480 {
481 char *next = g_utf8_find_next_char(in, NULL);
482 int i, len = next - in;
483 int match = 0;
484 if (len > 1)
485 {
486 for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
487 {
488 const char *search = special[i][0];
489 if (!strncmp(in, search, len))
490 {
491 const char *replace = special[i][mode];
492 if (replace)
493 {
494 int replace_len = strlen(replace);
495
496 if (replace_len > len)
497 {
498 fprintf(
499 stderr,
500 "* ERROR !! ERROR !! found %s %s %d %s %d\n",
501 in, search, len, replace, replace_len);
502 }
503 dbg_assert(replace_len <= len);
504 if (replace_len > len)
505 {
506 out += len;
507 match = 0;
508 break;
509 }
510 else
511 {
512 // fprintf(stderr," GOOD !! GOOD !! found %s %s %d %s %d\n",in,search,len,replace,replace_len);
513 strcpy(out, replace);
514 out += replace_len;
515 match = 1;
516 break;
517 }
518 }
519 }
520 }
521 }
522
523 if (match)
524 {
525 found = 1;
526 in = next;
527 }
528 else
529 {
530 while (len-- > 0)
531 {
532 *out++ = *in++;
533 }
534 }
535 }
536 *out++ = '\0';
537 if (!found)
538 {
539 if (ret)
540 {
541 g_free(ret);
542 }
543 ret = NULL;
544 }
545 return ret;
546 }
547
/**
 * @brief Split off the next word of a string.
 *
 * Thin wrapper around strtok() with a fixed set of separators (blank and
 * punctuation). As with strtok(), str is modified in place, and passing
 * NULL continues with the string given in an earlier call.
 *
 * @param str String to tokenize, or NULL to continue.
 * @return Pointer to the next word, or NULL if there is none.
 */
char *
linguistics_next_word(char *str)
{
	static const char separators[] = " -/()\"\',.;_[]{}\\";

	return strtok(str, separators);
}
560
/**
 * @brief Check a word against a short list of generic street-name words.
 *
 * The comparison ignores ASCII case and does not depend on the current
 * locale.
 *
 * @param str Word to check.
 * @return 0 if str is one of "str", "str.", "strasse" or "weg", 1 otherwise.
 *         NOTE(review): presumably 0 tells the caller not to search for the
 *         word - confirm against the callers.
 */
int linguistics_search(char *str)
{
	static const char *const generic_words[] = { "str", "str.", "strasse", "weg" };
	size_t i;

	for (i = 0; i < sizeof(generic_words) / sizeof(generic_words[0]); i++)
	{
		if (!g_ascii_strcasecmp(str, generic_words[i]))
			return 0;
	}
	return 1;
}
573
574 /**
575 * @brief Copy one utf8 encoded char to newly allocated buffer.
576 *
577 * @param s pointer to the beginning of the char.
578 * @return newly allocated nul-terminated string containing one utf8 encoded character.
579 */
580 static char *linguistics_dup_utf8_char(const char *s)
581 {
582 char *ret, *next;
583 next = g_utf8_find_next_char(s, NULL);
584 ret=g_new(char, next-s+1);
585 g_strlcpy(ret, s, next - s + 1);
586 return ret;
587 }
588
589 void linguistics_init(void)
590 {
591 int i;
592 special_hash = g_hash_table_new(g_str_hash, g_str_equal);
593 casefold_hash = g_hash_table_new(g_str_hash, g_str_equal);
594
595 for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
596 {
597 g_hash_table_insert(special_hash, (gpointer) special[i][0], special[i]);
598 }
599
600 for (i = 0; upperlower[i]; i += 2)
601 {
602 int j, k;
603 for (j = 0, k = 0; upperlower[i][j] && upperlower[i + 1][k];)
604 {
605 char *s1 = linguistics_dup_utf8_char(upperlower[i] + j);
606 char *s2 = linguistics_dup_utf8_char(upperlower[i + 1] + k);
607 g_hash_table_insert(casefold_hash, s1, s2);
608 j += strlen(s1);
609 k += strlen(s2);
610 }
611 }
612 }
613

   
Visit the ZANavi Wiki