/[zanavi_public1]/navit/navit/linguistics.c
ZANavi

Contents of /navit/navit/linguistics.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 31 - (show annotations) (download)
Mon Feb 4 17:41:59 2013 UTC (11 years, 1 month ago) by zoff99
File MIME type: text/plain
File size: 17500 byte(s)
new map version, lots of fixes and experimental new features
1 #include <string.h>
2 #include <stdio.h>
3 #include <glib.h>
4 #include "debug.h"
5 #include "linguistics.h"
6
/*
 * Replacement table for special (non-ASCII) letters.
 *
 * Each row is { letter, first replacement, second replacement }. Rows written
 * with only two strings leave the third slot NULL. Row order is significant
 * for code that scans the table linearly.
 */
static const char *special[][3] =
{
	/* ======== Capital Latin letters ======== */
	/* ¨ Diaeresis */
	{ "Ä", "A", "AE" }, { "Ë", "E" }, { "Ï", "I" }, { "Ö", "O", "OE" }, { "Ü", "U", "UE" }, { "Ÿ", "Y" },
	/* ˝ Double acute accent */
	{ "Ő", "O", "Ö" }, { "Ű", "U", "Ü" },
	/* ´ Acute accent */
	{ "Á", "A" }, { "Ć", "C" }, { "É", "E" }, { "Í", "I" }, { "Ĺ", "L" }, { "Ń", "N" },
	{ "Ó", "O" }, { "Ŕ", "R" }, { "Ś", "S" }, { "Ú", "U" }, { "Ý", "Y" }, { "Ź", "Z" },
	/* ˛ Ogonek (nosinė) */
	{ "Ą", "A" }, { "Ę", "E" }, { "Į", "I" }, { "Ų", "U" },
	/* ˙ Dot */
	{ "Ċ", "C" }, { "Ė", "E" }, { "Ġ", "G" }, { "İ", "I" }, { "Ŀ", "L" }, { "Ż", "Z" },
	/* – Stroke */
	{ "Đ", "D", "DJ" }, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
	{ "Ħ", "H" }, { "Ł", "L" }, { "Ŧ", "T" },
	/* ˚ Ring */
	{ "Å", "A", "AA" }, { "Ů", "U" },
	/* ˇ Caron (haček, paukščiukas) */
	{ "Č", "C" }, { "Ď", "D" }, { "Ě", "E" }, { "Ľ", "L" }, { "Ň", "N" },
	{ "Ř", "R" }, { "Š", "S" }, { "Ť", "T" }, { "Ž", "Z" },
	/* / Slash */
	{ "Ø", "O", "OE" },
	/* ¯ Macron */
	{ "Ā", "A", "AA" }, { "Ē", "E", "EE" }, { "Ī", "I", "II" }, { "Ō", "O", "OO" }, { "Ū", "U", "UU" },
	/* ˘ Breve */
	{ "Ă", "A" }, { "Ĕ", "E" }, { "Ğ", "G" }, { "Ĭ", "I" }, { "Ŏ", "O" }, { "Ŭ", "U" },
	/* ^ Circumflex */
	{ "Â", "A" }, { "Ĉ", "C" }, { "Ê", "E" }, { "Ĝ", "G" }, { "Ĥ", "H" }, { "Î", "I" },
	{ "Ĵ", "J" }, { "Ô", "O" }, { "Ŝ", "S" }, { "Û", "U" }, { "Ŵ", "W" }, { "Ŷ", "Y" },
	/* ¸ Cedilla */
	{ "Ç", "C" }, { "Ģ", "G", "GJ" }, { "Ķ", "K", "KJ" }, { "Ļ", "L", "LJ" },
	{ "Ņ", "N", "NJ" }, { "Ŗ", "R" }, { "Ş", "S" }, { "Ţ", "T" },
	/* ~ Tilde */
	{ "Ã", "A" }, { "Ĩ", "I" }, { "Ñ", "N" }, { "Õ", "O" }, { "Ũ", "U" },
	/* ` Grave */
	{ "À", "A" }, { "È", "E" }, { "Ì", "I" }, { "Ò", "O" }, { "Ù", "U" },
	/* Ligatures */
	{ "Æ", "A", "AE" },
	//{ "IJ", "IJ" },
	{ "Œ", "O", "OE" },
	/* Special letters */
	{ "Ð", "D", "DH" }, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
	{ "Ŋ", "N", "NG" }, { "Þ", "T", "TH" },

	/* ======== Small Latin letters ======== */
	/* ¨ Diaeresis */
	{ "ä", "a", "ae" }, { "ë", "e" }, { "ï", "i" }, { "ö", "o", "oe" }, { "ü", "u", "ue" }, { "ÿ", "y" },
	/* ˝ Double acute accent */
	{ "ő", "o", "ö" }, { "ű", "u", "ü" },
	/* ´ Acute accent */
	{ "á", "a" }, { "ć", "c" }, { "é", "e" }, { "í", "i" }, { "ĺ", "l" }, { "ń", "n" },
	{ "ó", "o" }, { "ŕ", "r" }, { "ś", "s" }, { "ú", "u" }, { "ý", "y" }, { "ź", "z" },
	/* ˛ Ogonek (nosinė) */
	{ "ą", "a" }, { "ę", "e" }, { "į", "i" }, { "ų", "u" },
	/* ˙ Dot (and dotless i) */
	{ "ċ", "c" }, { "ė", "e" }, { "ġ", "g" }, { "ı", "i" }, { "ŀ", "l" }, { "ż", "z" },
	/* – Stroke */
	{ "đ", "d", "dj" }, { "ħ", "h" }, { "ł", "l" }, { "ŧ", "t" },
	/* ˚ Ring */
	{ "å", "a", "aa" }, { "ů", "u" },
	/* ˇ Caron (haček, paukščiukas) */
	{ "č", "c" }, { "ď", "d" }, { "ě", "e" }, { "ľ", "l" }, { "ň", "n" },
	{ "ř", "r" }, { "š", "s" }, { "ť", "t" }, { "ž", "z" },
	/* / Slash */
	{ "ø", "o", "oe" },
	/* ¯ Macron */
	{ "ā", "a", "aa" }, { "ē", "e", "ee" }, { "ī", "i", "ii" }, { "ō", "o", "oo" }, { "ū", "u", "uu" },
	/* ˘ Breve */
	{ "ă", "a" }, { "ĕ", "e" }, { "ğ", "g" }, { "ĭ", "i" }, { "ŏ", "o" }, { "ŭ", "u" },
	/* ^ Circumflex */
	{ "â", "a" }, { "ĉ", "c" }, { "ê", "e" }, { "ĝ", "g" }, { "ĥ", "h" }, { "î", "i" },
	{ "ĵ", "j" }, { "ô", "o" }, { "ŝ", "s" }, { "û", "u" }, { "ŵ", "w" }, { "ŷ", "y" },
	/* ¸ Cedilla */
	{ "ç", "c" }, { "ģ", "g", "gj" }, { "ķ", "k", "kj" }, { "ļ", "l", "lj" },
	{ "ņ", "n", "nj" }, { "ŗ", "r" }, { "ş", "s" }, { "ţ", "t" },
	/* ~ Tilde */
	{ "ã", "a" }, { "ĩ", "i" }, { "õ", "o" }, { "ñ", "n" }, { "ũ", "u" },
	/* ` Grave */
	{ "à", "a" }, { "è", "e" }, { "ì", "i" }, { "ò", "o" }, { "ù", "u" },
	/* Ligatures */
	{ "æ", "a", "ae" },
	//{ "ij", "ij" },
	{ "œ", "o", "oe" }, { "ß", "s", "ss" },
	/* Special letters */
	{ "ð", "d", "dh" }, { "ŋ", "n", "ng" }, { "þ", "t", "th" },

	/* ======== Cyrillic capital letters ======== */
	{ "Ё", "Е" }, { "Й", "И" }, { "І", "I" }, { "Ї", "I" }, { "Ў", "У" },
	{ "Є", "Е", "Э" }, { "Ґ", "Г" }, { "Ѓ", "Г" }, { "Ђ", "Д" }, { "Ќ", "К" },
	//{"Љ","Л","ЛЬ"},
	//{"Њ","Н","НЬ"},
	{ "Џ", "Ц" },

	/* ======== Cyrillic small letters ======== */
	{ "ё", "е" }, { "й", "и" }, { "і", "i" }, { "ї", "i" }, { "ў", "у" },
	//{"є","е","э"},
	{ "ґ", "г" }, { "ѓ", "г" }, { "ђ", "д" }, { "ќ", "к" },
	//{"љ","л","ль"},
	//{"њ","н","нь"},
	{ "џ", "ц" },
};
266
267
268 static GHashTable *special_hash; /* special character (UTF-8 string) -> its row of special[]; filled by linguistics_init(), read by linguistics_get_special() */
269
/*
 * Array of strings for case conversion.
 *
 * Even elements are strings of upper-case letters; each odd element holds the
 * lower-case letters in exactly the same order as the directly preceding even
 * element, because the two strings are paired character by character.
 * The last element of the array must be NULL.
 */
static const char *upperlower[] =
{
	/* Latin diacritics (the tilde group must read A I N O U in both strings) */
	"ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆIJŒÐŊÞ",
	"äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩñõũàèìòùæijœðŋþ",
	/* Cyrillic */
	"АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
	"абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",

	NULL
};
286
287 static GHashTable *casefold_hash; /* upper-case character (UTF-8 string) -> lower-case counterpart; filled by linguistics_init(), read by linguistics_casefold() */
288
/*
 * Backtracking record used by linguistics_compare(): one special character
 * of the expanded string together with the replacement variants that have
 * not been tried yet.
 */
struct special_pos
{
	char **variants; /* table row: [0] the character itself, [1] and [2] its replacements */
	int n;           /* index of the next variant to try */
	char *s1;        /* position of the special character in the expanded string */
	char *s2;        /* corresponding position in the string it is compared against */
};
295
296 static char**
297 linguistics_get_special(char *str, char *end)
298 {
299 char buf[11];
300 int len;
301 if (!end)
302 {
303 end = g_utf8_find_next_char(str, NULL);
304 }
305 len = end - str + 1;
306 g_strlcpy(buf, str, len > 10 ? 10 : len);
307 return g_hash_table_lookup(special_hash, buf);
308 }
309
310 /*
311 * @brief Prepare an utf-8 string for case insensitive comparison.
312 * @param in String to prepeare.
313 * @return String prepared for case insensitive search. Result shoud be g_free()d after use.
314 */
315 char*
316 linguistics_casefold(char *in)
317 {
318 int len = strlen(in);
319 char *src = in;
320 //char *ret=g_new(char,len+1);
321 char *ret=g_new(char,len+20); // try to fix strange BUG
322 char *dest = ret;
323 char buf[10];
324
325 // string end
326 ret[19] = '\0';
327 // fprintf(stderr, "xxxsssssssssssss\n");
328
329 while (*src && ((dest - ret) < len))
330 {
331 if (*src >= 'A' && *src <= 'Z')
332 {
333 *dest++ = *src++ - 'A' + 'a';
334 }
335 else if (!(*src & 128))
336 {
337 *dest++ = *src++;
338 }
339 else
340 {
341 int charlen;
342 char *tmp, *folded;
343 tmp = g_utf8_find_next_char(src, NULL);
344 charlen = tmp - src + 1;
345 g_strlcpy(buf, src, charlen > 10 ? 10 : charlen);
346 folded = g_hash_table_lookup(casefold_hash, buf);
347
348 if (folded)
349 {
350 while (*folded && dest - ret < len)
351 {
352 *dest++ = *folded++;
353 }
354 src = tmp;
355 }
356 else
357 {
358 while (src < tmp && dest - ret < len)
359 {
360 *dest++ = *src++;
361 }
362 }
363 }
364 }
365
366 *dest = 0;
367 if (*src)
368 {
369 dbg(
370 0,
371 "Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",
372 in, ret);
373 }
374
375 return ret;
376 }
377
/**
 * @brief Run the complete normalisation chain on a string: case folding,
 *        removal of punctuation-like characters, then replacement of special
 *        letters by their first replacement.
 *
 * @param in string to normalise; NULL yields NULL.
 * @param free_input when non-zero, in is g_free()d before returning.
 * @return result of the last step that produced a string, or NULL.
 */
char* linguistics_fold_and_prepare_complete(char *in, int free_input)
{
	char *result;
	char *step;

	if (!in)
	{
		return NULL;
	}

	result = linguistics_casefold(in);
	if (result)
	{
		/* Each stage hands back a new string or NULL; on NULL the previous result is kept. */
		step = linguistics_remove_all_specials(result);
		if (step)
		{
			g_free(result);
			result = step;
		}

		step = linguistics_expand_special(result, 1);
		if (step)
		{
			g_free(result);
			result = step;
		}
	}

	if (free_input)
	{
		g_free(in);
	}

	return result;
}
416
417
418
419
/*
 * Verify that "in" points to valid "modified UTF-8" data, i.e. only 1-, 2-
 * and 3-byte sequences. 4-byte sequences (lead byte 1111xxxx) are valid in
 * normal UTF-8 but are rejected here.
 *
 * The string is truncated in place at the first byte of the first invalid or
 * incomplete sequence, so what remains is entirely usable.
 *
 * returns: the input pointer (nothing to free), or NULL if in is NULL
 */
char* linguistics_check_utf8_string(char* in)
{
	char *p = in;

	if (p == NULL)
	{
		return NULL;
	}

	while (*p != '\0')
	{
		char *start = p;
		/* Read as unsigned: a plain (possibly signed) char would sign-extend
		 * and the high nibble would no longer be in the range 0x0..0xf. */
		unsigned char lead = (unsigned char) *p++;
		int extra;

		switch (lead >> 4)
		{
			case 0x08:
			case 0x09:
			case 0x0a:
			case 0x0b:
			case 0x0f:
				/* Bit pattern 10xx or 1111: illegal start byte. */
				*start = '\0';
				return in;
			case 0x0e:
				/* Bit pattern 1110: two continuation bytes follow. */
				extra = 2;
				break;
			case 0x0c:
			case 0x0d:
				/* Bit pattern 110x: one continuation byte follows. */
				extra = 1;
				break;
			default:
				/* Bit pattern 0xxx: plain ASCII, no extra bytes. */
				extra = 0;
				break;
		}

		while (extra-- > 0)
		{
			/* A NUL here also fails the test, so an incomplete sequence at
			 * the end of the string is cut off without reading past it. */
			if (((unsigned char) *p & 0xc0) != 0x80)
			{
				*start = '\0';
				return in;
			}
			p++;
		}
	}

	return in;
}
496
497
498
/*
 *
 * @brief find match anywhere in str (only complete match, not partial!)
 *        both strings should have been folded (and specials removed) before calling this function
 *
 * The whole of match has to occur somewhere inside str. UTF-8 is
 * self-synchronising, so for valid input a byte-wise substring search can
 * only hit on character boundaries and no per-character stepping is needed.
 * An empty match is found in every string.
 *
 * @param str   string to search in
 * @param match string to look for
 * @return =0 on match =1 on not matched (also when either argument is NULL)
 *
 */

int linguistics_compare_anywhere(char *str, char *match)
{
	if ((str == NULL) || (match == NULL))
	{
		return 1;
	}

	return (strstr(str, match) != NULL) ? 0 : 1;
}
557
558
559 /**
560 * @brief Compare two strings using special characters expansion.
561 *
562 * @param str first string to compare, special characters are expanded.
563 * @param match second string to compare, special characters are not expanded.
564 * @param partial if = 1 then str string may be shorter than match string, in which case the rest from str isn't analysed.
565 * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp().
566 */
567
568 int linguistics_compare(char *str, char *match, int partial)
569 {
570
571 if ((str == NULL)||(match == NULL))
572 {
573 return 1;
574 }
575
576 char *s1 = str, *s2 = match;
577 char **sp;
578 int ret = 0;
579 int got_match;
580 GList *l = NULL;
581 while (*s1 && *s2)
582 {
583 int j;
584 struct special_pos *spp;
585 char *utf_boundary, *tmp;
586 /* Skip all matching chars */
587 for (j = 0; s1[j] && s1[j] == s2[j]; j++)
588 {
589 ;
590 }
591
592 if (!s2[j] && (partial || !s1[j]))
593 {
594 /* MATCH! */
595 ret = 0;
596 break;
597 }
598
599 /* Find beginning of first mismatching utf-8 encoded char */
600 utf_boundary = s1;
601 while (*(tmp = g_utf8_find_next_char(utf_boundary, NULL)))
602 {
603 if (tmp > s1 + j)
604 {
605 break;
606 }
607 utf_boundary = tmp;
608 }
609
610 /* Push first mismatching char to the list if it's a special char */
611 sp = linguistics_get_special(utf_boundary, tmp);
612
613 if (sp)
614 {
615 spp=g_new(struct special_pos,1);
616 spp->variants = sp;
617 spp->n = 1;
618 spp->s1 = utf_boundary;
619 spp->s2 = s2 + (utf_boundary - s1);
620 l = g_list_prepend(l, spp);
621 }
622
623 /* Try to find a match using special char variants from the list */
624 got_match = 0;
625 while (l && !got_match)
626 {
627 spp = l->data;
628 s1 = spp->s1;
629 s2 = spp->s2;
630 while (spp->n < 3 && !got_match)
631 {
632 char *s = spp->variants[(spp->n)++];
633 int len;
634 if (!s)
635 break;
636 len = strlen(s);
637 if (!strncmp(s, s2, len))
638 {
639 s2 += len;
640 s1 += strlen(spp->variants[0]);
641 got_match = 1;
642 break;
643 }
644 }
645 if (spp->n >= 3 || !spp->variants[spp->n])
646 {
647 /* No matches for current top list element, go to the closest special char towards beginning of the string */
648 g_free(spp);
649 l = g_list_delete_link(l, l);
650 }
651 }
652
653 if (!got_match)
654 {
655 /* NO MATCH
656 * FIXME: If we're going to use this function to sort a string list alphabetically we should use
657 * utf-aware comparison here.
658 */
659 ret = 1;
660 break;
661 }
662 }
663
664 while (l)
665 {
666 g_free(l->data);
667 l = g_list_delete_link(l, l);
668 }
669 return ret;
670 }
671
672 char *
673 linguistics_expand_special(char *str, int mode)
674 {
675 char *in = str;
676 char *out, *ret;
677 int found = 0;
678
679 if (!str)
680 {
681 return NULL;
682 }
683
684 ret = g_strdup(str);
685 out = ret;
686
687 if (!mode)
688 {
689 return ret;
690 }
691
692 while (*in)
693 {
694 char *next = g_utf8_find_next_char(in, NULL);
695 int i, len = next - in;
696 int match = 0;
697 if (len > 1)
698 {
699 for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
700 {
701 const char *search = special[i][0];
702 if (!strncmp(in, search, len))
703 {
704 const char *replace = special[i][mode];
705 if (replace)
706 {
707 int replace_len = strlen(replace);
708
709 if (replace_len > len)
710 {
711 fprintf(
712 stderr,
713 "* ERROR !! ERROR !! found %s %s %d %s %d\n",
714 in, search, len, replace, replace_len);
715 }
716 dbg_assert(replace_len <= len);
717 if (replace_len > len)
718 {
719 out += len;
720 match = 0;
721 break;
722 }
723 else
724 {
725 // fprintf(stderr," GOOD !! GOOD !! found %s %s %d %s %d\n",in,search,len,replace,replace_len);
726 strcpy(out, replace);
727 out += replace_len;
728 match = 1;
729 break;
730 }
731 }
732 }
733 }
734 }
735
736 if (match)
737 {
738 found = 1;
739 in = next;
740 }
741 else
742 {
743 while (len-- > 0)
744 {
745 *out++ = *in++;
746 }
747 }
748 }
749 *out++ = '\0';
750 if (!found)
751 {
752 if (ret)
753 {
754 g_free(ret);
755 }
756 ret = NULL;
757 }
758 return ret;
759 }
760
/**
 * @brief Return a copy of a string with every space character (' ') removed.
 *
 * Filtering byte by byte is safe for UTF-8 because no byte of a multi-byte
 * sequence has the value of an ASCII space.
 *
 * @param str string to copy; NULL yields NULL.
 * @return newly allocated string, to be g_free()d by the caller.
 */
char *linguistics_remove_all_spaces(char *str)
{
	const char *p;
	char *ret;
	char *out;

	if (str == NULL)
	{
		return NULL;
	}

	/* The result is never longer than the input, so a duplicate is large enough. */
	ret = g_strdup(str);
	out = ret;
	for (p = str; *p; p++)
	{
		if (*p != ' ')
		{
			*out++ = *p;
		}
	}
	*out = '\0';

	return ret;
}
787
// special characters (ASCII and multi-byte) that are stripped from strings
static const char *remove_those = " _-.—,;:*#?=%&$§!@~()[]{}'`´^°|<>\\/\n\r\t\"\'";

/* Return 1 if the len bytes long UTF-8 character at p is one of the characters in remove_those, else 0. */
static int linguistics_is_removable(const char *p, int len)
{
	const char *r = remove_those;

	while (*r)
	{
		const char *r_next = g_utf8_find_next_char(r, NULL);

		if ((r_next - r) == len && !memcmp(p, r, len))
		{
			return 1;
		}
		r = r_next;
	}
	return 0;
}

/**
 * @brief Return a copy of a string without the characters listed in remove_those.
 *
 * Whole UTF-8 characters are compared, so the multi-byte entries of the list
 * are removed as well as the ASCII ones.
 * NOTE(review): strings normalised before this change kept those multi-byte
 * characters - confirm nothing stores previously normalised strings.
 *
 * @param str string to copy; NULL yields NULL.
 * @return newly allocated string, to be g_free()d by the caller.
 */
char *linguistics_remove_all_specials(char *str)
{
	const char *p;
	const char *next;
	char *ret;
	char *out;
	int len;

	if (str == NULL)
	{
		return NULL;
	}

	/* The result is never longer than the input, so a duplicate is large enough. */
	ret = g_strdup(str);
	out = ret;
	for (p = str; *p; p = next)
	{
		next = g_utf8_find_next_char(p, NULL);
		len = next - p;
		if (!linguistics_is_removable(p, len))
		{
			memcpy(out, p, len);
			out += len;
		}
	}
	*out = '\0';

	return ret;
}
840
841
/**
 * @brief Split a string into words with strtok().
 *
 * As with strtok(), the string is modified in place and the scan position is
 * kept between calls: pass the string on the first call and NULL to continue.
 *
 * @param str string to start tokenising, or NULL to continue the previous one.
 * @return next word, or NULL when there are no more words.
 */
char *
linguistics_next_word(char *str)
{
	static const char word_delimiters[] = " -/()\"\',.;_[]{}\\";

	return strtok(str, word_delimiters);
}
854
/**
 * @brief Test a word against a fixed list of street-type words.
 *
 * The comparison ignores ASCII case and does not depend on the current
 * locale.
 * NOTE(review): presumably callers use this to skip such words when
 * searching - confirm against the callers.
 *
 * @param str word to test.
 * @return 0 if str is "str", "str.", "strasse" or "weg", otherwise 1.
 */
int linguistics_search(char *str)
{
	static const char * const street_words[] = { "str", "str.", "strasse", "weg" };
	size_t i;

	for (i = 0; i < sizeof(street_words) / sizeof(street_words[0]); i++)
	{
		if (!g_ascii_strcasecmp(str, street_words[i]))
		{
			return 0;
		}
	}
	return 1;
}
867
868 /**
869 * @brief Copy one utf8 encoded char to newly allocated buffer.
870 *
871 * @param s pointer to the beginning of the char.
872 * @return newly allocated nul-terminated string containing one utf8 encoded character.
873 */
874 static char *linguistics_dup_utf8_char(const char *s)
875 {
876 char *ret, *next;
877 next = g_utf8_find_next_char(s, NULL);
878 ret=g_new(char, next-s+1);
879 g_strlcpy(ret, s, next - s + 1);
880 return ret;
881 }
882
883 void linguistics_init(void)
884 {
885 int i;
886 special_hash = g_hash_table_new(g_str_hash, g_str_equal);
887 casefold_hash = g_hash_table_new(g_str_hash, g_str_equal);
888
889 for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
890 {
891 g_hash_table_insert(special_hash, (gpointer) special[i][0], special[i]);
892 }
893
894 for (i = 0; upperlower[i]; i += 2)
895 {
896 int j, k;
897 for (j = 0, k = 0; upperlower[i][j] && upperlower[i + 1][k];)
898 {
899 char *s1 = linguistics_dup_utf8_char(upperlower[i] + j);
900 char *s2 = linguistics_dup_utf8_char(upperlower[i + 1] + k);
901 g_hash_table_insert(casefold_hash, s1, s2);
902 j += strlen(s1);
903 k += strlen(s2);
904 }
905 }
906 }
907

   
Visit the ZANavi Wiki