/[zanavi_public1]/navit/navit/linguistics.c
ZANavi

Contents of /navit/navit/linguistics.c

Parent Directory | Revision Log


Revision 31 - (hide annotations) (download)
Mon Feb 4 17:41:59 2013 UTC (11 years, 1 month ago) by zoff99
File MIME type: text/plain
File size: 17500 byte(s)
new map version, lots of fixes and experimental new features
1 zoff99 2 #include <string.h>
2     #include <stdio.h>
3     #include <glib.h>
4     #include "debug.h"
5     #include "linguistics.h"
6    
/*
 * Replacement table for non-ASCII letters.
 *
 * Each row is { letter, first replacement, optional second replacement }.
 * Rows that give only two strings leave the third slot NULL.
 * linguistics_init() indexes the rows by their first column, and
 * linguistics_expand_special() picks column 1 or 2 depending on its mode.
 * linguistics_expand_special() refuses a replacement that is longer (in bytes)
 * than the letter it replaces.
 */
static const char *special[][3] =
{
	/* ---------- Capital diacritics ---------- */
	/* ¨ Diaeresis */
	{ "Ä", "A", "AE" }, { "Ë", "E" }, { "Ï", "I" },
	{ "Ö", "O", "OE" }, { "Ü", "U", "UE" }, { "Ÿ", "Y" },
	/* ˝ Double acute accent */
	{ "Ő", "O", "Ö" }, { "Ű", "U", "Ü" },
	/* ´ Acute accent */
	{ "Á", "A" }, { "Ć", "C" }, { "É", "E" }, { "Í", "I" },
	{ "Ĺ", "L" }, { "Ń", "N" }, { "Ó", "O" }, { "Ŕ", "R" },
	{ "Ś", "S" }, { "Ú", "U" }, { "Ý", "Y" }, { "Ź", "Z" },
	/* ˛ Ogonek (nosinė) */
	{ "Ą", "A" }, { "Ę", "E" }, { "Į", "I" }, { "Ų", "U" },
	/* ˙ Dot */
	{ "Ċ", "C" }, { "Ė", "E" }, { "Ġ", "G" },
	{ "İ", "I" }, { "Ŀ", "L" }, { "Ż", "Z" },
	/* – Stroke */
	{ "Đ", "D", "DJ" }, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
	{ "Ħ", "H" }, { "Ł", "L" }, { "Ŧ", "T" },
	/* ˚ Ring */
	{ "Å", "A", "AA" }, { "Ů", "U" },
	/* ˇ Caron (haček, paukščiukas) */
	{ "Č", "C" }, { "Ď", "D" }, { "Ě", "E" },
	{ "Ľ", "L" }, { "Ň", "N" }, { "Ř", "R" },
	{ "Š", "S" }, { "Ť", "T" }, { "Ž", "Z" },
	/* / Slash */
	{ "Ø", "O", "OE" },
	/* ¯ Macron */
	{ "Ā", "A", "AA" }, { "Ē", "E", "EE" }, { "Ī", "I", "II" },
	{ "Ō", "O", "OO" }, { "Ū", "U", "UU" },
	/* ˘ Brevis */
	{ "Ă", "A" }, { "Ĕ", "E" }, { "Ğ", "G" },
	{ "Ĭ", "I" }, { "Ŏ", "O" }, { "Ŭ", "U" },
	/* ^ Circumflex */
	{ "Â", "A" }, { "Ĉ", "C" }, { "Ê", "E" }, { "Ĝ", "G" },
	{ "Ĥ", "H" }, { "Î", "I" }, { "Ĵ", "J" }, { "Ô", "O" },
	{ "Ŝ", "S" }, { "Û", "U" }, { "Ŵ", "W" }, { "Ŷ", "Y" },
	/* ¸ Cedilla */
	{ "Ç", "C" }, { "Ģ", "G", "GJ" }, { "Ķ", "K", "KJ" }, { "Ļ", "L", "LJ" },
	{ "Ņ", "N", "NJ" }, { "Ŗ", "R" }, { "Ş", "S" }, { "Ţ", "T" },
	/* ~ Tilde */
	{ "Ã", "A" }, { "Ĩ", "I" }, { "Ñ", "N" }, { "Õ", "O" }, { "Ũ", "U" },
	/* ` Grave */
	{ "À", "A" }, { "È", "E" }, { "Ì", "I" }, { "Ò", "O" }, { "Ù", "U" },
	/* Ligatures */
	{ "Æ", "A", "AE" },
	//{ "IJ", "IJ" },   (disabled row)
	{ "Œ", "O", "OE" },
	/* Special letters */
	{ "Ð", "D", "DH" }, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
	{ "Ŋ", "N", "NG" }, { "Þ", "T", "TH" },

	/* ---------- Small diacritics ---------- */
	/* ¨ Diaeresis */
	{ "ä", "a", "ae" }, { "ë", "e" }, { "ï", "i" },
	{ "ö", "o", "oe" }, { "ü", "u", "ue" }, { "ÿ", "y" },
	/* ˝ Double acute accent */
	{ "ő", "o", "ö" }, { "ű", "u", "ü" },
	/* ´ Acute accent */
	{ "á", "a" }, { "ć", "c" }, { "é", "e" }, { "í", "i" },
	{ "ĺ", "l" }, { "ń", "n" }, { "ó", "o" }, { "ŕ", "r" },
	{ "ś", "s" }, { "ú", "u" }, { "ý", "y" }, { "ź", "z" },
	/* ˛ Ogonek (nosinė) */
	{ "ą", "a" }, { "ę", "e" }, { "į", "i" }, { "ų", "u" },
	/* ˙ Dot (and dotless i) */
	{ "ċ", "c" }, { "ė", "e" }, { "ġ", "g" },
	{ "ı", "i" }, { "ŀ", "l" }, { "ż", "z" },
	/* – Stroke */
	{ "đ", "d", "dj" }, { "ħ", "h" }, { "ł", "l" }, { "ŧ", "t" },
	/* ˚ Ring */
	{ "å", "a", "aa" }, { "ů", "u" },
	/* ˇ Caron (haček, paukščiukas) */
	{ "č", "c" }, { "ď", "d" }, { "ě", "e" },
	{ "ľ", "l" }, { "ň", "n" }, { "ř", "r" },
	{ "š", "s" }, { "ť", "t" }, { "ž", "z" },
	/* / Slash */
	{ "ø", "o", "oe" },
	/* ¯ Macron */
	{ "ā", "a", "aa" }, { "ē", "e", "ee" }, { "ī", "i", "ii" },
	{ "ō", "o", "oo" }, { "ū", "u", "uu" },
	/* ˘ Brevis */
	{ "ă", "a" }, { "ĕ", "e" }, { "ğ", "g" },
	{ "ĭ", "i" }, { "ŏ", "o" }, { "ŭ", "u" },
	/* ^ Circumflex */
	{ "â", "a" }, { "ĉ", "c" }, { "ê", "e" }, { "ĝ", "g" },
	{ "ĥ", "h" }, { "î", "i" }, { "ĵ", "j" }, { "ô", "o" },
	{ "ŝ", "s" }, { "û", "u" }, { "ŵ", "w" }, { "ŷ", "y" },
	/* ¸ Cedilla */
	{ "ç", "c" }, { "ģ", "g", "gj" }, { "ķ", "k", "kj" }, { "ļ", "l", "lj" },
	{ "ņ", "n", "nj" }, { "ŗ", "r" }, { "ş", "s" }, { "ţ", "t" },
	/* ~ Tilde */
	{ "ã", "a" }, { "ĩ", "i" }, { "õ", "o" }, { "ñ", "n" }, { "ũ", "u" },
	/* ` Grave */
	{ "à", "a" }, { "è", "e" }, { "ì", "i" }, { "ò", "o" }, { "ù", "u" },
	/* Ligatures */
	{ "æ", "a", "ae" },
	//{ "ij", "ij" },   (disabled row)
	{ "œ", "o", "oe" }, { "ß", "s", "ss" },
	/* Special letters */
	{ "ð", "d", "dh" }, { "ŋ", "n", "ng" }, { "þ", "t", "th" },

	/* ---------- Cyrillic capital ---------- */
	{ "Ё", "Е" }, { "Й", "И" }, { "І", "I" }, { "Ї", "I" }, { "Ў", "У" },
	{ "Є", "Е", "Э" },
	{ "Ґ", "Г" }, { "Ѓ", "Г" }, { "Ђ", "Д" }, { "Ќ", "К" },
	//{"Љ","Л","ЛЬ"},   (disabled row)
	//{"Њ","Н","НЬ"},   (disabled row)
	{ "Џ", "Ц" },

	/* ---------- Cyrillic small ---------- */
	{ "ё", "е" }, { "й", "и" }, { "і", "i" }, { "ї", "i" }, { "ў", "у" },
	//{"є","е","э"},    (disabled row)
	{ "ґ", "г" }, { "ѓ", "г" }, { "ђ", "д" }, { "ќ", "к" },
	//{"љ","л","ль"},   (disabled row)
	//{"њ","н","нь"},   (disabled row)
	{ "џ", "ц" },
};
266 zoff99 15
267 zoff99 31
/* Maps a special letter (first column of the special table) to its table row;
 * filled by linguistics_init() and queried by linguistics_get_special(). */
268 zoff99 2 static GHashTable *special_hash;
269    
/*
 * Array of strings for case conversion.
 *
 * Even elements are strings of upper-case letters; each odd element holds the
 * lower-case letters in exactly the same order as the even element directly
 * preceding it.  linguistics_init() walks both strings of a pair in lock-step,
 * one UTF-8 character at a time, so the two strings of a pair MUST list their
 * letters in the same order.
 * The last element of the array must be NULL.
 */
static const char *upperlower[] =
{
	/* Latin diacritics */
	"ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆIJŒÐŊÞ",
	/* The tilde group is "ãĩñõũ" to line up with "ÃĨÑÕŨ" above; it used to
	 * read "ãĩõñũ", which folded Ñ to õ and Õ to ñ. */
	"äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩñõũàèìòùæijœðŋþ",
	/* Cyrillic */
	"АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
	"абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",

	NULL
};
286 zoff99 2
/* Maps one upper-case UTF-8 character to its lower-case counterpart; built
 * from the upperlower pairs by linguistics_init() and queried by
 * linguistics_casefold(). */
287     static GHashTable *casefold_hash;
288    
/* One backtracking point recorded by linguistics_compare(). */
struct special_pos
{
	/* Row of the special table for the letter found at s1. */
	char **variants;
	/* Index of the next variant in that row to try. */
	int n;
	/* Where the special letter starts in the first string. */
	char *s1;
	/* The matching position in the second string. */
	char *s2;
};
295    
296 zoff99 27 static char**
297 zoff99 2 linguistics_get_special(char *str, char *end)
298     {
299 zoff99 31 char buf[11];
300 zoff99 2 int len;
301 zoff99 27 if (!end)
302 zoff99 31 {
303 zoff99 27 end = g_utf8_find_next_char(str, NULL);
304 zoff99 31 }
305 zoff99 27 len = end - str + 1;
306     g_strlcpy(buf, str, len > 10 ? 10 : len);
307     return g_hash_table_lookup(special_hash, buf);
308 zoff99 2 }
309    
310     /*
311     * @brief Prepare an utf-8 string for case insensitive comparison.
312     * @param in String to prepeare.
313     * @return String prepared for case insensitive search. Result shoud be g_free()d after use.
314     */
315     char*
316     linguistics_casefold(char *in)
317     {
318 zoff99 27 int len = strlen(in);
319     char *src = in;
320 zoff99 31 //char *ret=g_new(char,len+1);
321     char *ret=g_new(char,len+20); // try to fix strange BUG
322 zoff99 27 char *dest = ret;
323 zoff99 2 char buf[10];
324 zoff99 31
325     // string end
326     ret[19] = '\0';
327     // fprintf(stderr, "xxxsssssssssssss\n");
328    
329     while (*src && ((dest - ret) < len))
330 zoff99 27 {
331     if (*src >= 'A' && *src <= 'Z')
332     {
333     *dest++ = *src++ - 'A' + 'a';
334     }
335     else if (!(*src & 128))
336     {
337     *dest++ = *src++;
338     }
339     else
340     {
341 zoff99 2 int charlen;
342     char *tmp, *folded;
343 zoff99 27 tmp = g_utf8_find_next_char(src, NULL);
344     charlen = tmp - src + 1;
345     g_strlcpy(buf, src, charlen > 10 ? 10 : charlen);
346     folded = g_hash_table_lookup(casefold_hash, buf);
347 zoff99 15
348 zoff99 27 if (folded)
349 zoff99 15 {
350 zoff99 27 while (*folded && dest - ret < len)
351 zoff99 31 {
352 zoff99 27 *dest++ = *folded++;
353 zoff99 31 }
354 zoff99 27 src = tmp;
355 zoff99 15 }
356     else
357     {
358 zoff99 27 while (src < tmp && dest - ret < len)
359 zoff99 31 {
360 zoff99 27 *dest++ = *src++;
361 zoff99 31 }
362 zoff99 2 }
363     }
364     }
365 zoff99 31
366 zoff99 27 *dest = 0;
367     if (*src)
368 zoff99 31 {
369 zoff99 27 dbg(
370     0,
371     "Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",
372     in, ret);
373 zoff99 31 }
374    
375 zoff99 2 return ret;
376     }
377    
/*
 * Run the complete preparation chain on a string: case folding, removal of
 * the characters dropped by linguistics_remove_all_specials(), then
 * linguistics_expand_special() in mode 1.
 *
 * in:         string to prepare; NULL yields NULL.
 * free_input: if non-zero, in is g_free()d before returning.
 * Returns the result of the last step that produced a string (or NULL if
 * case folding returned NULL).
 */
char* linguistics_fold_and_prepare_complete(char *in, int free_input)
{
	char *result;
	char *step;

	if (!in)
	{
		return NULL;
	}

	result = linguistics_casefold(in);
	if (result != NULL)
	{
		step = linguistics_remove_all_specials(result);
		if (step != NULL)
		{
			g_free(result);
			result = step;
		}

		/* A NULL result here means there was nothing to expand; keep the
		 * string from the previous step. */
		step = linguistics_expand_special(result, 1);
		if (step != NULL)
		{
			g_free(result);
			result = step;
		}
	}

	if (free_input)
	{
		/* in is known to be non-NULL at this point */
		g_free(in);
	}

	return result;
}
416    
417    
418    
419    
/*
 * Verify that "in" points to valid "modified UTF-8" data and cut the string
 * off in place at the first character that is not.
 *
 * Accepted characters are one byte (0xxxxxxx), two bytes (110xxxxx 10xxxxxx)
 * or three bytes (1110xxxx 10xxxxxx 10xxxxxx).  A start byte of the form
 * 10xxxxxx or 1111xxxx is rejected (1111 is valid for normal UTF-8, but not
 * for the modified UTF-8 checked here), and so is a sequence whose
 * continuation bytes are missing or malformed.
 *
 * The string is always truncated at the first byte of the offending
 * character, so no partial sequence is left behind.
 *
 * returns: "in" itself (no need to free, the original input is just
 *          truncated), or NULL if in is NULL.
 */
char* linguistics_check_utf8_string(char* in)
{
	char *cursor = in;

	if (in == NULL)
	{
		return NULL;
	}

	while (*cursor != '\0')
	{
		char *char_start = cursor;
		/* Read as unsigned so bytes >= 0x80 are classified correctly even
		 * where plain char is signed. */
		unsigned int lead = (unsigned char) *cursor++;
		int continuation_bytes;

		// Switch on the high four bits.
		switch (lead >> 4)
		{
			case 0x08:
			case 0x09:
			case 0x0a:
			case 0x0b:
			case 0x0f:
				// Bit pattern 10xx or 1111: illegal start byte.
				*char_start = '\0';
				return in;
			case 0x0c:
			case 0x0d:
				// Bit pattern 110x: one additional byte.
				continuation_bytes = 1;
				break;
			case 0x0e:
				// Bit pattern 1110: two additional bytes.
				continuation_bytes = 2;
				break;
			default:
				// Bit pattern 0xxx: no extra bytes.
				continuation_bytes = 0;
				break;
		}

		while (continuation_bytes-- > 0)
		{
			/* A NUL terminator fails this test too, so the scan never runs
			 * past the end of the string. */
			if (((unsigned char) *cursor & 0xc0) != 0x80)
			{
				*char_start = '\0';
				return in;
			}
			cursor++;
		}
	}

	return in;
}
496    
497    
498    
499     /*
500     *
501     * @brief find match anywhere in str (only complete match, not partial!)
502     * both strings should have beend folded (and specials removed) before calling this function
503     *
504     * @return =0 on match =1 on not matched
505     *
506     */
507    
508     int linguistics_compare_anywhere(char *str, char *match)
509     {
510     char *match_1;
511     char *match_next;
512     char *next;
513     char *next_char;
514     int found = 1;
515     gunichar match_1_unichar;
516    
517     if ((str == NULL)||(match == NULL))
518     {
519     return found;
520     }
521    
522     match_1 = g_strdup(match);
523     next = g_utf8_find_next_char(match_1, NULL);
524     if (next == NULL)
525     {
526     g_free(match_1);
527     return found;
528     }
529    
530     *next = '\0'; // cut off after first utf-8 char
531    
532     //dbg(0, "match=%s match_1=%s", match, match_1);
533    
534     match_1_unichar = g_utf8_get_char(match_1);
535    
536     match_next = g_utf8_strchr(str, -1, match_1_unichar);
537     while (match_next)
538     {
539    
540     //dbg(0, "cmp1: match=%s match_next=%s", match, match_next);
541    
542     // see if the utf-8 chars match
543     if (!strncmp(match, match_next, strlen(match)))
544     {
545     found = 0;
546     break;
547     }
548     match_next = g_utf8_strchr(g_utf8_find_next_char(match_next, NULL), -1, match_1_unichar);
549    
550     //dbg(0, "cmp2: match=%s match_next=%s", match, match_next);
551     }
552    
553     g_free(match_1);
554    
555     return found;
556     }
557    
558    
559 zoff99 2 /**
560     * @brief Compare two strings using special characters expansion.
561     *
562     * @param str first string to compare, special characters are expanded.
563     * @param match second string to compare, special characters are not expanded.
564     * @param partial if = 1 then str string may be shorter than match string, in which case the rest from str isn't analysed.
565     * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp().
566     */
567    
568 zoff99 27 int linguistics_compare(char *str, char *match, int partial)
569 zoff99 2 {
570 zoff99 31
571     if ((str == NULL)||(match == NULL))
572     {
573     return 1;
574     }
575    
576 zoff99 27 char *s1 = str, *s2 = match;
577 zoff99 2 char **sp;
578 zoff99 27 int ret = 0;
579 zoff99 2 int got_match;
580 zoff99 27 GList *l = NULL;
581     while (*s1 && *s2)
582     {
583 zoff99 2 int j;
584     struct special_pos *spp;
585     char *utf_boundary, *tmp;
586     /* Skip all matching chars */
587 zoff99 27 for (j = 0; s1[j] && s1[j] == s2[j]; j++)
588 zoff99 31 {
589 zoff99 27 ;
590 zoff99 31 }
591    
592 zoff99 27 if (!s2[j] && (partial || !s1[j]))
593     {
594 zoff99 2 /* MATCH! */
595 zoff99 27 ret = 0;
596 zoff99 2 break;
597     }
598 zoff99 31
599 zoff99 2 /* Find beginning of first mismatching utf-8 encoded char */
600 zoff99 27 utf_boundary = s1;
601     while (*(tmp = g_utf8_find_next_char(utf_boundary, NULL)))
602     {
603     if (tmp > s1 + j)
604 zoff99 31 {
605 zoff99 2 break;
606 zoff99 31 }
607 zoff99 27 utf_boundary = tmp;
608 zoff99 2 }
609 zoff99 31
610 zoff99 2 /* Push first mismatching char to the list if it's a special char */
611 zoff99 27 sp = linguistics_get_special(utf_boundary, tmp);
612 zoff99 15
613 zoff99 27 if (sp)
614 zoff99 15 {
615 zoff99 2 spp=g_new(struct special_pos,1);
616 zoff99 27 spp->variants = sp;
617     spp->n = 1;
618     spp->s1 = utf_boundary;
619     spp->s2 = s2 + (utf_boundary - s1);
620     l = g_list_prepend(l, spp);
621     }
622 zoff99 2
623     /* Try to find a match using special char variants from the list */
624 zoff99 27 got_match = 0;
625     while (l && !got_match)
626     {
627     spp = l->data;
628     s1 = spp->s1;
629     s2 = spp->s2;
630     while (spp->n < 3 && !got_match)
631     {
632     char *s = spp->variants[(spp->n)++];
633 zoff99 2 int len;
634 zoff99 27 if (!s)
635 zoff99 2 break;
636 zoff99 27 len = strlen(s);
637     if (!strncmp(s, s2, len))
638     {
639     s2 += len;
640     s1 += strlen(spp->variants[0]);
641     got_match = 1;
642 zoff99 2 break;
643     }
644     }
645 zoff99 27 if (spp->n >= 3 || !spp->variants[spp->n])
646     {
647 zoff99 2 /* No matches for current top list element, go to the closest special char towards beginning of the string */
648     g_free(spp);
649 zoff99 27 l = g_list_delete_link(l, l);
650 zoff99 2 }
651     }
652 zoff99 31
653 zoff99 27 if (!got_match)
654     {
655 zoff99 2 /* NO MATCH
656     * FIXME: If we're going to use this function to sort a string list alphabetically we should use
657     * utf-aware comparison here.
658     */
659 zoff99 27 ret = 1;
660 zoff99 2 break;
661     }
662     }
663 zoff99 31
664 zoff99 27 while (l)
665     {
666 zoff99 2 g_free(l->data);
667 zoff99 27 l = g_list_delete_link(l, l);
668 zoff99 2 }
669     return ret;
670     }
671    
672     char *
673     linguistics_expand_special(char *str, int mode)
674     {
675 zoff99 27 char *in = str;
676     char *out, *ret;
677     int found = 0;
678 zoff99 2
679 zoff99 15 if (!str)
680     {
681     return NULL;
682     }
683    
684 zoff99 27 ret = g_strdup(str);
685     out = ret;
686 zoff99 15
687     if (!mode)
688     {
689 zoff99 2 return ret;
690 zoff99 15 }
691 zoff99 2
692     while (*in)
693     {
694 zoff99 27 char *next = g_utf8_find_next_char(in, NULL);
695     int i, len = next - in;
696     int match = 0;
697 zoff99 2 if (len > 1)
698     {
699 zoff99 27 for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
700 zoff99 2 {
701 zoff99 27 const char *search = special[i][0];
702     if (!strncmp(in, search, len))
703 zoff99 2 {
704 zoff99 27 const char *replace = special[i][mode];
705 zoff99 2 if (replace)
706     {
707 zoff99 27 int replace_len = strlen(replace);
708 zoff99 2
709 zoff99 15 if (replace_len > len)
710 zoff99 2 {
711 zoff99 27 fprintf(
712     stderr,
713     "* ERROR !! ERROR !! found %s %s %d %s %d\n",
714     in, search, len, replace, replace_len);
715 zoff99 2 }
716 zoff99 15 dbg_assert(replace_len <= len);
717     if (replace_len > len)
718     {
719 zoff99 27 out += len;
720     match = 0;
721 zoff99 15 break;
722     }
723 zoff99 2 else
724     {
725 zoff99 15 // fprintf(stderr," GOOD !! GOOD !! found %s %s %d %s %d\n",in,search,len,replace,replace_len);
726 zoff99 2 strcpy(out, replace);
727 zoff99 27 out += replace_len;
728     match = 1;
729 zoff99 2 break;
730     }
731     }
732     }
733     }
734     }
735    
736     if (match)
737     {
738 zoff99 27 found = 1;
739     in = next;
740 zoff99 2 }
741     else
742     {
743     while (len-- > 0)
744     {
745 zoff99 27 *out++ = *in++;
746 zoff99 2 }
747     }
748     }
749 zoff99 27 *out++ = '\0';
750 zoff99 2 if (!found)
751     {
752 zoff99 15 if (ret)
753     {
754     g_free(ret);
755     }
756 zoff99 27 ret = NULL;
757 zoff99 2 }
758     return ret;
759     }
760    
/*
 * Return a copy of str with every space character (' ') removed.
 *
 * str: input string; NULL yields NULL.
 * Returns a newly allocated string, to be g_free()d by the caller.
 */
char *linguistics_remove_all_spaces(char *str)
{
	const char *p;
	char *ret;
	char *out;

	if (str == NULL)
	{
		return NULL;
	}

	ret = g_strdup(str);
	out = ret;

	/* The byte 0x20 never occurs inside a multi-byte UTF-8 sequence, so the
	 * string can be filtered byte by byte without splitting a character. */
	for (p = str; *p; p++)
	{
		if (*p != ' ')
		{
			*out++ = *p;
		}
	}
	*out = '\0';

	return ret;
}
787    
// special characters that linguistics_remove_all_specials() strips
// (some of them are multi-byte in UTF-8)
static const char *remove_those = " _-.—,;:*#?=%&$§!@~()[]{}'`´^°|<>\\/\n\r\t\"\'";

/*
 * Tell whether the UTF-8 character of "len" bytes starting at p is listed in
 * remove_those.  Whole characters are compared, so the multi-byte entries of
 * the list are recognised as well.
 */
static int linguistics_is_removable_char(const char *p, size_t len)
{
	const char *entry = remove_those;

	while (*entry)
	{
		const char *entry_end = g_utf8_find_next_char(entry, NULL);

		if ((size_t) (entry_end - entry) == len && !memcmp(entry, p, len))
		{
			return 1;
		}
		entry = entry_end;
	}
	return 0;
}

/*
 * Return a copy of str without any of the characters listed in remove_those.
 *
 * str: input string; NULL yields NULL.
 * Returns a newly allocated string, to be g_free()d by the caller.
 */
char *linguistics_remove_all_specials(char *str)
{
	const char *p;
	char *ret;
	char *out;

	if (str == NULL)
	{
		return NULL;
	}

	/* The result can only be shorter than the input, so a copy of the input
	 * is large enough to build it in. */
	ret = g_strdup(str);
	out = ret;
	p = str;
	while (*p)
	{
		const char *next = g_utf8_find_next_char(p, NULL);
		size_t len = next - p;

		if (!linguistics_is_removable_char(p, len))
		{
			memcpy(out, p, len);
			out += len;
		}
		p = next;
	}
	*out = '\0';

	return ret;
}
840    
841    
/*
 * Split off the next word with strtok().
 *
 * str is handed to strtok() unchanged, so strtok()'s rules apply: pass the
 * string to start tokenising it (it is modified in place) and NULL to
 * continue with the string from the previous call.
 * Returns the token found, or NULL when there is none.
 */
char *
linguistics_next_word(char *str)
{
	return strtok(str, " -/()\"\',.;_[]{}\\");
}
854    
/*
 * Compare str case-insensitively against a fixed list of words
 * ("str", "str.", "strasse", "weg").
 *
 * Returns 0 if str equals one of them, 1 otherwise.
 */
int linguistics_search(char *str)
{
	static const char *const words[] = { "str", "str.", "strasse", "weg" };
	size_t i;

	for (i = 0; i < sizeof(words) / sizeof(words[0]); i++)
	{
		if (g_strcasecmp(str, words[i]) == 0)
		{
			return 0;
		}
	}
	return 1;
}
867    
868     /**
869     * @brief Copy one utf8 encoded char to newly allocated buffer.
870     *
871     * @param s pointer to the beginning of the char.
872     * @return newly allocated nul-terminated string containing one utf8 encoded character.
873     */
874 zoff99 27 static char *linguistics_dup_utf8_char(const char *s)
875 zoff99 2 {
876     char *ret, *next;
877 zoff99 27 next = g_utf8_find_next_char(s, NULL);
878 zoff99 2 ret=g_new(char, next-s+1);
879 zoff99 27 g_strlcpy(ret, s, next - s + 1);
880 zoff99 2 return ret;
881     }
882    
883 zoff99 27 void linguistics_init(void)
884 zoff99 2 {
885     int i;
886 zoff99 27 special_hash = g_hash_table_new(g_str_hash, g_str_equal);
887     casefold_hash = g_hash_table_new(g_str_hash, g_str_equal);
888 zoff99 2
889 zoff99 27 for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
890 zoff99 15 {
891 zoff99 27 g_hash_table_insert(special_hash, (gpointer) special[i][0], special[i]);
892 zoff99 15 }
893 zoff99 2
894 zoff99 27 for (i = 0; upperlower[i]; i += 2)
895 zoff99 15 {
896 zoff99 27 int j, k;
897     for (j = 0, k = 0; upperlower[i][j] && upperlower[i + 1][k];)
898 zoff99 15 {
899 zoff99 27 char *s1 = linguistics_dup_utf8_char(upperlower[i] + j);
900     char *s2 = linguistics_dup_utf8_char(upperlower[i + 1] + k);
901     g_hash_table_insert(casefold_hash, s1, s2);
902     j += strlen(s1);
903     k += strlen(s2);
904 zoff99 2 }
905     }
906     }
907    

   
Visit the ZANavi Wiki