/[zanavi_public1]/navit/navit/linguistics.c
ZANavi

Contents of /navit/navit/linguistics.c

Parent Directory Parent Directory | Revision Log Revision Log


Revision 15 - (hide annotations) (download)
Mon Nov 21 20:54:48 2011 UTC (12 years, 4 months ago) by zoff99
File MIME type: text/plain
File size: 11173 byte(s)
better search, make search interuptable, new translations
1 zoff99 2 #include <string.h>
2     #include <stdio.h>
3     #include <glib.h>
4     #include "debug.h"
5     #include "linguistics.h"
6    
/*
 * Replacement table for special (non-ASCII) letters.
 *
 * Each row is { special character, basic replacement, alternative expansion }.
 * Rows with only two initialisers leave the third slot NULL.
 * Column 0 is the key used by linguistics_init() when it fills special_hash;
 * columns 1 and 2 are the replacements selected by mode 1 / mode 2 of
 * linguistics_expand_special() and the variants tried by linguistics_compare().
 *
 * Every key must be exactly one (multi-byte) UTF-8 character: the lookups
 * work on a single character at a time, so a multi-letter ASCII key such as
 * "IJ" can never be found.
 */
static const char *special[][3]={
	/* Capital Diacritics */
	/* ¨ Diaeresis */
	{"Ä","A","AE"},
	{"Ë","E"},
	{"Ï","I"},
	{"Ö","O","OE"},
	{"Ü","U","UE"},
	{"Ÿ","Y"},
	/* ˝ Double Acute Accent */
	{"Ő","O","Ö"},
	{"Ű","U","Ü"},
	/* ´ Acute Accent */
	{"Á","A"},
	{"Ć","C"},
	{"É","E"},
	{"Í","I"},
	{"Ĺ","L"},
	{"Ń","N"},
	{"Ó","O"},
	{"Ŕ","R"},
	{"Ś","S"},
	{"Ú","U"},
	{"Ý","Y"},
	{"Ź","Z"},
	/* ˛ Ogonek (nosinė) */
	{"Ą","A"},
	{"Ę","E"},
	{"Į","I"},
	{"Ų","U"},
	/* ˙ Dot */
	{"Ċ","C"},
	{"Ė","E"},
	{"Ġ","G"},
	{"İ","I"},
	{"Ŀ","L"},
	{"Ż","Z"},
	/* – Stroke */
	{"Đ","D","DJ"}, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
	{"Ħ","H"},
	{"Ł","L"},
	{"Ŧ","T"},
	/* ˚ Ring */
	{"Å","A","AA"},
	{"Ů","U"},
	/* ˇ Caron (haček, paukščiukas) */
	{"Č","C"},
	{"Ď","D"},
	{"Ě","E"},
	{"Ľ","L"},
	{"Ň","N"},
	{"Ř","R"},
	{"Š","S"},
	{"Ť","T"},
	{"Ž","Z"},
	/* / Slash */
	{"Ø","O","OE"},
	/* ¯ Macron */
	{"Ā","A","AA"},
	{"Ē","E","EE"},
	{"Ī","I","II"},
	{"Ō","O","OO"},
	{"Ū","U","UU"},
	/* ˘ Brevis */
	{"Ă","A"},
	{"Ĕ","E"},
	{"Ğ","G"},
	{"Ĭ","I"},
	{"Ŏ","O"},
	{"Ŭ","U"},
	/* ^ Circumflex */
	{"Â","A"},
	{"Ĉ","C"},
	{"Ê","E"},
	{"Ĝ","G"},
	{"Ĥ","H"},
	{"Î","I"},
	{"Ĵ","J"},
	{"Ô","O"},
	{"Ŝ","S"},
	{"Û","U"},
	{"Ŵ","W"},
	{"Ŷ","Y"},
	/* ¸ Cedilla */
	{"Ç","C"},
	{"Ģ","G","GJ"},
	{"Ķ","K","KJ"},
	{"Ļ","L","LJ"},
	{"Ņ","N","NJ"},
	{"Ŗ","R"},
	{"Ş","S"},
	{"Ţ","T"},
	/* ~ Tilde */
	{"Ã","A"},
	{"Ĩ","I"},
	{"Ñ","N"},
	{"Õ","O"},
	{"Ũ","U"},
	/* ` Grave */
	{"À","A"},
	{"È","E"},
	{"Ì","I"},
	{"Ò","O"},
	{"Ù","U"},
	/* ligatures */
	{"Æ","A","AE"},
	{"Ĳ","IJ"}, /* single character U+0132, not the two ASCII letters */
	{"Œ","O","OE"},
	/* special letters */
	{"Ð","D","DH"}, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
	{"Ŋ","N","NG"},
	{"Þ","T","TH"},
	/* Small Diacritics */
	/* ¨ Diaeresis */
	{"ä","a","ae"},
	{"ë","e"},
	{"ï","i"},
	{"ö","o","oe"},
	{"ü","u","ue"},
	{"ÿ","y"},
	/* ˝ Double Acute Accent */
	{"ő","o","ö"},
	{"ű","u","ü"},
	/* ´ Acute Accent */
	{"á","a"},
	{"ć","c"},
	{"é","e"},
	{"í","i"},
	{"ĺ","l"},
	{"ń","n"},
	{"ó","o"},
	{"ŕ","r"},
	{"ś","s"},
	{"ú","u"},
	{"ý","y"},
	{"ź","z"},
	/* ˛ Ogonek (nosinė) */
	{"ą","a"},
	{"ę","e"},
	{"į","i"},
	{"ų","u"},
	/* ˙ Dot (and dotless i) */
	{"ċ","c"},
	{"ė","e"},
	{"ġ","g"},
	{"ı","i"},
	{"ŀ","l"},
	{"ż","z"},
	/* – Stroke */
	{"đ","d","dj"},
	{"ħ","h"},
	{"ł","l"},
	{"ŧ","t"},
	/* ˚ Ring */
	{"å","a","aa"},
	{"ů","u"},
	/* ˇ Caron (haček, paukščiukas) */
	{"č","c"},
	{"ď","d"},
	{"ě","e"},
	{"ľ","l"},
	{"ň","n"},
	{"ř","r"},
	{"š","s"},
	{"ť","t"},
	{"ž","z"},
	/* / Slash */
	{"ø","o","oe"},
	/* ¯ Macron */
	{"ā","a","aa"},
	{"ē","e","ee"},
	{"ī","i","ii"},
	{"ō","o","oo"},
	{"ū","u","uu"},
	/* ˘ Brevis */
	{"ă","a"},
	{"ĕ","e"},
	{"ğ","g"},
	{"ĭ","i"},
	{"ŏ","o"},
	{"ŭ","u"},
	/* ^ Circumflex */
	{"â","a"},
	{"ĉ","c"},
	{"ê","e"},
	{"ĝ","g"},
	{"ĥ","h"},
	{"î","i"},
	{"ĵ","j"},
	{"ô","o"},
	{"ŝ","s"},
	{"û","u"},
	{"ŵ","w"},
	{"ŷ","y"},
	/* ¸ Cedilla */
	{"ç","c"},
	{"ģ","g","gj"},
	{"ķ","k","kj"},
	{"ļ","l","lj"},
	{"ņ","n","nj"},
	{"ŗ","r"},
	{"ş","s"},
	{"ţ","t"},
	/* ~ Tilde */
	{"ã","a"},
	{"ĩ","i"},
	{"õ","o"},
	{"ñ","n"},
	{"ũ","u"},
	/* ` Grave */
	{"à","a"},
	{"è","e"},
	{"ì","i"},
	{"ò","o"},
	{"ù","u"},
	/* ligatures */
	{"æ","a","ae"},
	{"ĳ","ij"}, /* single character U+0133, not the two ASCII letters */
	{"œ","o","oe"},
	{"ß","s","ss"},
	/* special letters */
	{"ð","d","dh"},
	{"ŋ","n","ng"},
	{"þ","t","th"},

	/* Cyrillic capital */

	{"Ё","Е"},
	{"Й","И"},
	{"І","I"},
	{"Ї","I"},
	{"Ў","У"},
	{"Є","Е","Э"},
	{"Ґ","Г"},
	{"Ѓ","Г"},
	{"Ђ","Д"},
	{"Ќ","К"},
	//{"Љ","Л","ЛЬ"},
	//{"Њ","Н","НЬ"},
	{"Џ","Ц"},

	/* Cyrillic small */

	{"ё","е"},
	{"й","и"},
	{"і","i"},
	{"ї","i"},
	{"ў","у"},
	//{"є","е","э"},
	{"ґ","г"},
	{"ѓ","г"},
	{"ђ","д"},
	{"ќ","к"},
	//{"љ","л","ль"},
	//{"њ","н","нь"},
	{"џ","ц"},

};
265 zoff99 15
/* Hash table filled by linguistics_init(): key is special[i][0], value is the row special[i]. */
266 zoff99 2 static GHashTable *special_hash;
267    
/* Array of strings for case conversion.
 * Even elements of the array are strings of upper-case letters.
 * Odd elements are strings of lower-case letters, in the order corresponding
 * to the directly preceding even element: the n-th character of an odd element
 * is the lower-case form of the n-th character of the element before it, so
 * both strings of a pair must list their letters in exactly the same order.
 * The last element of the array must be NULL.
 */
static const char *upperlower[]={
	/*Latin diacritics*/
	"ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆĲŒÐŊÞ",
	"äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩñõũàèìòùæĳœðŋþ",
	/*Cyrillic*/
	"АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
	"абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",

	NULL
};
283    
/* Hash table filled by linguistics_init(): key is one upper-case character from upperlower[], value is the lower-case character at the same position. */
284     static GHashTable *casefold_hash;
285    
286    
/*
 * Backtracking record used by linguistics_compare(): one special character
 * found in the first string together with the state needed to retry it.
 */
struct special_pos {
	/* Table row for the character: [0] is the character, [1] and [2] its replacements. */
	char **variants;
	/* Index of the next variant to try. */
	int n;
	/* Position of the special character in the first string. */
	char *s1;
	/* Corresponding position in the second string. */
	char *s2;
};
292    
293    
294    
295     static char**
296     linguistics_get_special(char *str, char *end)
297     {
298     char buf[10];
299     int len;
300     if(!end)
301     end=g_utf8_find_next_char(str,NULL);
302     len=end-str+1;
303     g_strlcpy(buf,str,len>10?10:len);
304     return g_hash_table_lookup(special_hash,buf);
305     }
306    
307    
308     /*
309     * @brief Prepare an utf-8 string for case insensitive comparison.
310     * @param in String to prepeare.
311     * @return String prepared for case insensitive search. Result shoud be g_free()d after use.
312     */
313     char*
314     linguistics_casefold(char *in)
315     {
316     int len=strlen(in);
317     char *src=in;
318     char *ret=g_new(char,len+1);
319     char *dest=ret;
320     char buf[10];
321     while(*src && dest-ret<len){
322     if(*src>='A' && *src<='Z') {
323     *dest++=*src++ - 'A' + 'a';
324     } else if (!(*src&128)) {
325     *dest++=*src++;
326     } else {
327     int charlen;
328     char *tmp, *folded;
329     tmp=g_utf8_find_next_char(src,NULL);
330     charlen=tmp-src+1;
331     g_strlcpy(buf,src,charlen>10?10:charlen);
332     folded=g_hash_table_lookup(casefold_hash,buf);
333 zoff99 15
334     if(folded)
335     {
336 zoff99 2 while(*folded && dest-ret<len)
337     *dest++=*folded++;
338     src=tmp;
339 zoff99 15 }
340     else
341     {
342 zoff99 2 while(src<tmp && dest-ret<len)
343     *dest++=*src++;
344     }
345     }
346     }
347     *dest=0;
348     if(*src)
349     dbg(0,"Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",in,ret);
350     return ret;
351     }
352    
353     /**
354     * @brief Compare two strings using special characters expansion.
355     *
356     * @param str first string to compare, special characters are expanded.
357     * @param match second string to compare, special characters are not expanded.
358     * @param partial if = 1 then str string may be shorter than match string, in which case the rest from str isn't analysed.
359     * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp().
360     */
361    
362     int
363     linguistics_compare(char *str, char *match, int partial)
364     {
365     char *s1=str, *s2=match;
366     char **sp;
367     int ret=0;
368     int got_match;
369     GList *l=NULL;
370     while (*s1 && *s2) {
371     int j;
372     struct special_pos *spp;
373     char *utf_boundary, *tmp;
374     /* Skip all matching chars */
375     for(j=0;s1[j] && s1[j]==s2[j];j++);
376     if(!s2[j] && (partial || !s1[j])) {
377     /* MATCH! */
378     ret=0;
379     break;
380     }
381     /* Find beginning of first mismatching utf-8 encoded char */
382     utf_boundary=s1;
383     while(*(tmp=g_utf8_find_next_char(utf_boundary, NULL))) {
384     if(tmp>s1+j)
385     break;
386     utf_boundary=tmp;
387     }
388     /* Push first mismatching char to the list if it's a special char */
389     sp=linguistics_get_special(utf_boundary,tmp);
390 zoff99 15
391     if(sp)
392     {
393 zoff99 2 spp=g_new(struct special_pos,1);
394     spp->variants=sp;
395     spp->n=1;
396     spp->s1=utf_boundary;
397     spp->s2=s2+(utf_boundary-s1);
398     l=g_list_prepend(l,spp);
399     }
400    
401     /* Try to find a match using special char variants from the list */
402     got_match=0;
403     while(l && !got_match) {
404     spp=l->data;
405     s1=spp->s1;
406     s2=spp->s2;
407     while(spp->n<3 && !got_match) {
408     char *s=spp->variants[(spp->n)++];
409     int len;
410     if(!s)
411     break;
412     len=strlen(s);
413     if(!strncmp(s,s2,len)) {
414     s2+=len;
415     s1+=strlen(spp->variants[0]);
416     got_match=1;
417     break;
418     }
419     }
420     if(spp->n>=3 || !spp->variants[spp->n]) {
421     /* No matches for current top list element, go to the closest special char towards beginning of the string */
422     g_free(spp);
423     l=g_list_delete_link(l,l);
424     }
425     }
426     if(!got_match) {
427     /* NO MATCH
428     * FIXME: If we're going to use this function to sort a string list alphabetically we should use
429     * utf-aware comparison here.
430     */
431     ret=1;
432     break;
433     }
434     }
435     while(l) {
436     g_free(l->data);
437     l=g_list_delete_link(l,l);
438     }
439     return ret;
440     }
441    
442    
443     char *
444     linguistics_expand_special(char *str, int mode)
445     {
446     char *in=str;
447     char *out,*ret;
448     int found=0;
449    
450 zoff99 15 if (!str)
451     {
452     return NULL;
453     }
454    
455     ret=g_strdup(str);
456     out=ret;
457    
458     if (!mode)
459     {
460 zoff99 2 return ret;
461 zoff99 15 }
462 zoff99 2
463     while (*in)
464     {
465     char *next=g_utf8_find_next_char(in, NULL);
466     int i,len=next-in;
467     int match=0;
468     if (len > 1)
469     {
470     for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++)
471     {
472     const char *search=special[i][0];
473     if (!strncmp(in,search,len))
474     {
475     const char *replace=special[i][mode];
476     if (replace)
477     {
478     int replace_len=strlen(replace);
479    
480 zoff99 15 if (replace_len > len)
481 zoff99 2 {
482 zoff99 15 fprintf(stderr,"* ERROR !! ERROR !! found %s %s %d %s %d\n",in,search,len,replace,replace_len);
483 zoff99 2 }
484 zoff99 15 dbg_assert(replace_len <= len);
485     if (replace_len > len)
486     {
487     out+=len;
488     match=0;
489     break;
490     }
491 zoff99 2 else
492     {
493 zoff99 15 // fprintf(stderr," GOOD !! GOOD !! found %s %s %d %s %d\n",in,search,len,replace,replace_len);
494 zoff99 2 strcpy(out, replace);
495     out+=replace_len;
496     match=1;
497     break;
498     }
499     }
500     }
501     }
502     }
503    
504     if (match)
505     {
506     found=1;
507     in=next;
508     }
509     else
510     {
511     while (len-- > 0)
512     {
513     *out++=*in++;
514     }
515     }
516     }
517     *out++='\0';
518     if (!found)
519     {
520 zoff99 15 if (ret)
521     {
522     g_free(ret);
523     }
524 zoff99 2 ret=NULL;
525     }
526     return ret;
527     }
528    
/*
 * Return the next word of a string, using strtok().
 *
 * str: string to split, or NULL to continue with the string of the previous
 *      call (strtok() semantics). The string is modified: the delimiter
 *      ending the returned word is overwritten with a NUL byte.
 * Returns a pointer to the word, or NULL when no word is left.
 *
 * Because strtok() keeps its position in static storage this function is not
 * reentrant.
 */
char *
linguistics_next_word(char *str)
{
	static const char delimiters[]=" -/()\"\',.;_[]{}\\";

	return strtok(str, delimiters);
}
541    
/*
 * Check a word against a fixed list of street-name words.
 *
 * str: word to check; compared ASCII case-insensitively.
 * Returns 0 when str equals "str", "str.", "strasse" or "weg", 1 otherwise.
 * NOTE(review): presumably callers use 0 to skip such generic words when
 * searching - confirm against the callers.
 *
 * g_ascii_strcasecmp() is used instead of the deprecated, locale dependent
 * g_strcasecmp(); all words in the list are plain ASCII.
 */
int
linguistics_search(char *str)
{
	static const char *const skipped[]={
		"str",
		"str.",
		"strasse",
		"weg",
	};
	size_t i;

	for (i = 0 ; i < sizeof(skipped)/sizeof(skipped[0]); i++)
	{
		if (!g_ascii_strcasecmp(str,skipped[i]))
			return 0;
	}
	return 1;
}
555    
556     /**
557     * @brief Copy one utf8 encoded char to newly allocated buffer.
558     *
559     * @param s pointer to the beginning of the char.
560     * @return newly allocated nul-terminated string containing one utf8 encoded character.
561     */
562     static char
563     *linguistics_dup_utf8_char(const char *s)
564     {
565     char *ret, *next;
566     next=g_utf8_find_next_char(s,NULL);
567     ret=g_new(char, next-s+1);
568     g_strlcpy(ret,s,next-s+1);
569     return ret;
570     }
571    
572     void
573     linguistics_init(void)
574     {
575     int i;
576     special_hash=g_hash_table_new(g_str_hash, g_str_equal);
577     casefold_hash=g_hash_table_new(g_str_hash, g_str_equal);
578    
579     for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++)
580 zoff99 15 {
581 zoff99 2 g_hash_table_insert(special_hash,(gpointer)special[i][0],special[i]);
582 zoff99 15 }
583 zoff99 2
584 zoff99 15 for (i = 0 ; upperlower[i]; i+=2)
585     {
586 zoff99 2 int j,k;
587 zoff99 15 for(j=0,k=0;upperlower[i][j] && upperlower[i+1][k];)
588     {
589 zoff99 2 char *s1=linguistics_dup_utf8_char(upperlower[i]+j);
590     char *s2=linguistics_dup_utf8_char(upperlower[i+1]+k);
591     g_hash_table_insert(casefold_hash,s1,s2);
592     j+=strlen(s1);
593     k+=strlen(s2);
594     }
595     }
596     }
597    

   
Visit the ZANavi Wiki