/[zanavi_public1]/navit/navit/linguistics.c
ZANavi

Diff of /navit/navit/linguistics.c

Parent Directory Parent Directory | Revision Log Revision Log | View Patch Patch

Revision 30 Revision 31
109{ "Ì", "I" }, 109{ "Ì", "I" },
110{ "Ò", "O" }, 110{ "Ò", "O" },
111{ "Ù", "U" }, 111{ "Ù", "U" },
112/* ligatures */ 112/* ligatures */
113{ "Æ", "A", "AE" }, 113{ "Æ", "A", "AE" },
114{ "IJ", "IJ" }, 114//{ "IJ", "IJ" },
115{ "Œ", "O", "OE" }, 115{ "Œ", "O", "OE" },
116/* special letters */ 116/* special letters */
117{ "Ð", "D", "DH" }, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */ 117{ "Ð", "D", "DH" }, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
118{ "Ŋ", "N", "NG" }, 118{ "Ŋ", "N", "NG" },
119{ "Þ", "T", "TH" }, 119{ "Þ", "T", "TH" },
220{ "ì", "i" }, 220{ "ì", "i" },
221{ "ò", "o" }, 221{ "ò", "o" },
222{ "ù", "u" }, 222{ "ù", "u" },
223/* ligatures */ 223/* ligatures */
224{ "æ", "a", "ae" }, 224{ "æ", "a", "ae" },
225{ "ij", "ij" }, 225//{ "ij", "ij" },
226{ "œ", "o", "oe" }, 226{ "œ", "o", "oe" },
227{ "ß", "s", "ss" }, 227{ "ß", "s", "ss" },
228/* special letters */ 228/* special letters */
229{ "ð", "d", "dh" }, 229{ "ð", "d", "dh" },
230{ "ŋ", "n", "ng" }, 230{ "ŋ", "n", "ng" },
261 //{"љ","л","ль"}, 261 //{"љ","л","ль"},
262 //{"њ","н","нь"}, 262 //{"њ","н","нь"},
263 { "џ", "ц" }, 263 { "џ", "ц" },
264 264
265}; 265};
266
266 267
267static GHashTable *special_hash; 268static GHashTable *special_hash;
268 269
269/* Array of strings for case conversion 270/* Array of strings for case conversion
270 * Even elements of array are strings of upper-case letters 271 * Even elements of array are strings of upper-case letters
293}; 294};
294 295
295static char** 296static char**
296linguistics_get_special(char *str, char *end) 297linguistics_get_special(char *str, char *end)
297{ 298{
298 char buf[10]; 299 char buf[11];
299 int len; 300 int len;
300 if (!end) 301 if (!end)
302 {
301 end = g_utf8_find_next_char(str, NULL); 303 end = g_utf8_find_next_char(str, NULL);
304 }
302 len = end - str + 1; 305 len = end - str + 1;
303 g_strlcpy(buf, str, len > 10 ? 10 : len); 306 g_strlcpy(buf, str, len > 10 ? 10 : len);
304 return g_hash_table_lookup(special_hash, buf); 307 return g_hash_table_lookup(special_hash, buf);
305} 308}
306 309
312char* 315char*
313linguistics_casefold(char *in) 316linguistics_casefold(char *in)
314{ 317{
315 int len = strlen(in); 318 int len = strlen(in);
316 char *src = in; 319 char *src = in;
317 char *ret=g_new(char,len+1); 320 //char *ret=g_new(char,len+1);
321 char *ret=g_new(char,len+20); // try to fix strange BUG
318 char *dest = ret; 322 char *dest = ret;
319 char buf[10]; 323 char buf[10];
324
325 // string end
326 ret[19] = '\0';
327 // fprintf(stderr, "xxxsssssssssssss\n");
328
320 while (*src && dest - ret < len) 329 while (*src && ((dest - ret) < len))
321 { 330 {
322 if (*src >= 'A' && *src <= 'Z') 331 if (*src >= 'A' && *src <= 'Z')
323 { 332 {
324 *dest++ = *src++ - 'A' + 'a'; 333 *dest++ = *src++ - 'A' + 'a';
325 } 334 }
337 folded = g_hash_table_lookup(casefold_hash, buf); 346 folded = g_hash_table_lookup(casefold_hash, buf);
338 347
339 if (folded) 348 if (folded)
340 { 349 {
341 while (*folded && dest - ret < len) 350 while (*folded && dest - ret < len)
351 {
342 *dest++ = *folded++; 352 *dest++ = *folded++;
353 }
343 src = tmp; 354 src = tmp;
344 } 355 }
345 else 356 else
346 { 357 {
347 while (src < tmp && dest - ret < len) 358 while (src < tmp && dest - ret < len)
359 {
348 *dest++ = *src++; 360 *dest++ = *src++;
361 }
349 } 362 }
350 } 363 }
351 } 364 }
365
352 *dest = 0; 366 *dest = 0;
353 if (*src) 367 if (*src)
368 {
354 dbg( 369 dbg(
355 0, 370 0,
356 "Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n", 371 "Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",
357 in, ret); 372 in, ret);
373 }
374
358 return ret; 375 return ret;
359} 376}
377
378char* linguistics_fold_and_prepare_complete(char *in, int free_input)
379{
380 char *tmp1;
381 char *tmp2;
382
383 if (in == NULL)
384 {
385 return NULL;
386 }
387
388 tmp1 = linguistics_casefold(in);
389 if (tmp1)
390 {
391 tmp2 = linguistics_remove_all_specials(tmp1);
392 if (tmp2)
393 {
394 g_free(tmp1);
395 tmp1 = tmp2;
396 }
397 tmp2 = linguistics_expand_special(tmp1, 1);
398 if (tmp2)
399 {
400 g_free(tmp1);
401 tmp1 = tmp2;
402 }
403 }
404
405 if (free_input)
406 {
407 if (in)
408 {
409 g_free(in);
410 in = NULL;
411 }
412 }
413
414 return tmp1;
415}
416
417
418
419
/*
 * Verify that "in" points to valid "modified UTF-8" data.
 *
 * The string is scanned character by character. Lead bytes 0xxxxxxx stand
 * alone, 110xxxxx needs one continuation byte (10xxxxxx) and 1110xxxx needs
 * two. Lead bytes 10xxxxxx and 1111xxxx are rejected (1111 is legal in
 * regular UTF-8, but not in the modified UTF-8 expected here).
 *
 * At the first illegal or incomplete sequence the string is truncated IN
 * PLACE at the lead byte of that sequence, so only complete, well-formed
 * characters remain.
 *
 * in: NUL-terminated string to check (modified in place); may be NULL.
 *
 * Returns "in" itself (nothing new is allocated, nothing needs to be freed),
 * or NULL when "in" is NULL.
 */
char* linguistics_check_utf8_string(char* in)
{
	char *cur = in;

	if (in == NULL)
	{
		return NULL;
	}

	while (*cur != '\0')
	{
		/* go through unsigned char: a plain (possibly signed) char would
		 * sign-extend bytes >= 0x80 and the switch below would never match */
		unsigned char lead = (unsigned char) *cur;
		int extra;
		int i;

		// Switch on the high four bits.
		switch (lead >> 4)
		{
			case 0x0c:
			case 0x0d:
				// Bit pattern 110x, so there is one additional byte.
				extra = 1;
				break;
			case 0x0e:
				// Bit pattern 1110, so there are two additional bytes.
				extra = 2;
				break;
			case 0x08:
			case 0x09:
			case 0x0a:
			case 0x0b:
			case 0x0f:
				// Bit pattern 10xx or 1111: illegal start byte, cut the string here.
				*cur = '\0';
				return in;
			default:
				// Bit pattern 0xxx. No need for any extra bytes.
				extra = 0;
				break;
		}

		/* A NUL byte fails the continuation test too, so this loop never
		 * reads past the end of the string. */
		for (i = 1; i <= extra; i++)
		{
			if (((unsigned char) cur[i] & 0xc0) != 0x80)
			{
				// illegal continuation byte: drop the whole broken character
				*cur = '\0';
				return in;
			}
		}

		cur += extra + 1;
	}

	return in;
}
496
497
498
/*
 *
 * @brief find match anywhere in str (only complete match, not partial!)
 * both strings should have been folded (and specials removed) before calling this function
 *
 * The whole of "match" has to occur as one contiguous run somewhere in "str".
 * UTF-8 is self-synchronising, so for well-formed input a byte-wise substring
 * search can only hit on character boundaries; no per-call copy of "match"
 * and no manual scan for its first character are needed.
 * An empty "match" counts as found (strstr() semantics).
 *
 * @param str string to search in (NULL -> not matched)
 * @param match string to look for (NULL -> not matched)
 *
 * @return =0 on match =1 on not matched
 *
 */

int linguistics_compare_anywhere(char *str, char *match)
{
	if ((str == NULL)||(match == NULL))
	{
		return 1;
	}

	return (strstr(str, match) != NULL) ? 0 : 1;
}
557
360 558
361/** 559/**
362 * @brief Compare two strings using special characters expansion. 560 * @brief Compare two strings using special characters expansion.
363 * 561 *
364 * @param str first string to compare, special characters are expanded. 562 * @param str first string to compare, special characters are expanded.
367 * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp(). 565 * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp().
368 */ 566 */
369 567
370int linguistics_compare(char *str, char *match, int partial) 568int linguistics_compare(char *str, char *match, int partial)
371{ 569{
570
571 if ((str == NULL)||(match == NULL))
572 {
573 return 1;
574 }
575
372 char *s1 = str, *s2 = match; 576 char *s1 = str, *s2 = match;
373 char **sp; 577 char **sp;
374 int ret = 0; 578 int ret = 0;
375 int got_match; 579 int got_match;
376 GList *l = NULL; 580 GList *l = NULL;
379 int j; 583 int j;
380 struct special_pos *spp; 584 struct special_pos *spp;
381 char *utf_boundary, *tmp; 585 char *utf_boundary, *tmp;
382 /* Skip all matching chars */ 586 /* Skip all matching chars */
383 for (j = 0; s1[j] && s1[j] == s2[j]; j++) 587 for (j = 0; s1[j] && s1[j] == s2[j]; j++)
588 {
384 ; 589 ;
590 }
591
385 if (!s2[j] && (partial || !s1[j])) 592 if (!s2[j] && (partial || !s1[j]))
386 { 593 {
387 /* MATCH! */ 594 /* MATCH! */
388 ret = 0; 595 ret = 0;
389 break; 596 break;
390 } 597 }
598
391 /* Find beginning of first mismatching utf-8 encoded char */ 599 /* Find beginning of first mismatching utf-8 encoded char */
392 utf_boundary = s1; 600 utf_boundary = s1;
393 while (*(tmp = g_utf8_find_next_char(utf_boundary, NULL))) 601 while (*(tmp = g_utf8_find_next_char(utf_boundary, NULL)))
394 { 602 {
395 if (tmp > s1 + j) 603 if (tmp > s1 + j)
604 {
396 break; 605 break;
606 }
397 utf_boundary = tmp; 607 utf_boundary = tmp;
398 } 608 }
609
399 /* Push first mismatching char to the list if it's a special char */ 610 /* Push first mismatching char to the list if it's a special char */
400 sp = linguistics_get_special(utf_boundary, tmp); 611 sp = linguistics_get_special(utf_boundary, tmp);
401 612
402 if (sp) 613 if (sp)
403 { 614 {
436 /* No matches for current top list element, go to the closest special char towards beginning of the string */ 647 /* No matches for current top list element, go to the closest special char towards beginning of the string */
437 g_free(spp); 648 g_free(spp);
438 l = g_list_delete_link(l, l); 649 l = g_list_delete_link(l, l);
439 } 650 }
440 } 651 }
652
441 if (!got_match) 653 if (!got_match)
442 { 654 {
443 /* NO MATCH 655 /* NO MATCH
444 * FIXME: If we're going to use this function to sort a string list alphabetically we should use 656 * FIXME: If we're going to use this function to sort a string list alphabetically we should use
445 * utf-aware comparison here. 657 * utf-aware comparison here.
446 */ 658 */
447 ret = 1; 659 ret = 1;
448 break; 660 break;
449 } 661 }
450 } 662 }
663
451 while (l) 664 while (l)
452 { 665 {
453 g_free(l->data); 666 g_free(l->data);
454 l = g_list_delete_link(l, l); 667 l = g_list_delete_link(l, l);
455 } 668 }
543 ret = NULL; 756 ret = NULL;
544 } 757 }
545 return ret; 758 return ret;
546} 759}
547 760
/*
 * Return a copy of "str" with every single-byte space character (' ') removed.
 * Characters are stepped over with g_utf8_find_next_char(); multi-byte
 * characters are always copied unchanged.
 *
 * str: NUL-terminated input string, left untouched; NULL yields NULL.
 *
 * Returns a new string obtained from g_strdup() (to be released with
 * g_free()), or NULL when "str" is NULL.
 */
char *linguistics_remove_all_spaces(char *str)
{
	char *p;
	char *next = NULL;
	int len = 0;
	char *ret;
	char *out;

	if (str == NULL)
	{
		/* g_strdup(NULL) returns NULL; bail out before dereferencing str */
		return NULL;
	}

	/* the result can never be longer than the input, so a copy is big enough */
	ret = g_strdup(str);
	out = ret;

	for (p = str; *p; p = next)
	{
		next = g_utf8_find_next_char(p, NULL);
		len = next - p;
		if ((len > 1)||(p[0] != ' '))
		{
			/* plain byte copy, the terminator is written once at the end */
			memcpy(out, p, len);
			out = out + len;
		}
	}
	*out = '\0';

	return ret;
}
787
// special characters (UTF-8 encoded, some entries are longer than one byte)
static const char *remove_those = " _-.—,;:*#?=%&$§!@~()[]{}'`´^°|<>\\/\n\r\t\"\'";

/*
 * Return 1 when the character of "len" bytes starting at "ch" is one of the
 * characters listed in remove_those, 0 otherwise.
 * The list is walked character by character (not byte by byte), so its
 * multi-byte entries are compared as a whole.
 */
static int linguistics_is_removable_special(const char *ch, int len)
{
	const char *entry = remove_those;
	const char *entry_next;

	while (*entry)
	{
		entry_next = g_utf8_find_next_char(entry, NULL);
		if (((entry_next - entry) == len) && (memcmp(entry, ch, len) == 0))
		{
			return 1;
		}
		entry = entry_next;
	}

	return 0;
}

/*
 * Return a copy of "str" with every character listed in remove_those removed.
 *
 * Each character of the input is compared against each character of the
 * list using its complete byte sequence. A byte-wise comparison restricted
 * to 1-byte input characters can never match the multi-byte list entries
 * (such as the em dash or the degree sign), so those would stay in the text.
 *
 * str: NUL-terminated input string, left untouched; NULL yields NULL.
 *
 * Returns a new string obtained from g_strdup() (to be released with
 * g_free()), or NULL when "str" is NULL.
 */
char *linguistics_remove_all_specials(char *str)
{
	char *p;
	char *next = NULL;
	int len = 0;
	char *ret;
	char *out;

	if (str == NULL)
	{
		/* g_strdup(NULL) returns NULL; bail out before dereferencing str */
		return NULL;
	}

	/* the result can never be longer than the input, so a copy is big enough */
	ret = g_strdup(str);
	out = ret;

	for (p = str; *p; p = next)
	{
		next = g_utf8_find_next_char(p, NULL);
		len = next - p;

		if (!linguistics_is_removable_special(p, len))
		{
			/* keep this character: plain byte copy, terminated at the end */
			memcpy(out, p, len);
			out = out + len;
		}
		// else: special found -> skip it
	}
	*out = '\0';

	return ret;
}
840
841
548char * 842char *
549linguistics_next_word(char *str) 843linguistics_next_word(char *str)
550{ 844{
551 char* ret = strtok(str, " -/()\"\',.;_[]{}\\"); 845 char* ret = strtok(str, " -/()\"\',.;_[]{}\\");
552 return ret; 846 return ret;

Legend:
Removed from v.30  
changed lines
  Added in v.31

   
Visit the ZANavi Wiki