… | |
… | |
109 | { "Ì", "I" }, |
109 | { "Ì", "I" }, |
110 | { "Ò", "O" }, |
110 | { "Ò", "O" }, |
111 | { "Ù", "U" }, |
111 | { "Ù", "U" }, |
112 | /* ligatures */ |
112 | /* ligatures */ |
113 | { "Æ", "A", "AE" }, |
113 | { "Æ", "A", "AE" }, |
114 | { "IJ", "IJ" }, |
114 | //{ "IJ", "IJ" }, |
115 | { "Œ", "O", "OE" }, |
115 | { "Œ", "O", "OE" }, |
116 | /* special letters */ |
116 | /* special letters */ |
117 | { "Ð", "D", "DH" }, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */ |
117 | { "Ð", "D", "DH" }, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */ |
118 | { "Ŋ", "N", "NG" }, |
118 | { "Ŋ", "N", "NG" }, |
119 | { "Þ", "T", "TH" }, |
119 | { "Þ", "T", "TH" }, |
… | |
… | |
220 | { "ì", "i" }, |
220 | { "ì", "i" }, |
221 | { "ò", "o" }, |
221 | { "ò", "o" }, |
222 | { "ù", "u" }, |
222 | { "ù", "u" }, |
223 | /* ligatures */ |
223 | /* ligatures */ |
224 | { "æ", "a", "ae" }, |
224 | { "æ", "a", "ae" }, |
225 | { "ij", "ij" }, |
225 | //{ "ij", "ij" }, |
226 | { "œ", "o", "oe" }, |
226 | { "œ", "o", "oe" }, |
227 | { "ß", "s", "ss" }, |
227 | { "ß", "s", "ss" }, |
228 | /* special letters */ |
228 | /* special letters */ |
229 | { "ð", "d", "dh" }, |
229 | { "ð", "d", "dh" }, |
230 | { "ŋ", "n", "ng" }, |
230 | { "ŋ", "n", "ng" }, |
… | |
… | |
261 | //{"љ","л","ль"}, |
261 | //{"љ","л","ль"}, |
262 | //{"њ","н","нь"}, |
262 | //{"њ","н","нь"}, |
263 | { "џ", "ц" }, |
263 | { "џ", "ц" }, |
264 | |
264 | |
265 | }; |
265 | }; |
|
|
266 | |
266 | |
267 | |
267 | static GHashTable *special_hash; |
268 | static GHashTable *special_hash; |
268 | |
269 | |
269 | /* Array of strings for case conversion |
270 | /* Array of strings for case conversion |
270 | * Even elements of array are strings of upper-case letters |
271 | * Even elements of array are strings of upper-case letters |
… | |
… | |
293 | }; |
294 | }; |
294 | |
295 | |
295 | static char** |
296 | static char** |
296 | linguistics_get_special(char *str, char *end) |
297 | linguistics_get_special(char *str, char *end) |
297 | { |
298 | { |
298 | char buf[10]; |
299 | char buf[11]; |
299 | int len; |
300 | int len; |
300 | if (!end) |
301 | if (!end) |
|
|
302 | { |
301 | end = g_utf8_find_next_char(str, NULL); |
303 | end = g_utf8_find_next_char(str, NULL); |
|
|
304 | } |
302 | len = end - str + 1; |
305 | len = end - str + 1; |
303 | g_strlcpy(buf, str, len > 10 ? 10 : len); |
306 | g_strlcpy(buf, str, len > 10 ? 10 : len); |
304 | return g_hash_table_lookup(special_hash, buf); |
307 | return g_hash_table_lookup(special_hash, buf); |
305 | } |
308 | } |
306 | |
309 | |
… | |
… | |
312 | char* |
315 | char* |
313 | linguistics_casefold(char *in) |
316 | linguistics_casefold(char *in) |
314 | { |
317 | { |
315 | int len = strlen(in); |
318 | int len = strlen(in); |
316 | char *src = in; |
319 | char *src = in; |
317 | char *ret=g_new(char,len+1); |
320 | //char *ret=g_new(char,len+1); |
|
|
321 | char *ret=g_new(char,len+20); // try to fix strange BUG |
318 | char *dest = ret; |
322 | char *dest = ret; |
319 | char buf[10]; |
323 | char buf[10]; |
|
|
324 | |
|
|
325 | // string end |
|
|
326 | ret[19] = '\0'; |
|
|
327 | // fprintf(stderr, "xxxsssssssssssss\n"); |
|
|
328 | |
320 | while (*src && dest - ret < len) |
329 | while (*src && ((dest - ret) < len)) |
321 | { |
330 | { |
322 | if (*src >= 'A' && *src <= 'Z') |
331 | if (*src >= 'A' && *src <= 'Z') |
323 | { |
332 | { |
324 | *dest++ = *src++ - 'A' + 'a'; |
333 | *dest++ = *src++ - 'A' + 'a'; |
325 | } |
334 | } |
… | |
… | |
337 | folded = g_hash_table_lookup(casefold_hash, buf); |
346 | folded = g_hash_table_lookup(casefold_hash, buf); |
338 | |
347 | |
339 | if (folded) |
348 | if (folded) |
340 | { |
349 | { |
341 | while (*folded && dest - ret < len) |
350 | while (*folded && dest - ret < len) |
|
|
351 | { |
342 | *dest++ = *folded++; |
352 | *dest++ = *folded++; |
|
|
353 | } |
343 | src = tmp; |
354 | src = tmp; |
344 | } |
355 | } |
345 | else |
356 | else |
346 | { |
357 | { |
347 | while (src < tmp && dest - ret < len) |
358 | while (src < tmp && dest - ret < len) |
|
|
359 | { |
348 | *dest++ = *src++; |
360 | *dest++ = *src++; |
|
|
361 | } |
349 | } |
362 | } |
350 | } |
363 | } |
351 | } |
364 | } |
|
|
365 | |
352 | *dest = 0; |
366 | *dest = 0; |
353 | if (*src) |
367 | if (*src) |
|
|
368 | { |
354 | dbg( |
369 | dbg( |
355 | 0, |
370 | 0, |
356 | "Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n", |
371 | "Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n", |
357 | in, ret); |
372 | in, ret); |
|
|
373 | } |
|
|
374 | |
358 | return ret; |
375 | return ret; |
359 | } |
376 | } |
|
|
377 | |
|
|
378 | char* linguistics_fold_and_prepare_complete(char *in, int free_input) |
|
|
379 | { |
|
|
380 | char *tmp1; |
|
|
381 | char *tmp2; |
|
|
382 | |
|
|
383 | if (in == NULL) |
|
|
384 | { |
|
|
385 | return NULL; |
|
|
386 | } |
|
|
387 | |
|
|
388 | tmp1 = linguistics_casefold(in); |
|
|
389 | if (tmp1) |
|
|
390 | { |
|
|
391 | tmp2 = linguistics_remove_all_specials(tmp1); |
|
|
392 | if (tmp2) |
|
|
393 | { |
|
|
394 | g_free(tmp1); |
|
|
395 | tmp1 = tmp2; |
|
|
396 | } |
|
|
397 | tmp2 = linguistics_expand_special(tmp1, 1); |
|
|
398 | if (tmp2) |
|
|
399 | { |
|
|
400 | g_free(tmp1); |
|
|
401 | tmp1 = tmp2; |
|
|
402 | } |
|
|
403 | } |
|
|
404 | |
|
|
405 | if (free_input) |
|
|
406 | { |
|
|
407 | if (in) |
|
|
408 | { |
|
|
409 | g_free(in); |
|
|
410 | in = NULL; |
|
|
411 | } |
|
|
412 | } |
|
|
413 | |
|
|
414 | return tmp1; |
|
|
415 | } |
|
|
416 | |
|
|
417 | |
|
|
418 | |
|
|
419 | |
|
|
/*
 * Verify that "in" points to valid "modified UTF-8" data and cut the
 * string at the first invalid sequence.
 *
 * Accepted sequences are 1 byte (0xxxxxxx), 2 bytes (110xxxxx 10xxxxxx)
 * and 3 bytes (1110xxxx 10xxxxxx 10xxxxxx). A 4-byte lead byte (1111xxxx)
 * is valid in normal UTF-8 but not in the modified UTF-8 used here, so it
 * is rejected like a stray continuation byte (10xxxxxx).
 *
 * in: NUL-terminated string, modified in place; may be NULL.
 *
 * returns: "in" itself (nothing new to free). If an invalid or incomplete
 *          sequence is found, the string is terminated at the first byte
 *          of that sequence, so only the valid prefix remains.
 *          NULL when "in" is NULL.
 */
char* linguistics_check_utf8_string(char* in)
{
	/* Bytes must be read unsigned: with a signed char, values >= 0x80
	 * sign-extend and the high-nibble dispatch below never matches. */
	unsigned char *cur = (unsigned char *) in;

	if (in == NULL)
	{
		return NULL;
	}

	while (*cur != '\0')
	{
		unsigned char *seq_start = cur;
		unsigned int lead = *cur++;
		int continuation_bytes;

		// Switch on the high four bits of the lead byte.
		switch (lead >> 4)
		{
			case 0x00:
			case 0x01:
			case 0x02:
			case 0x03:
			case 0x04:
			case 0x05:
			case 0x06:
			case 0x07:
				// Bit pattern 0xxx. No need for any extra bytes.
				continuation_bytes = 0;
				break;
			case 0x0c:
			case 0x0d:
				// Bit pattern 110x, so there is one additional byte.
				continuation_bytes = 1;
				break;
			case 0x0e:
				// Bit pattern 1110, so there are two additional bytes.
				continuation_bytes = 2;
				break;
			default:
				// Bit pattern 10xx or 1111: illegal start byte, cut here.
				*seq_start = '\0';
				return in;
		}

		while (continuation_bytes-- > 0)
		{
			/* A terminating NUL also fails this test, so a sequence that
			 * is cut short by the end of the string never reads past it. */
			if ((*cur & 0xc0) != 0x80)
			{
				// Illegal continuation byte: drop the whole sequence.
				*seq_start = '\0';
				return in;
			}
			cur++;
		}
	}

	return in;
}
|
|
496 | |
|
|
497 | |
|
|
498 | |
|
|
/*
 * @brief Find "match" anywhere inside "str" (the complete "match" must
 *        occur, not just a prefix of it).
 *
 * Both strings should have been folded (and specials removed) before
 * calling this function; the comparison itself is a plain byte-wise
 * substring search. For well-formed UTF-8 a byte-wise hit always starts
 * on a character boundary, so no per-character scanning is needed.
 *
 * @param str   string to search in; may be NULL
 * @param match string to search for; may be NULL
 *
 * @return =0 on match, =1 on not matched. A NULL argument counts as not
 *         matched. An empty "match" counts as matched (strstr() semantics).
 */

int linguistics_compare_anywhere(char *str, char *match)
{
	if ((str == NULL) || (match == NULL))
	{
		return 1;
	}

	/* No temporary copy of "match" is made, so an empty "match" cannot
	 * cause a write outside an allocated buffer. */
	return (strstr(str, match) != NULL) ? 0 : 1;
}
|
|
557 | |
360 | |
558 | |
361 | /** |
559 | /** |
362 | * @brief Compare two strings using special characters expansion. |
560 | * @brief Compare two strings using special characters expansion. |
363 | * |
561 | * |
364 | * @param str first string to compare, special characters are expanded. |
562 | * @param str first string to compare, special characters are expanded. |
… | |
… | |
367 | * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp(). |
565 | * @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp(). |
368 | */ |
566 | */ |
369 | |
567 | |
370 | int linguistics_compare(char *str, char *match, int partial) |
568 | int linguistics_compare(char *str, char *match, int partial) |
371 | { |
569 | { |
|
|
570 | |
|
|
571 | if ((str == NULL)||(match == NULL)) |
|
|
572 | { |
|
|
573 | return 1; |
|
|
574 | } |
|
|
575 | |
372 | char *s1 = str, *s2 = match; |
576 | char *s1 = str, *s2 = match; |
373 | char **sp; |
577 | char **sp; |
374 | int ret = 0; |
578 | int ret = 0; |
375 | int got_match; |
579 | int got_match; |
376 | GList *l = NULL; |
580 | GList *l = NULL; |
… | |
… | |
379 | int j; |
583 | int j; |
380 | struct special_pos *spp; |
584 | struct special_pos *spp; |
381 | char *utf_boundary, *tmp; |
585 | char *utf_boundary, *tmp; |
382 | /* Skip all matching chars */ |
586 | /* Skip all matching chars */ |
383 | for (j = 0; s1[j] && s1[j] == s2[j]; j++) |
587 | for (j = 0; s1[j] && s1[j] == s2[j]; j++) |
|
|
588 | { |
384 | ; |
589 | ; |
|
|
590 | } |
|
|
591 | |
385 | if (!s2[j] && (partial || !s1[j])) |
592 | if (!s2[j] && (partial || !s1[j])) |
386 | { |
593 | { |
387 | /* MATCH! */ |
594 | /* MATCH! */ |
388 | ret = 0; |
595 | ret = 0; |
389 | break; |
596 | break; |
390 | } |
597 | } |
|
|
598 | |
391 | /* Find beginning of first mismatching utf-8 encoded char */ |
599 | /* Find beginning of first mismatching utf-8 encoded char */ |
392 | utf_boundary = s1; |
600 | utf_boundary = s1; |
393 | while (*(tmp = g_utf8_find_next_char(utf_boundary, NULL))) |
601 | while (*(tmp = g_utf8_find_next_char(utf_boundary, NULL))) |
394 | { |
602 | { |
395 | if (tmp > s1 + j) |
603 | if (tmp > s1 + j) |
|
|
604 | { |
396 | break; |
605 | break; |
|
|
606 | } |
397 | utf_boundary = tmp; |
607 | utf_boundary = tmp; |
398 | } |
608 | } |
|
|
609 | |
399 | /* Push first mismatching char to the list if it's a special char */ |
610 | /* Push first mismatching char to the list if it's a special char */ |
400 | sp = linguistics_get_special(utf_boundary, tmp); |
611 | sp = linguistics_get_special(utf_boundary, tmp); |
401 | |
612 | |
402 | if (sp) |
613 | if (sp) |
403 | { |
614 | { |
… | |
… | |
436 | /* No matches for current top list element, go to the closest special char towards beginning of the string */ |
647 | /* No matches for current top list element, go to the closest special char towards beginning of the string */ |
437 | g_free(spp); |
648 | g_free(spp); |
438 | l = g_list_delete_link(l, l); |
649 | l = g_list_delete_link(l, l); |
439 | } |
650 | } |
440 | } |
651 | } |
|
|
652 | |
441 | if (!got_match) |
653 | if (!got_match) |
442 | { |
654 | { |
443 | /* NO MATCH |
655 | /* NO MATCH |
444 | * FIXME: If we're going to use this function to sort a string list alphabetically we should use |
656 | * FIXME: If we're going to use this function to sort a string list alphabetically we should use |
445 | * utf-aware comparison here. |
657 | * utf-aware comparison here. |
446 | */ |
658 | */ |
447 | ret = 1; |
659 | ret = 1; |
448 | break; |
660 | break; |
449 | } |
661 | } |
450 | } |
662 | } |
|
|
663 | |
451 | while (l) |
664 | while (l) |
452 | { |
665 | { |
453 | g_free(l->data); |
666 | g_free(l->data); |
454 | l = g_list_delete_link(l, l); |
667 | l = g_list_delete_link(l, l); |
455 | } |
668 | } |
… | |
… | |
543 | ret = NULL; |
756 | ret = NULL; |
544 | } |
757 | } |
545 | return ret; |
758 | return ret; |
546 | } |
759 | } |
547 | |
760 | |
|
|
/*
 * Return a newly allocated copy of str with all ASCII space characters
 * (' ') removed.
 *
 * The input is walked in steps of g_utf8_find_next_char(), so multi-byte
 * characters are always copied as a unit. A space that is directly
 * followed by stray UTF-8 continuation bytes is treated as part of a
 * multi-byte unit and is therefore kept.
 *
 * str: NUL-terminated input, not modified; may be NULL.
 *
 * returns: string allocated with g_strdup() (release with g_free()),
 *          or NULL when str is NULL.
 */
char *linguistics_remove_all_spaces(char *str)
{
	char *p;
	char *next = NULL;
	int len = 0;
	char *ret;
	char *out;

	if (str == NULL)
	{
		return NULL;
	}

	/* The result can never be longer than the input, so a duplicate of
	 * the input is a large enough output buffer. */
	ret = g_strdup(str);
	out = ret;
	p = str;
	while (*p)
	{
		next = g_utf8_find_next_char(p, NULL);
		len = next - p;
		if ((len > 1) || (p[0] != ' '))
		{
			memcpy(out, p, len);
			out = out + len;
		}
		p = next;
	}
	*out = '\0';

	return ret;
}
|
|
787 | |
|
|
// Characters stripped by linguistics_remove_all_specials().
// NOTE(review): the entries that are multi-byte in UTF-8 are never matched,
// because only single-byte characters of the input are compared against
// this list - confirm whether those were meant to be removed as well.
static const char *remove_those = " _-.—,;:*#?=%&$§!@~()[]{}'`´^°|<>\\/\n\r\t\"\'";

/*
 * Return a newly allocated copy of str with every single-byte character
 * that is listed in remove_those removed.
 *
 * The input is walked in steps of g_utf8_find_next_char(); characters
 * longer than one byte are always copied unchanged.
 *
 * str: NUL-terminated input, not modified; may be NULL.
 *
 * returns: string allocated with g_strdup() (release with g_free()),
 *          or NULL when str is NULL.
 */
char *linguistics_remove_all_specials(char *str)
{
	char *p;
	char *next = NULL;
	int len = 0;
	char *ret;
	char *out;

	if (str == NULL)
	{
		return NULL;
	}

	/* The result can never be longer than the input, so a duplicate of
	 * the input is a large enough output buffer. */
	ret = g_strdup(str);
	out = ret;
	p = str;
	while (*p)
	{
		next = g_utf8_find_next_char(p, NULL);
		len = next - p;

		/* p[0] is never NUL inside the loop, so strchr() cannot hit the
		 * terminator of remove_those. */
		if ((len > 1) || (strchr(remove_those, p[0]) == NULL))
		{
			memcpy(out, p, len);
			out = out + len;
		}
		// else: special found -> skip it

		p = next;
	}
	*out = '\0';

	return ret;
}
|
|
840 | |
|
|
841 | |
548 | char * |
842 | char * |
549 | linguistics_next_word(char *str) |
843 | linguistics_next_word(char *str) |
550 | { |
844 | { |
551 | char* ret = strtok(str, " -/()\"\',.;_[]{}\\"); |
845 | char* ret = strtok(str, " -/()\"\',.;_[]{}\\"); |
552 | return ret; |
846 | return ret; |