… | |
… | |
228 | {"ð","d","dh"}, |
228 | {"ð","d","dh"}, |
229 | {"ŋ","n","ng"}, |
229 | {"ŋ","n","ng"}, |
230 | {"þ","t","th"}, |
230 | {"þ","t","th"}, |
231 | |
231 | |
232 | /* Cyrillic capital */ |
232 | /* Cyrillic capital */ |
|
|
233 | |
233 | {"Ё","Е"}, |
234 | {"Ё","Е"}, |
234 | {"Й","И"}, |
235 | {"Й","И"}, |
235 | {"І","I"}, |
236 | {"І","I"}, |
236 | {"Ї","I"}, |
237 | {"Ї","I"}, |
237 | {"Ў","У"}, |
238 | {"Ў","У"}, |
238 | {"Є","Е","Э"}, |
239 | {"Є","Е","Э"}, |
239 | {"Ґ","Г"}, |
240 | {"Ґ","Г"}, |
240 | {"Ѓ","Г"}, |
241 | {"Ѓ","Г"}, |
241 | {"Ђ","Д"}, |
242 | {"Ђ","Д"}, |
242 | {"Ќ","К"}, |
243 | {"Ќ","К"}, |
243 | {"Љ","Л","ЛЬ"}, |
244 | //{"Љ","Л","ЛЬ"}, |
244 | {"Њ","Н","НЬ"}, |
245 | //{"Њ","Н","НЬ"}, |
245 | {"Џ","Ц"}, |
246 | {"Џ","Ц"}, |
246 | |
247 | |
247 | /* Cyrillic small */ |
248 | /* Cyrillic small */ |
|
|
249 | |
248 | {"ё","е"}, |
250 | {"ё","е"}, |
249 | {"й","и"}, |
251 | {"й","и"}, |
250 | {"і","i"}, |
252 | {"і","i"}, |
251 | {"ї","i"}, |
253 | {"ї","i"}, |
252 | {"ў","у"}, |
254 | {"ў","у"}, |
253 | {"є","е","э"}, |
255 | //{"є","е","э"}, |
254 | {"ґ","г"}, |
256 | {"ґ","г"}, |
255 | {"ѓ","г"}, |
257 | {"ѓ","г"}, |
256 | {"ђ","д"}, |
258 | {"ђ","д"}, |
257 | {"ќ","к"}, |
259 | {"ќ","к"}, |
258 | {"љ","л","ль"}, |
260 | //{"љ","л","ль"}, |
259 | {"њ","н","нь"}, |
261 | //{"њ","н","нь"}, |
260 | {"џ","ц"}, |
262 | {"џ","ц"}, |
261 | |
263 | |
262 | }; |
264 | }; |
|
|
265 | |
263 | static GHashTable *special_hash; |
266 | static GHashTable *special_hash; |
264 | |
267 | |
265 | /* Array of strings for case conversion |
268 | /* Array of strings for case conversion |
266 | * Even elements of array are strings of upper-case letters |
269 | * Even elements of array are strings of upper-case letters |
267 | * Odd elements of array are strings of lower-case letters, in the order corresponding to the directly preceding even element. |
270 | * Odd elements of array are strings of lower-case letters, in the order corresponding to the directly preceding even element. |
… | |
… | |
325 | char *tmp, *folded; |
328 | char *tmp, *folded; |
326 | tmp=g_utf8_find_next_char(src,NULL); |
329 | tmp=g_utf8_find_next_char(src,NULL); |
327 | charlen=tmp-src+1; |
330 | charlen=tmp-src+1; |
328 | g_strlcpy(buf,src,charlen>10?10:charlen); |
331 | g_strlcpy(buf,src,charlen>10?10:charlen); |
329 | folded=g_hash_table_lookup(casefold_hash,buf); |
332 | folded=g_hash_table_lookup(casefold_hash,buf); |
|
|
333 | |
330 | if(folded) { |
334 | if(folded) |
|
|
335 | { |
331 | while(*folded && dest-ret<len) |
336 | while(*folded && dest-ret<len) |
332 | *dest++=*folded++; |
337 | *dest++=*folded++; |
333 | src=tmp; |
338 | src=tmp; |
|
|
339 | } |
334 | } else { |
340 | else |
|
|
341 | { |
335 | while(src<tmp && dest-ret<len) |
342 | while(src<tmp && dest-ret<len) |
336 | *dest++=*src++; |
343 | *dest++=*src++; |
337 | } |
344 | } |
338 | } |
345 | } |
339 | } |
346 | } |
… | |
… | |
378 | break; |
385 | break; |
379 | utf_boundary=tmp; |
386 | utf_boundary=tmp; |
380 | } |
387 | } |
381 | /* Push first mismatching char to the list if it's a special char */ |
388 | /* Push first mismatching char to the list if it's a special char */ |
382 | sp=linguistics_get_special(utf_boundary,tmp); |
389 | sp=linguistics_get_special(utf_boundary,tmp); |
|
|
390 | |
383 | if(sp){ |
391 | if(sp) |
|
|
392 | { |
384 | spp=g_new(struct special_pos,1); |
393 | spp=g_new(struct special_pos,1); |
385 | spp->variants=sp; |
394 | spp->variants=sp; |
386 | spp->n=1; |
395 | spp->n=1; |
387 | spp->s1=utf_boundary; |
396 | spp->s1=utf_boundary; |
388 | spp->s2=s2+(utf_boundary-s1); |
397 | spp->s2=s2+(utf_boundary-s1); |
… | |
… | |
435 | linguistics_expand_special(char *str, int mode) |
444 | linguistics_expand_special(char *str, int mode) |
436 | { |
445 | { |
437 | char *in=str; |
446 | char *in=str; |
438 | char *out,*ret; |
447 | char *out,*ret; |
439 | int found=0; |
448 | int found=0; |
|
|
449 | |
|
|
450 | if (!str) |
|
|
451 | { |
|
|
452 | return NULL; |
|
|
453 | } |
|
|
454 | |
440 | out=ret=g_strdup(str); |
455 | ret=g_strdup(str); |
|
|
456 | out=ret; |
441 | |
457 | |
442 | if (!mode) |
458 | if (!mode) |
|
|
459 | { |
443 | return ret; |
460 | return ret; |
|
|
461 | } |
444 | |
462 | |
445 | while (*in) |
463 | while (*in) |
446 | { |
464 | { |
447 | char *next=g_utf8_find_next_char(in, NULL); |
465 | char *next=g_utf8_find_next_char(in, NULL); |
448 | int i,len=next-in; |
466 | int i,len=next-in; |
… | |
… | |
457 | const char *replace=special[i][mode]; |
475 | const char *replace=special[i][mode]; |
458 | if (replace) |
476 | if (replace) |
459 | { |
477 | { |
460 | int replace_len=strlen(replace); |
478 | int replace_len=strlen(replace); |
461 | |
479 | |
462 | // dbg_assert(replace_len <= len); |
|
|
463 | if (replace_len <= len) |
480 | if (replace_len > len) |
464 | { |
481 | { |
465 | dbg(0,"found %s %s %d %s %d\n",in,search,len,replace,replace_len); |
482 | fprintf(stderr,"* ERROR !! ERROR !! found %s %s %d %s %d\n",in,search,len,replace,replace_len); |
|
|
483 | } |
|
|
484 | dbg_assert(replace_len <= len); |
|
|
485 | if (replace_len > len) |
|
|
486 | { |
|
|
487 | out+=len; |
|
|
488 | match=0; |
|
|
489 | break; |
466 | } |
490 | } |
467 | else |
491 | else |
468 | { |
492 | { |
|
|
493 | // fprintf(stderr," GOOD !! GOOD !! found %s %s %d %s %d\n",in,search,len,replace,replace_len); |
469 | strcpy(out, replace); |
494 | strcpy(out, replace); |
470 | out+=replace_len; |
495 | out+=replace_len; |
471 | match=1; |
496 | match=1; |
472 | break; |
497 | break; |
473 | } |
498 | } |
… | |
… | |
490 | } |
515 | } |
491 | } |
516 | } |
492 | *out++='\0'; |
517 | *out++='\0'; |
493 | if (!found) |
518 | if (!found) |
494 | { |
519 | { |
|
|
520 | if (ret) |
|
|
521 | { |
495 | g_free(ret); |
522 | g_free(ret); |
|
|
523 | } |
496 | ret=NULL; |
524 | ret=NULL; |
497 | } |
525 | } |
498 | return ret; |
526 | return ret; |
499 | } |
527 | } |
500 | |
528 | |
501 | char * |
529 | char * |
502 | linguistics_next_word(char *str) |
530 | linguistics_next_word(char *str) |
503 | { |
531 | { |
|
|
532 | char* ret=strtok(str, " -/()\"\',.;_[]{}\\"); |
|
|
533 | return ret; |
|
|
534 | |
504 | int len=strcspn(str, " -/()"); |
535 | // int len=strcspn(str, " -/()"); |
505 | if (!str[len] || !str[len+1]) |
536 | // if (!str[len] || !str[len+1]) |
506 | return NULL; |
537 | // return NULL; |
507 | return str+len+1; |
538 | // return str+len+1; |
|
|
539 | |
508 | } |
540 | } |
509 | |
541 | |
510 | int |
542 | int |
511 | linguistics_search(char *str) |
543 | linguistics_search(char *str) |
512 | { |
544 | { |
… | |
… | |
543 | int i; |
575 | int i; |
544 | special_hash=g_hash_table_new(g_str_hash, g_str_equal); |
576 | special_hash=g_hash_table_new(g_str_hash, g_str_equal); |
545 | casefold_hash=g_hash_table_new(g_str_hash, g_str_equal); |
577 | casefold_hash=g_hash_table_new(g_str_hash, g_str_equal); |
546 | |
578 | |
547 | for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++) |
579 | for (i = 0 ; i < sizeof(special)/sizeof(special[0]); i++) |
|
|
580 | { |
548 | g_hash_table_insert(special_hash,(gpointer)special[i][0],special[i]); |
581 | g_hash_table_insert(special_hash,(gpointer)special[i][0],special[i]); |
|
|
582 | } |
549 | |
583 | |
550 | for (i = 0 ; upperlower[i]; i+=2) { |
584 | for (i = 0 ; upperlower[i]; i+=2) |
|
|
585 | { |
551 | int j,k; |
586 | int j,k; |
552 | for(j=0,k=0;upperlower[i][j] && upperlower[i+1][k];) { |
587 | for(j=0,k=0;upperlower[i][j] && upperlower[i+1][k];) |
|
|
588 | { |
553 | char *s1=linguistics_dup_utf8_char(upperlower[i]+j); |
589 | char *s1=linguistics_dup_utf8_char(upperlower[i]+j); |
554 | char *s2=linguistics_dup_utf8_char(upperlower[i+1]+k); |
590 | char *s2=linguistics_dup_utf8_char(upperlower[i+1]+k); |
555 | g_hash_table_insert(casefold_hash,s1,s2); |
591 | g_hash_table_insert(casefold_hash,s1,s2); |
556 | j+=strlen(s1); |
592 | j+=strlen(s1); |
557 | k+=strlen(s2); |
593 | k+=strlen(s2); |