navit/navit/linguistics.c

#include <string.h>
#include <stdio.h>
#include <glib.h>
#include "debug.h"
#include "linguistics.h"

/*
 * Table of special (non-ASCII) letters and their plain-text replacements.
 *
 * Each row holds up to three utf-8 strings:
 *   [0] the special letter itself; this is the lookup key,
 *   [1] first replacement variant,
 *   [2] optional second replacement variant.
 * Columns left out of an initializer are NULL, and the code reading this
 * table checks for that.
 *
 * linguistics_expand_special() asserts that a replacement is not longer
 * (in bytes) than the letter it replaces, so new rows have to respect that.
 */
static const char *special[][3] =
{
/* Capital Diacritics */
/* ¨ Diaeresis */
{ "Ä", "A", "AE" },
{ "Ë", "E" },
{ "Ï", "I" },
{ "Ö", "O", "OE" },
{ "Ü", "U", "UE" },
{ "Ÿ", "Y" },
/* ˝ Double Acute Accent */
{ "Ő", "O", "Ö" },
{ "Ű", "U", "Ü" },
/* ´ Acute Accent */
{ "Á", "A" },
{ "Ć", "C" },
{ "É", "E" },
{ "Í", "I" },
{ "Ĺ", "L" },
{ "Ń", "N" },
{ "Ó", "O" },
{ "Ŕ", "R" },
{ "Ś", "S" },
{ "Ú", "U" },
{ "Ý", "Y" },
{ "Ź", "Z" },
/* ˛ Ogonek (nosinė) */
{ "Ą", "A" },
{ "Ę", "E" },
{ "Į", "I" },
{ "Ų", "U" },
/* ˙ Dot */
{ "Ċ", "C" },
{ "Ė", "E" },
{ "Ġ", "G" },
{ "İ", "I" },
{ "Ŀ", "L" },
{ "Ż", "Z" },
/* – Stroke */
{ "Đ", "D", "DJ" }, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
{ "Ħ", "H" },
{ "Ł", "L" },
{ "Ŧ", "T" },
/* ˚ Ring */
{ "Å", "A", "AA" },
{ "Ů", "U" },
/* ˇ Caron (haček, paukščiukas) */
{ "Č", "C" },
{ "Ď", "D" },
{ "Ě", "E" },
{ "Ľ", "L" },
{ "Ň", "N" },
{ "Ř", "R" },
{ "Š", "S" },
{ "Ť", "T" },
{ "Ž", "Z" },
/* / Slash */
{ "Ø", "O", "OE" },
/* ¯ Macron */
{ "Ā", "A", "AA" },
{ "Ē", "E", "EE" },
{ "Ī", "I", "II" },
{ "Ō", "O", "OO" },
{ "Ū", "U", "UU" },
/* ˘ Brevis */
{ "Ă", "A" },
{ "Ĕ", "E" },
{ "Ğ", "G" },
{ "Ĭ", "I" },
{ "Ŏ", "O" },
{ "Ŭ", "U" },
/* ^ Circumflex */
{ "Â", "A" },
{ "Ĉ", "C" },
{ "Ê", "E" },
{ "Ĝ", "G" },
{ "Ĥ", "H" },
{ "Î", "I" },
{ "Ĵ", "J" },
{ "Ô", "O" },
{ "Ŝ", "S" },
{ "Û", "U" },
{ "Ŵ", "W" },
{ "Ŷ", "Y" },
/* ¸ Cedilla */
{ "Ç", "C" },
{ "Ģ", "G", "GJ" },
{ "Ķ", "K", "KJ" },
{ "Ļ", "L", "LJ" },
{ "Ņ", "N", "NJ" },
{ "Ŗ", "R" },
{ "Ş", "S" },
{ "Ţ", "T" },
/* ~ Tilde */
{ "Ã", "A" },
{ "Ĩ", "I" },
{ "Ñ", "N" },
{ "Õ", "O" },
{ "Ũ", "U" },
/* ` Grave */
{ "À", "A" },
{ "È", "E" },
{ "Ì", "I" },
{ "Ò", "O" },
{ "Ù", "U" },
/* ligatures */
{ "Æ", "A", "AE" },
{ "Ĳ", "IJ" },
{ "Œ", "O", "OE" },
/* special letters */
{ "Ð", "D", "DH" }, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
{ "Ŋ", "N", "NG" },
{ "Þ", "T", "TH" },
/* Small Diacritics */
/* ¨ Diaeresis */
{ "ä", "a", "ae" },
{ "ë", "e" },
{ "ï", "i" },
{ "ö", "o", "oe" },
{ "ü", "u", "ue" },
{ "ÿ", "y" },
/* ˝ Double Acute Accent */
{ "ő", "o", "ö" },
{ "ű", "u", "ü" },
/* ´ Acute Accent */
{ "á", "a" },
{ "ć", "c" },
{ "é", "e" },
{ "í", "i" },
{ "ĺ", "l" },
{ "ń", "n" },
{ "ó", "o" },
{ "ŕ", "r" },
{ "ś", "s" },
{ "ú", "u" },
{ "ý", "y" },
{ "ź", "z" },
/* ˛ Ogonek (nosinė) */
{ "ą", "a" },
{ "ę", "e" },
{ "į", "i" },
{ "ų", "u" },
/* ˙ Dot (and dotless i) */
{ "ċ", "c" },
{ "ė", "e" },
{ "ġ", "g" },
{ "ı", "i" },
{ "ŀ", "l" },
{ "ż", "z" },
/* – Stroke */
{ "đ", "d", "dj" },
{ "ħ", "h" },
{ "ł", "l" },
{ "ŧ", "t" },
/* ˚ Ring */
{ "å", "a", "aa" },
{ "ů", "u" },
/* ˇ Caron (haček, paukščiukas) */
{ "č", "c" },
{ "ď", "d" },
{ "ě", "e" },
{ "ľ", "l" },
{ "ň", "n" },
{ "ř", "r" },
{ "š", "s" },
{ "ť", "t" },
{ "ž", "z" },
/* / Slash */
{ "ø", "o", "oe" },
/* ¯ Macron */
{ "ā", "a", "aa" },
{ "ē", "e", "ee" },
{ "ī", "i", "ii" },
{ "ō", "o", "oo" },
{ "ū", "u", "uu" },
/* ˘ Brevis */
{ "ă", "a" },
{ "ĕ", "e" },
{ "ğ", "g" },
{ "ĭ", "i" },
{ "ŏ", "o" },
{ "ŭ", "u" },
/* ^ Circumflex */
{ "â", "a" },
{ "ĉ", "c" },
{ "ê", "e" },
{ "ĝ", "g" },
{ "ĥ", "h" },
{ "î", "i" },
{ "ĵ", "j" },
{ "ô", "o" },
{ "ŝ", "s" },
{ "û", "u" },
{ "ŵ", "w" },
{ "ŷ", "y" },
/* ¸ Cedilla */
{ "ç", "c" },
{ "ģ", "g", "gj" },
{ "ķ", "k", "kj" },
{ "ļ", "l", "lj" },
{ "ņ", "n", "nj" },
{ "ŗ", "r" },
{ "ş", "s" },
{ "ţ", "t" },
/* ~ Tilde */
{ "ã", "a" },
{ "ĩ", "i" },
{ "õ", "o" },
{ "ñ", "n" },
{ "ũ", "u" },
/* ` Grave */
{ "à", "a" },
{ "è", "e" },
{ "ì", "i" },
{ "ò", "o" },
{ "ù", "u" },
/* ligatures */
{ "æ", "a", "ae" },
{ "ĳ", "ij" },
{ "œ", "o", "oe" },
{ "ß", "s", "ss" },
/* special letters */
{ "ð", "d", "dh" },
{ "ŋ", "n", "ng" },
{ "þ", "t", "th" },

/* Cyrillic capital */

{ "Ё", "Е" },
{ "Й", "И" },
{ "І", "I" },
{ "Ї", "I" },
{ "Ў", "У" },
{ "Є", "Е", "Э" },
{ "Ґ", "Г" },
{ "Ѓ", "Г" },
{ "Ђ", "Д" },
{ "Ќ", "К" },
//{"Љ","Л","ЛЬ"},
                //{"Њ","Н","НЬ"},
                { "Џ", "Ц" },

                /* Cyrillic small */

                { "ё", "е" },
                { "й", "и" },
                { "і", "i" },
                { "ї", "i" },
                { "ў", "у" },
                //{"є","е","э"},
                { "ґ", "г" },
                { "ѓ", "г" },
                { "ђ", "д" },
                { "ќ", "к" },
                //{"љ","л","ль"},
                //{"њ","н","нь"},
                { "џ", "ц" },

};

static GHashTable *special_hash;

/* Array of strings for case conversion.
 * Even elements of the array are strings of upper-case letters.
 * Odd elements of the array are strings of lower-case letters, in the order
 * corresponding to the directly preceding even element: linguistics_init()
 * walks both strings in parallel and maps the n-th upper-case character to
 * the n-th lower-case character, so the two strings of a pair must list
 * their letters in exactly the same order.
 * Last element of the array should be NULL.
 */
static const char
                *upperlower[] =
                                {
                                                /*Latin diacritics*/
                                                "ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆĲŒÐŊÞ",
                                                "äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩñõũàèìòùæĳœðŋþ",
                                                /*Cyrillic*/
                                                "АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
                                                "абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",

                                                NULL };

static GHashTable *casefold_hash;

/* Backtracking record kept by linguistics_compare() for one special
 * character of the first string whose variants may still have to be tried. */
struct special_pos
{
        /* Row of the special[] table belonging to the character */
        char **variants;
        /* Index of the next variant in that row to try */
        int n;
        /* Position of the special character in the first string */
        char *s1;
        /* Position in the second string the variants are compared against */
        char *s2;
};

/*
 * Look up the special[] row for the utf-8 character starting at str.
 *
 * str: start of the character.
 * end: first byte after the character, or NULL to have it determined here.
 * Returns the value stored in special_hash for that character, or NULL if
 * the character is not in the table.
 */
static char**
linguistics_get_special(char *str, char *end)
{
        char key[10];
        int size;

        if (end == NULL)
                end = g_utf8_find_next_char(str, NULL);

        /* Bytes of the character plus the terminating nul, capped to the key buffer */
        size = end - str + 1;
        if (size > 10)
                size = 10;
        g_strlcpy(key, str, size);

        return g_hash_table_lookup(special_hash, key);
}

/**
 * @brief Prepare a utf-8 string for case insensitive comparison.
 *
 * ASCII capitals are lowered arithmetically; multi-byte characters are
 * replaced by their entry in casefold_hash if there is one and copied
 * unchanged otherwise. The result is never longer (in bytes) than the
 * input; if it would be, it is truncated and a debug message is logged.
 *
 * @param in String to prepare.
 * @return String prepared for case insensitive search. Result should be g_free()d after use.
 */
char*
linguistics_casefold(char *in)
{
        int capacity = strlen(in);
        char *result = g_new(char, capacity + 1);
        char *out = result;
        char *cur = in;
        char key[10];

        while (*cur && out - result < capacity)
        {
                char *next, *lower;
                int keysize;

                if (!(*cur & 128))
                {
                        /* Plain 7-bit ASCII: lower A-Z, pass everything else through */
                        char c = *cur++;
                        if (c >= 'A' && c <= 'Z')
                                c = c - 'A' + 'a';
                        *out++ = c;
                        continue;
                }

                /* Multi-byte character: isolate it and look up its folded form */
                next = g_utf8_find_next_char(cur, NULL);
                keysize = next - cur + 1;
                if (keysize > 10)
                        keysize = 10;
                g_strlcpy(key, cur, keysize);
                lower = g_hash_table_lookup(casefold_hash, key);

                if (!lower)
                {
                        /* Unknown character, copy its bytes as they are */
                        while (cur < next && out - result < capacity)
                                *out++ = *cur++;
                        continue;
                }

                for (; *lower && out - result < capacity; lower++)
                        *out++ = *lower;
                cur = next;
        }
        *out = 0;

        /* Input left over means the output buffer was exhausted first */
        if (*cur)
                dbg(0, "Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n", in, result);

        return result;
}

/**
 * @brief Compare two strings using special characters expansion.
 *
 * Wherever the strings differ at a special character of str, the variants
 * of that character from the special[] table are tried against match. If a
 * later part of the strings fails to match, the function backtracks to the
 * most recent special character that still has an untried variant.
 *
 * @param str first string to compare, special characters are expanded.
 * @param match second string to compare, special characters are not expanded.
 * @param partial if non-zero, match only has to be matched by the beginning of str;
 *        whatever follows in str isn't analysed. If zero, both strings have to be
 *        consumed completely.
 * @return  =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp().
 */

int linguistics_compare(char *str, char *match, int partial)
{
        char *s1 = str, *s2 = match;
        int ret = 1;
        GList *l = NULL;

        /* The loop is left only through an explicit match / no-match decision.
         * In particular, running out of one string is evaluated like any other
         * mismatch instead of silently counting as a match. */
        for (;;)
        {
                int j;
                int got_match = 0;
                struct special_pos *spp;

                /* Skip all matching chars */
                for (j = 0; s1[j] && s1[j] == s2[j]; j++)
                        ;
                if (!s2[j] && (partial || !s1[j]))
                {
                        /* MATCH! */
                        ret = 0;
                        break;
                }

                /* Nothing to look up when str is exhausted; just backtrack below */
                if (*s1)
                {
                        char *utf_boundary = s1, *tmp;
                        char **sp;

                        /* Find beginning of first mismatching utf-8 encoded char */
                        while (*(tmp = g_utf8_find_next_char(utf_boundary, NULL)))
                        {
                                if (tmp > s1 + j)
                                        break;
                                utf_boundary = tmp;
                        }

                        /* Push first mismatching char to the list if it's a special char */
                        sp = linguistics_get_special(utf_boundary, tmp);
                        if (sp)
                        {
                                spp = g_new(struct special_pos, 1);
                                spp->variants = sp;
                                spp->n = 1;
                                spp->s1 = utf_boundary;
                                spp->s2 = s2 + (utf_boundary - s1);
                                l = g_list_prepend(l, spp);
                        }
                }

                /* Try to find a match using special char variants from the list */
                while (l && !got_match)
                {
                        spp = l->data;
                        s1 = spp->s1;
                        s2 = spp->s2;
                        /* Variants live in columns 1 and 2 of the table row */
                        while (spp->n < 3 && !got_match)
                        {
                                char *s = spp->variants[(spp->n)++];
                                int len;
                                if (!s)
                                        break;
                                len = strlen(s);
                                if (!strncmp(s, s2, len))
                                {
                                        /* Continue after the variant in match and after the special char in str */
                                        s2 += len;
                                        s1 += strlen(spp->variants[0]);
                                        got_match = 1;
                                }
                        }
                        if (spp->n >= 3 || !spp->variants[spp->n])
                        {
                                /* No variants left for current top list element, go to the closest special char towards beginning of the string */
                                g_free(spp);
                                l = g_list_delete_link(l, l);
                        }
                }
                if (!got_match)
                {
                        /* NO MATCH
                         * FIXME: If we're going to use this function to sort a string list alphabetically we should use
                         * utf-aware comparison here.
                         */
                        ret = 1;
                        break;
                }
        }

        /* Release backtracking records that were still pending */
        while (l)
        {
                g_free(l->data);
                l = g_list_delete_link(l, l);
        }
        return ret;
}

/**
 * @brief Replace special characters of a string by one of their plain variants.
 *
 * @param str utf-8 string to expand, may be NULL.
 * @param mode column of the special[] table the replacements are taken from.
 *        0 requests no replacement at all.
 * @return NULL if str is NULL, if mode is not a valid table column, or if no
 *         character of str was replaced. Otherwise a newly allocated string
 *         (for mode 0 an unmodified copy of str), to be g_free()d after use.
 */
char *
linguistics_expand_special(char *str, int mode)
{
        const int n_entries = sizeof(special) / sizeof(special[0]);
        const int n_columns = sizeof(special[0]) / sizeof(special[0][0]);
        char *in = str;
        char *out, *ret;
        int found = 0;

        if (!str)
        {
                return NULL;
        }

        /* mode indexes a table row below, so it must not leave the row */
        if (mode < 0 || mode >= n_columns)
        {
                return NULL;
        }

        ret = g_strdup(str);

        if (!mode)
        {
                return ret;
        }

        out = ret;
        while (*in)
        {
                char *next = g_utf8_find_next_char(in, NULL);
                int len = next - in;
                const char *replace = NULL;
                int i;

                /* All table keys are multi-byte, single byte chars can be skipped */
                for (i = 0; len > 1 && i < n_entries; i++)
                {
                        const char *search = special[i][0];
                        int replace_len;

                        if (strncmp(in, search, len) || !special[i][mode])
                        {
                                continue;
                        }

                        replace = special[i][mode];
                        replace_len = strlen(replace);
                        if (replace_len > len)
                        {
                                /* The output buffer is only as large as the input, so a longer
                                 * replacement cannot be stored: report it and keep the original
                                 * character instead. */
                                fprintf(
                                                stderr,
                                                "* ERROR !! ERROR !! found %s %s %d %s %d\n",
                                                in, search, len, replace, replace_len);
                                dbg_assert(replace_len <= len);
                                replace = NULL;
                        }
                        break;
                }

                if (replace)
                {
                        strcpy(out, replace);
                        out += strlen(replace);
                        in = next;
                        found = 1;
                }
                else
                {
                        /* Copy the character unchanged */
                        while (len-- > 0)
                        {
                                *out++ = *in++;
                        }
                }
        }
        *out = '\0';

        if (!found)
        {
                g_free(ret);
                ret = NULL;
        }
        return ret;
}

/**
 * @brief Split a string into words.
 *
 * Thin wrapper around strtok() with a fixed set of word delimiters, so
 * strtok()'s rules apply: pass the string on the first call and NULL on
 * the following calls, the string is modified in place, and the
 * tokenizer state is shared process-wide.
 *
 * @param str string to start tokenizing, or NULL to continue with the previous one.
 * @return pointer to the next word, or NULL if there are no more words.
 */
char *
linguistics_next_word(char *str)
{
        return strtok(str, " -/()\"\',.;_[]{}\\");
}

/**
 * @brief Check a word against a fixed list of ignored words.
 *
 * The comparison ignores ASCII case and does not depend on the locale.
 * NOTE(review): the listed words look like generic German street suffixes
 * that are presumably skipped as search terms - confirm against callers.
 *
 * @param str word to check.
 * @return 0 if str equals "str", "str.", "strasse" or "weg", 1 otherwise.
 */
int linguistics_search(char *str)
{
        static const char *const ignored[] = { "str", "str.", "strasse", "weg" };
        unsigned int i;

        for (i = 0; i < sizeof(ignored) / sizeof(ignored[0]); i++)
        {
                if (!g_ascii_strcasecmp(str, ignored[i]))
                        return 0;
        }
        return 1;
}

/**
 * @brief Copy one utf8 encoded char to newly allocated buffer.
 *
 * @param s pointer to the beginning of the char.
 * @return  newly allocated nul-terminated string containing one utf8 encoded character.
 */
static char *linguistics_dup_utf8_char(const char *s)
{
        char *ret, *next;
        next = g_utf8_find_next_char(s, NULL);
        ret=g_new(char, next-s+1);
        g_strlcpy(ret, s, next - s + 1);
        return ret;
}

void linguistics_init(void)
{
        int i;
        special_hash = g_hash_table_new(g_str_hash, g_str_equal);
        casefold_hash = g_hash_table_new(g_str_hash, g_str_equal);

        for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
        {
                g_hash_table_insert(special_hash, (gpointer) special[i][0], special[i]);
        }

        for (i = 0; upperlower[i]; i += 2)
        {
                int j, k;
                for (j = 0, k = 0; upperlower[i][j] && upperlower[i + 1][k];)
                {
                        char *s1 = linguistics_dup_utf8_char(upperlower[i] + j);
                        char *s2 = linguistics_dup_utf8_char(upperlower[i + 1] + k);
                        g_hash_table_insert(casefold_hash, s1, s2);
                        j += strlen(s1);
                        k += strlen(s2);
                }
        }
}

1	#include <string.h>
2	#include <stdio.h>
3	#include <glib.h>
4	#include "debug.h"
5	#include "linguistics.h"
6
7	static const char *special[][3] =
8	{
9	/* Capital Diacritics */
10	/* ¨ Diaresis */
11	{ "Ä", "A", "AE" },
12	{ "Ë", "E" },
13	{ "Ï", "I" },
14	{ "Ö", "O", "OE" },
15	{ "Ü", "U", "UE" },
16	{ "Ÿ", "Y" },
17	/* ˝ Double Acute Accent */
18	{ "Ő", "O", "Ö" },
19	{ "Ű", "U", "Ü" },
20	/* ´ Acute Accent */
21	{ "Á", "A" },
22	{ "Ć", "C" },
23	{ "É", "E" },
24	{ "Í", "I" },
25	{ "Ĺ", "L" },
26	{ "Ń", "N" },
27	{ "Ó", "O" },
28	{ "Ŕ", "R" },
29	{ "Ś", "S" },
30	{ "Ú", "U" },
31	{ "Ý", "Y" },
32	{ "Ź", "Z" },
33	/* ˛ Ogonek (nosinė) */
34	{ "Ą", "A" },
35	{ "Ę", "E" },
36	{ "Į", "I" },
37	{ "Ų", "U" },
38	/* ˙ Dot */
39	{ "Ċ", "C" },
40	{ "Ė", "E" },
41	{ "Ġ", "G" },
42	{ "İ", "I" },
43	{ "Ŀ", "L" },
44	{ "Ż", "Z" },
45	/* – Stroke */
46	{ "Đ", "D", "DJ" }, /* Croatian Dj, not to be confused with the similar-looking Icelandic Eth */
47	{ "Ħ", "H" },
48	{ "Ł", "L" },
49	{ "Ŧ", "T" },
50	/* ˚ Ring */
51	{ "Å", "A", "AA" },
52	{ "Ů", "U" },
53	/* ˇ Caron (haček, paukščiukas) */
54	{ "Č", "C" },
55	{ "Ď", "D" },
56	{ "Ě", "E" },
57	{ "Ľ", "L" },
58	{ "Ň", "N" },
59	{ "Ř", "R" },
60	{ "Š", "S" },
61	{ "Ť", "T" },
62	{ "Ž", "Z" },
63	/* / Slash */
64	{ "Ø", "O", "OE" },
65	/* ¯ Macron */
66	{ "Ā", "A", "AA" },
67	{ "Ē", "E", "EE" },
68	{ "Ī", "I", "II" },
69	{ "Ō", "O", "OO" },
70	{ "Ū", "U", "UU" },
71	/* ˘ Brevis */
72	{ "Ă", "A" },
73	{ "Ĕ", "E" },
74	{ "Ğ", "G" },
75	{ "Ĭ", "I" },
76	{ "Ŏ", "O" },
77	{ "Ŭ", "U" },
78	/* ^ Circumflex */
79	{ "Â", "A" },
80	{ "Ĉ", "C" },
81	{ "Ê", "E" },
82	{ "Ĝ", "G" },
83	{ "Ĥ", "H" },
84	{ "Î", "I" },
85	{ "Ĵ", "J" },
86	{ "Ô", "O" },
87	{ "Ŝ", "S" },
88	{ "Û", "U" },
89	{ "Ŵ", "W" },
90	{ "Ŷ", "Y" },
91	/* ¸ Cedilla */
92	{ "Ç", "C" },
93	{ "Ģ", "G", "GJ" },
94	{ "Ķ", "K", "KJ" },
95	{ "Ļ", "L", "LJ" },
96	{ "Ņ", "N", "NJ" },
97	{ "Ŗ", "R" },
98	{ "Ş", "S" },
99	{ "Ţ", "T" },
100	/* ~ Tilde */
101	{ "Ã", "A" },
102	{ "Ĩ", "I" },
103	{ "Ñ", "N" },
104	{ "Õ", "O" },
105	{ "Ũ", "U" },
106	/* ` Grave */
107	{ "À", "A" },
108	{ "È", "E" },
109	{ "Ì", "I" },
110	{ "Ò", "O" },
111	{ "Ù", "U" },
112	/* ligatures */
113	{ "Æ", "A", "AE" },
114	{ "Ĳ", "IJ" },
115	{ "Œ", "O", "OE" },
116	/* special letters */
117	{ "Ð", "D", "DH" }, /* Icelandic Eth, not to be confused with the similar-looking Croatian Dj */
118	{ "Ŋ", "N", "NG" },
119	{ "Þ", "T", "TH" },
120	/* Small Diacritics */
121	/* ¨ Diaresis */
122	{ "ä", "a", "ae" },
123	{ "ë", "e" },
124	{ "ï", "i" },
125	{ "ö", "o", "oe" },
126	{ "ü", "u", "ue" },
127	{ "ÿ", "y" },
128	/* ˝ Double Acute Accent */
129	{ "ő", "o", "ö" },
130	{ "ű", "u", "ü" },
131	/* ´ Acute Accent */
132	{ "á", "a" },
133	{ "ć", "c" },
134	{ "é", "e" },
135	{ "í", "i" },
136	{ "ĺ", "l" },
137	{ "ń", "n" },
138	{ "ó", "o" },
139	{ "ŕ", "r" },
140	{ "ś", "s" },
141	{ "ú", "u" },
142	{ "ý", "y" },
143	{ "ź", "z" },
144	/* ˛ Ogonek (nosinė) */
145	{ "ą", "a" },
146	{ "ę", "e" },
147	{ "į", "i" },
148	{ "ų", "u" },
149	/* ˙ Dot (and dotless i) */
150	{ "ċ", "c" },
151	{ "ė", "e" },
152	{ "ġ", "g" },
153	{ "ı", "i" },
154	{ "ŀ", "l" },
155	{ "ż", "z" },
156	/* – Stroke */
157	{ "đ", "d", "dj" },
158	{ "ħ", "h" },
159	{ "ł", "l" },
160	{ "ŧ", "t" },
161	/* ˚ Ring */
162	{ "å", "a", "aa" },
163	{ "ů", "u" },
164	/* ˇ Caron (haček, paukščiukas) */
165	{ "č", "c" },
166	{ "ď", "d" },
167	{ "ě", "e" },
168	{ "ľ", "l" },
169	{ "ň", "n" },
170	{ "ř", "r" },
171	{ "š", "s" },
172	{ "ť", "t" },
173	{ "ž", "z" },
174	/* / Slash */
175	{ "ø", "o", "oe" },
176	/* Macron */
177	{ "ā", "a", "aa" },
178	{ "ē", "e", "ee" },
179	{ "ī", "i", "ii" },
180	{ "ō", "o", "oo" },
181	{ "ū", "u", "uu" },
182	/* ˘ Brevis */
183	{ "ă", "a" },
184	{ "ĕ", "e" },
185	{ "ğ", "g" },
186	{ "ĭ", "i" },
187	{ "ŏ", "o" },
188	{ "ŭ", "u" },
189	/* ^ Circumflex */
190	{ "â", "a" },
191	{ "ĉ", "c" },
192	{ "ê", "e" },
193	{ "ĝ", "g" },
194	{ "ĥ", "h" },
195	{ "î", "i" },
196	{ "ĵ", "j" },
197	{ "ô", "o" },
198	{ "ŝ", "s" },
199	{ "û", "u" },
200	{ "ŵ", "w" },
201	{ "ŷ", "y" },
202	/* ¸ Cedilla */
203	{ "ç", "c" },
204	{ "ģ", "g", "gj" },
205	{ "ķ", "k", "kj" },
206	{ "ļ", "l", "lj" },
207	{ "ņ", "n", "nj" },
208	{ "ŗ", "r" },
209	{ "ş", "s" },
210	{ "ţ", "t" },
211	/* ~ Tilde */
212	{ "ã", "a" },
213	{ "ĩ", "i" },
214	{ "õ", "o" },
215	{ "ñ", "n" },
216	{ "ũ", "u" },
217	/* ` Grave */
218	{ "à", "a" },
219	{ "è", "e" },
220	{ "ì", "i" },
221	{ "ò", "o" },
222	{ "ù", "u" },
223	/* ligatures */
224	{ "æ", "a", "ae" },
225	{ "ĳ", "ij" },
226	{ "œ", "o", "oe" },
227	{ "ß", "s", "ss" },
228	/* special letters */
229	{ "ð", "d", "dh" },
230	{ "ŋ", "n", "ng" },
231	{ "þ", "t", "th" },
232
233	/* Cyrillic capital */
234
235	{ "Ё", "Е" },
236	{ "Й", "И" },
237	{ "І", "I" },
238	{ "Ї", "I" },
239	{ "Ў", "У" },
240	{ "Є", "Е", "Э" },
241	{ "Ґ", "Г" },
242	{ "Ѓ", "Г" },
243	{ "Ђ", "Д" },
244	{ "Ќ", "К" },
245	//{"Љ","Л","ЛЬ"},
246	//{"Њ","Н","НЬ"},
247	{ "Џ", "Ц" },
248
249	/* Cyrillic small */
250
251	{ "ё", "е" },
252	{ "й", "и" },
253	{ "і", "i" },
254	{ "ї", "i" },
255	{ "ў", "у" },
256	//{"є","е","э"},
257	{ "ґ", "г" },
258	{ "ѓ", "г" },
259	{ "ђ", "д" },
260	{ "ќ", "к" },
261	//{"љ","л","ль"},
262	//{"њ","н","нь"},
263	{ "џ", "ц" },
264
265	};
266
267	static GHashTable *special_hash;
268
269	/* Array of strings for case conversion
270	* Even elements of array are strings of upper-case letters
271	* Odd elements of array are strings of lower-case letters, in the order corresponding to directly preceeding even element.
272	* Last element of array should be NULL.
273	*/
274	static const char
275	*upperlower[] =
276	{
277	/Latin diacritics/
278	"ÄËÏÖÜŸŐŰÁĆÉÍĹŃÓŔŚÚÝŹĄĘĮŲĊĖĠİĿŻĐĦŁŦÅŮČĎĚĽŇŘŠŤŽØĀĒĪŌŪĂĔĞĬŎŬÂĈÊĜĤÎĴÔŜÛŴŶÇĢĶĻŅŖŞŢÃĨÑÕŨÀÈÌÒÙÆĲŒÐŊÞ",
279	"äëïöüÿőűáćéíĺńóŕśúýźąęįųċėġıŀżđħłŧåůčďěľňřšťžøāēīōūăĕğĭŏŭâĉêĝĥîĵôŝûŵŷçģķļņŗşţãĩõñũàèìòùæĳœðŋþ",
280	/Cyrillic/
281	"АБВГҐЃДЂЕЄЁЖЗИЙКЌЛЉМНЊОПРСТУФХЦЏЧШЩЪЫЬЭЮЯІЇЎ",
282	"абвгґѓдђеєёжзийкќлљмнњопрстуфхцџчшщъыьэюяіїў",
283
284	NULL };
285
286	static GHashTable *casefold_hash;
287
288	struct special_pos
289	{
290	char **variants;
291	int n;
292	char s1, s2;
293	};
294
295	static char**
296	linguistics_get_special(char str, char end)
297	{
298	char buf[10];
299	int len;
300	if (!end)
301	end = g_utf8_find_next_char(str, NULL);
302	len = end - str + 1;
303	g_strlcpy(buf, str, len > 10 ? 10 : len);
304	return g_hash_table_lookup(special_hash, buf);
305	}
306
307	/*
308	* @brief Prepare an utf-8 string for case insensitive comparison.
309	* @param in String to prepeare.
310	* @return String prepared for case insensitive search. Result shoud be g_free()d after use.
311	*/
312	char*
313	linguistics_casefold(char *in)
314	{
315	int len = strlen(in);
316	char *src = in;
317	char *ret=g_new(char,len+1);
318	char *dest = ret;
319	char buf[10];
320	while (*src && dest - ret < len)
321	{
322	if (src >= 'A' && src <= 'Z')
323	{
324	dest++ = src++ - 'A' + 'a';
325	}
326	else if (!(*src & 128))
327	{
328	dest++ = src++;
329	}
330	else
331	{
332	int charlen;
333	char tmp, folded;
334	tmp = g_utf8_find_next_char(src, NULL);
335	charlen = tmp - src + 1;
336	g_strlcpy(buf, src, charlen > 10 ? 10 : charlen);
337	folded = g_hash_table_lookup(casefold_hash, buf);
338
339	if (folded)
340	{
341	while (*folded && dest - ret < len)
342	dest++ = folded++;
343	src = tmp;
344	}
345	else
346	{
347	while (src < tmp && dest - ret < len)
348	dest++ = src++;
349	}
350	}
351	}
352	*dest = 0;
353	if (*src)
354	dbg(
355	0,
356	"Casefolded string for '%s' needs extra space, result is trucated to '%s'.\n",
357	in, ret);
358	return ret;
359	}
360
361	/**
362	* @brief Compare two strings using special characters expansion.
363	*
364	* @param str first string to compare, special characters are expanded.
365	* @param match second string to compare, special characters are not expanded.
366	* @param partial if = 1 then str string may be shorter than match string, in which case the rest from str isn't analysed.
367	* @return =0 strings matched, =1 not matched. Note this function return value is not fully compatible with strcmp().
368	*/
369
370	int linguistics_compare(char str, char match, int partial)
371	{
372	char s1 = str, s2 = match;
373	char **sp;
374	int ret = 0;
375	int got_match;
376	GList *l = NULL;
377	while (s1 && s2)
378	{
379	int j;
380	struct special_pos *spp;
381	char utf_boundary, tmp;
382	/* Skip all matching chars */
383	for (j = 0; s1[j] && s1[j] == s2[j]; j++)
384	;
385	if (!s2[j] && (partial \|\| !s1[j]))
386	{
387	/* MATCH! */
388	ret = 0;
389	break;
390	}
391	/* Find beginning of first mismatching utf-8 encoded char */
392	utf_boundary = s1;
393	while (*(tmp = g_utf8_find_next_char(utf_boundary, NULL)))
394	{
395	if (tmp > s1 + j)
396	break;
397	utf_boundary = tmp;
398	}
399	/* Push first mismatching char to the list if it's a special char */
400	sp = linguistics_get_special(utf_boundary, tmp);
401
402	if (sp)
403	{
404	spp=g_new(struct special_pos,1);
405	spp->variants = sp;
406	spp->n = 1;
407	spp->s1 = utf_boundary;
408	spp->s2 = s2 + (utf_boundary - s1);
409	l = g_list_prepend(l, spp);
410	}
411
412	/* Try to find a match using special char variants from the list */
413	got_match = 0;
414	while (l && !got_match)
415	{
416	spp = l->data;
417	s1 = spp->s1;
418	s2 = spp->s2;
419	while (spp->n < 3 && !got_match)
420	{
421	char *s = spp->variants[(spp->n)++];
422	int len;
423	if (!s)
424	break;
425	len = strlen(s);
426	if (!strncmp(s, s2, len))
427	{
428	s2 += len;
429	s1 += strlen(spp->variants[0]);
430	got_match = 1;
431	break;
432	}
433	}
434	if (spp->n >= 3 \|\| !spp->variants[spp->n])
435	{
436	/* No matches for current top list element, go to the closest special char towards beginning of the string */
437	g_free(spp);
438	l = g_list_delete_link(l, l);
439	}
440	}
441	if (!got_match)
442	{
443	/* NO MATCH
444	* FIXME: If we're going to use this function to sort a string list alphabetically we should use
445	* utf-aware comparison here.
446	*/
447	ret = 1;
448	break;
449	}
450	}
451	while (l)
452	{
453	g_free(l->data);
454	l = g_list_delete_link(l, l);
455	}
456	return ret;
457	}
458
459	char *
460	linguistics_expand_special(char *str, int mode)
461	{
462	char *in = str;
463	char out, ret;
464	int found = 0;
465
466	if (!str)
467	{
468	return NULL;
469	}
470
471	ret = g_strdup(str);
472	out = ret;
473
474	if (!mode)
475	{
476	return ret;
477	}
478
479	while (*in)
480	{
481	char *next = g_utf8_find_next_char(in, NULL);
482	int i, len = next - in;
483	int match = 0;
484	if (len > 1)
485	{
486	for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
487	{
488	const char *search = special[i][0];
489	if (!strncmp(in, search, len))
490	{
491	const char *replace = special[i][mode];
492	if (replace)
493	{
494	int replace_len = strlen(replace);
495
496	if (replace_len > len)
497	{
498	fprintf(
499	stderr,
500	"* ERROR !! ERROR !! found %s %s %d %s %d\n",
501	in, search, len, replace, replace_len);
502	}
503	dbg_assert(replace_len <= len);
504	if (replace_len > len)
505	{
506	out += len;
507	match = 0;
508	break;
509	}
510	else
511	{
512	// fprintf(stderr," GOOD !! GOOD !! found %s %s %d %s %d\n",in,search,len,replace,replace_len);
513	strcpy(out, replace);
514	out += replace_len;
515	match = 1;
516	break;
517	}
518	}
519	}
520	}
521	}
522
523	if (match)
524	{
525	found = 1;
526	in = next;
527	}
528	else
529	{
530	while (len-- > 0)
531	{
532	out++ = in++;
533	}
534	}
535	}
536	*out++ = '\0';
537	if (!found)
538	{
539	if (ret)
540	{
541	g_free(ret);
542	}
543	ret = NULL;
544	}
545	return ret;
546	}
547
548	char *
549	linguistics_next_word(char *str)
550	{
551	char* ret = strtok(str, " -/()\"\',.;_[]{}\\");
552	return ret;
553
554	// int len=strcspn(str, " -/()");
555	// if (!str[len] \|\| !str[len+1])
556	// return NULL;
557	// return str+len+1;
558
559	}
560
561	int linguistics_search(char *str)
562	{
563	if (!g_strcasecmp(str, "str"))
564	return 0;
565	if (!g_strcasecmp(str, "str."))
566	return 0;
567	if (!g_strcasecmp(str, "strasse"))
568	return 0;
569	if (!g_strcasecmp(str, "weg"))
570	return 0;
571	return 1;
572	}
573
574	/**
575	* @brief Copy one utf8 encoded char to newly allocated buffer.
576	*
577	* @param s pointer to the beginning of the char.
578	* @return newly allocated nul-terminated string containing one utf8 encoded character.
579	*/
580	static char linguistics_dup_utf8_char(const char s)
581	{
582	char ret, next;
583	next = g_utf8_find_next_char(s, NULL);
584	ret=g_new(char, next-s+1);
585	g_strlcpy(ret, s, next - s + 1);
586	return ret;
587	}
588
589	void linguistics_init(void)
590	{
591	int i;
592	special_hash = g_hash_table_new(g_str_hash, g_str_equal);
593	casefold_hash = g_hash_table_new(g_str_hash, g_str_equal);
594
595	for (i = 0; i < sizeof(special) / sizeof(special[0]); i++)
596	{
597	g_hash_table_insert(special_hash, (gpointer) special[i][0], special[i]);
598	}
599
600	for (i = 0; upperlower[i]; i += 2)
601	{
602	int j, k;
603	for (j = 0, k = 0; upperlower[i][j] && upperlower[i + 1][k];)
604	{
605	char *s1 = linguistics_dup_utf8_char(upperlower[i] + j);
606	char *s2 = linguistics_dup_utf8_char(upperlower[i + 1] + k);
607	g_hash_table_insert(casefold_hash, s1, s2);
608	j += strlen(s1);
609	k += strlen(s2);
610	}
611	}
612	}
613