zhash.c
changeset 11 68ea18fe402c
child 13 f71e89074c62
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/zhash.c	Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,183 @@
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "dictre.h"
+
+enum  /* Size of the word hash table. */
+{
+    MAXHASH=1<<16  /* 65536 buckets: the length of the wordlist array */
+};
+
+struct BareWord  /* Node of a singly linked list of strings. */
+{
+    struct BareWord *next;  /* following node, 0 at the end of the list */
+    char *str;              /* heap string: a strdup() copy or the value returned by mix_accents() */
+};
+
+struct WordEntry  /* One entry of a hash bucket chain, keyed by the accent-free word. */
+{
+    struct WordEntry *next;     /* next entry in the same bucket, 0 at the end */
+    char *str;                  /* key: copy of the remove_accent() output; 0 in the placeholder entries made by init_wordlist() */
+    struct BareWord *accented;  /* accented spelling; created and later merged by set_accented() */
+    struct BareWord *unflexed;  /* list filled by add_to_unflexed(); presumably the uninflected forms - TODO confirm */
+};
+
+static struct WordEntry * wordlist[MAXHASH];  /* hash table: slot i heads the chain of entries whose hash is i */
+
+/*
+ * Allocate one WordEntry on the heap and return it with every field set to 0.
+ *
+ * Never returns a null pointer: when the allocation fails, a message is
+ * written to stderr and the program is aborted.
+ */
+struct WordEntry * new_WordEntry()
+{
+    struct WordEntry *entry;
+
+    entry = malloc(sizeof(*entry));
+    if (entry == 0)
+    {
+        /* assert() is compiled out under NDEBUG, so fail explicitly. */
+        fprintf(stderr, "new_WordEntry: out of memory\n");
+        abort();
+    }
+
+    /* Hand back a fully defined object even if the caller sets only some fields. */
+    entry->next = 0;
+    entry->str = 0;
+    entry->accented = 0;
+    entry->unflexed = 0;
+    return entry;
+}
+
+/*
+ * Allocate one BareWord on the heap and return it with both fields set to 0.
+ *
+ * Never returns a null pointer: when the allocation fails, a message is
+ * written to stderr and the program is aborted.
+ */
+struct BareWord * new_BareWord()
+{
+    struct BareWord *node;
+
+    node = malloc(sizeof(*node));
+    if (node == 0)
+    {
+        /* assert() is compiled out under NDEBUG, so fail explicitly. */
+        fprintf(stderr, "new_BareWord: out of memory\n");
+        abort();
+    }
+
+    /* Hand back a fully defined object even if the caller sets only some fields. */
+    node->next = 0;
+    node->str = 0;
+    return node;
+}
+
+/*
+ * Fill every slot of wordlist with a freshly allocated placeholder entry.
+ * A placeholder has no key (str == 0), no accented/unflexed data and no
+ * successor, so it ends up as the tail of the bucket's chain.
+ */
+void init_wordlist()
+{
+    int bucket = 0;
+
+    while (bucket < MAXHASH)
+    {
+        struct WordEntry *placeholder = new_WordEntry();
+
+        assert(placeholder != 0);
+        placeholder->next = 0;
+        placeholder->unflexed = 0;
+        placeholder->accented = 0;
+        placeholder->str = 0;
+
+        wordlist[bucket] = placeholder;
+        ++bucket;
+    }
+}
+
+/*
+ * Compute the bucket number of a NUL-terminated byte string.
+ *
+ * The result is (str[1] << 8) + str[3], where str[3] is only used when the
+ * string is at least three bytes long. Both bytes are taken as unsigned
+ * values, so the result always lies in 0 .. 65535 and does not depend on
+ * whether plain char is signed on the platform.
+ *
+ * Strings shorter than two bytes hash to 0; no byte behind the terminator
+ * is ever read.
+ *
+ * NOTE(review): the choice of bytes 1 and 3 presumably targets text made of
+ * two-byte UTF-8 sequences (second byte of the first two letters) - confirm
+ * against the output of remove_accent().
+ */
+static unsigned int hash_func(const unsigned char *str)
+{
+    unsigned int high;
+    unsigned int low;
+
+    /* Too short to have a byte at index 1: stop before the terminator. */
+    if (str[0] == 0 || str[1] == 0)
+        return 0;
+
+    high = str[1];
+    low = 0;
+
+    /* Taking only the meaningful utf-8 codes */
+    if (str[2] != 0)
+        low = str[3];
+
+    return (high << 8) + low;
+}
+
+/*
+ * Look up `word` (the word without accent) in bucket `hash` of wordlist.
+ * `hash` is used directly as the array index. Returns the entry whose key
+ * equals `word`, or 0 when the bucket holds no such entry.
+ */
+struct WordEntry * does_word_exist(int hash, const char *word)
+{
+    struct WordEntry *entry = wordlist[hash];
+
+    while (entry != 0)
+    {
+        /* The last item in the linked list has str == 0 and is skipped. */
+        if (entry->str != 0 && strcmp(word, entry->str) == 0)
+            return entry;
+        entry = entry->next;
+    }
+
+    return 0;
+}
+
+/*
+ * Add a copy of `word` to the unflexed list of `pos`, unless the list
+ * already contains an equal string.
+ *
+ * New words are appended at the tail, so the first word added stays at the
+ * head of the list. A null `word` is ignored.
+ */
+void add_to_unflexed(struct WordEntry *pos, const char *word)
+{
+    struct BareWord **link;
+    struct BareWord *node;
+
+    if (word == 0)
+        return;
+
+    /*
+     * Walk the list through the address of each `next` pointer; when the
+     * loop ends, `link` points at the null pointer that terminates the list
+     * (pos->unflexed itself when the list is empty).
+     */
+    for (link = &pos->unflexed; *link != 0; link = &(*link)->next)
+    {
+        /* Compare against the list element, not against the entry's key. */
+        if (strcmp(word, (*link)->str) == 0)
+            return;
+    }
+
+    node = new_BareWord();
+    node->str = strdup(word);
+    assert(node->str != 0);
+    node->next = 0;
+    *link = node;
+}
+
+/*
+ * Store the accented spelling `word` in `pos`.
+ * The first call for an entry creates its accented node with a copy of
+ * `word`; later calls replace the stored string with the result of
+ * mix_accents(stored, word).
+ */
+void set_accented(struct WordEntry *pos, const char *word)
+{
+    struct BareWord *slot = pos->accented;
+
+    if (slot == 0)
+    {
+        slot = new_BareWord();
+        pos->accented = slot;
+        slot->str = strdup(word);
+        slot->next = 0;
+        return;
+    }
+
+    /* Will free the first parameter */
+    slot->str = mix_accents(slot->str, word);
+}
+
+/*
+ * Record `word` in the hash table under its accent-free spelling.
+ *
+ * remove_accent() is called with a local MAXWORD-byte buffer and `word`;
+ * the buffer content is then used as the key (presumably remove_accent()
+ * writes the stripped word there - confirm in its definition).
+ *
+ * - Key already present: `word` is merged into the entry's accented
+ *   spelling with set_accented(). `unflexed` is not used on this path yet.
+ * - Key absent: a new entry is created with a copy of the key, `unflexed`
+ *   as its first unflexed word and `word` as its accented spelling, and is
+ *   pushed on the head of its bucket chain.
+ */
+void insert_word(const char *word, const char *unflexed)
+{
+    unsigned char word_no_accent[MAXWORD];
+    const char *key;
+    struct WordEntry *found;
+    unsigned int hash_num;
+
+    remove_accent(word_no_accent, word);
+    /* The lookup and strdup() take plain char strings. */
+    key = (const char *) word_no_accent;
+
+    hash_num = hash_func(word_no_accent);
+    /* hash_num indexes wordlist directly, here and in does_word_exist(). */
+    assert(hash_num < MAXHASH);
+
+    /* Where to insert */
+    found = does_word_exist(hash_num, key);
+    if (found)
+    {
+        set_accented(found, word);
+        /* TODO process word_no_accent */
+    } else /* Does not exist */
+    {
+        /* new word */
+        struct WordEntry *entry;
+
+        entry = new_WordEntry();
+        entry->str = strdup(key);
+        entry->unflexed = 0;
+        add_to_unflexed(entry, unflexed);
+        entry->accented = 0;
+        set_accented(entry, word);
+        /* Put it on the head of the hash list */
+        entry->next = wordlist[hash_num];
+        wordlist[hash_num] = entry;
+    }
+}
+
+/*
+ * Print one entry to stdout as "<key>:<accented spelling>" followed by a
+ * newline. The entry must have both a key and an accented node.
+ */
+static void dump_word(struct WordEntry *word)
+{
+    const char *key = word->str;
+    const char *spelling = word->accented->str;
+
+    printf("%s:%s\n", key, spelling);
+}
+
+/*
+ * Print every stored word, bucket by bucket and in chain order within a
+ * bucket. Entries without a key (str == 0) are skipped.
+ */
+void dump_wordlist()
+{
+    int bucket;
+
+    for (bucket = 0; bucket != MAXHASH; bucket++)
+    {
+        struct WordEntry *entry;
+
+        for (entry = wordlist[bucket]; entry != 0; entry = entry->next)
+        {
+            if (entry->str == 0)
+                continue;
+            dump_word(entry);
+        }
+    }
+}