zhash.c
author viric@llimona
Wed, 29 Aug 2007 00:19:14 +0200
changeset 14 a961bb8806b9
parent 13 f71e89074c62
child 16 b4e251400e36
permissions -rw-r--r--
first 'zparsetext'.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "dictre.h"

/* MAXHASH is the number of buckets in the wordlist hash table (2^16). */
enum
{
    MAXHASH=1<<16
};

/*
 * Node of a singly linked list of strings.
 * Used by WordEntry for its 'accented' and 'unflexed' lists.
 */
struct BareWord
{
    struct BareWord *next; /* Next node, or 0 at the end of the list */
    char *str;             /* String held by this node; the code in this file
                              fills it with strdup() or mix_accents() */
};

/*
 * One entry of a hash bucket chain in wordlist.
 * Entries created by init_wordlist() are placeholders with every pointer
 * set to 0; entries created by insert_word() carry real data.
 */
struct WordEntry
{
    struct WordEntry *next;     /* Next entry in the same bucket, or 0 */
    char *str;                  /* Lookup key: the word as written out by
                                   remove_accent(); 0 for a placeholder */
    struct BareWord *accented;  /* Node holding the accented form, maintained
                                   by set_accented() */
    struct BareWord *unflexed;  /* List of distinct strings collected by
                                   add_to_unflexed() */
};

/*
 * The hash table: one chain of WordEntry nodes per bucket.
 * insert_word() picks the bucket with hash_func(); init_wordlist() seeds
 * every bucket with a placeholder entry.
 */
static struct WordEntry * wordlist[MAXHASH];

/*
 * Allocates a single WordEntry on the heap and returns it.
 * The members are NOT initialized; the caller has to fill them in.
 * A failed allocation trips the assert.
 */
struct WordEntry * new_WordEntry()
{
    struct WordEntry *entry = malloc(sizeof *entry);

    assert(entry != 0);
    return entry;
}

/*
 * Allocates a single BareWord on the heap and returns it.
 * The members are NOT initialized; the caller has to fill them in.
 * A failed allocation trips the assert.
 */
struct BareWord * new_BareWord()
{
    struct BareWord *node = malloc(sizeof *node);

    assert(node != 0);
    return node;
}

/*
 * Seeds every bucket of wordlist with a freshly allocated placeholder
 * entry whose pointers are all 0.  A placeholder is recognizable by its
 * null 'str'; the lookup and dump routines skip such entries.
 */
void init_wordlist()
{
    int slot = 0;

    while (slot < MAXHASH)
    {
        struct WordEntry *placeholder = new_WordEntry();

        assert(placeholder != 0);
        placeholder->next = 0;
        placeholder->str = 0;
        placeholder->accented = 0;
        placeholder->unflexed = 0;

        wordlist[slot] = placeholder;
        ++slot;
    }
}

/*
 * Computes the bucket index for a NUL-terminated byte string.
 *
 * The hash is built from the bytes at offsets 1 and 3:
 *     (str[1] << 8) + str[3]
 * (the original note says these are "the meaningful utf-8 codes" --
 * presumably the second byte of each of the first two 2-byte characters).
 *
 * Bytes are only read while the string is known to extend that far, so a
 * short string never causes a read past its terminator; missing bytes
 * count as 0.  Both bytes are handled as unsigned values, so the result is
 * always in the range 0 .. 0xFFFF, i.e. below 1<<16 buckets, whatever the
 * signedness of plain char.
 *
 * str:     NUL-terminated string to hash (must not be a null pointer).
 * returns: a value in [0, 65535].
 */
static unsigned int hash_func(const unsigned char *str)
{
    unsigned int high = 0;
    unsigned int low = 0;

    if (str[0] != 0)
    {
        high = str[1];

        /* str[3] exists only if neither str[1] nor str[2] terminates */
        if (str[1] != 0 && str[2] != 0)
            low = str[3];
    }

    return (high << 8) + low;
}

/*
 * Looks up 'word' (the word without accent) in bucket 'hash' of wordlist.
 * Returns the matching entry, or 0 when the bucket holds no such word.
 */
struct WordEntry * does_word_exist(int hash, const char *word)
{
    struct WordEntry *entry = wordlist[hash];

    while (entry != 0)
    {
        /* The last item in the linked list will have str=0: skip it */
        if (entry->str != 0 && strcmp(word, entry->str) == 0)
            return entry;

        entry = entry->next;
    }

    return 0;
}

/*
 * Records 'word' in the 'unflexed' list of entry 'pos', unless an equal
 * string is already present.
 *
 * A new string is copied with strdup() and put at the head of the list.
 * An empty list needs no special case: the search loop simply finds
 * nothing and the new node becomes the only element (its 'next' is 0).
 *
 * Like the allocators in this file, a failed copy trips an assert.
 */
void add_to_unflexed(struct WordEntry *pos, const char *word)
{
    struct BareWord *node;

    /* Look for the same word; nothing to do when it is already there */
    for(node = pos->unflexed; node != 0; node = node->next)
    {
        if (strcmp(word, node->str) == 0)
            return;
    }

    /* Not found: prepend a copy */
    node = new_BareWord();
    node->str = strdup(word);
    assert(node->str != 0);
    node->next = pos->unflexed;
    pos->unflexed = node;
}

void set_accented(struct WordEntry *pos, const char *word)
{
    if (pos->accented)
        /* Will free the first parameter */
        pos->accented->str = mix_accents(pos->accented->str, word);
    else
    {
        pos->accented = new_BareWord();
        pos->accented->str = strdup(word);
        pos->accented->next = 0;
    }
}

/*
 * Adds one (word, unflexed) pair to the hash table.
 *
 * The lookup key is the output of remove_accent(word); the bucket is
 * chosen by hash_func() on that key.
 *  - If an entry with this key exists, 'word' is merged into its accented
 *    form and 'unflexed' is added to its unflexed list.
 *  - Otherwise a new entry is created with a copy of the key, filled in
 *    the same way, and put at the head of the bucket chain.
 *
 * NOTE(review): the key buffer holds MAXWORD bytes; whether remove_accent()
 * can write more than that cannot be told from this file -- confirm.
 *
 * Like the allocators in this file, a failed key copy trips an assert.
 */
void insert_word(const char *word, const char *unflexed)
{
    unsigned char word_no_accent[MAXWORD];
    struct WordEntry *found;
    unsigned int hash_num;

    remove_accent(word_no_accent, word);

    hash_num = hash_func(word_no_accent);

    /* Where to insert */
    found = does_word_exist(hash_num, (const char *) word_no_accent);
    if (found)
    {
        set_accented(found, word);
        add_to_unflexed(found, unflexed);
    } else /* Does not exist */
    {
        /* new word */
        struct WordEntry *entry;

        entry = new_WordEntry();
        entry->str = strdup((const char *) word_no_accent);
        assert(entry->str != 0);
        /* Both lists must be empty before the helpers inspect them */
        entry->unflexed = 0;
        add_to_unflexed(entry, unflexed);
        entry->accented = 0;
        set_accented(entry, word);
        /* Put it on the head of the hash list */
        entry->next = wordlist[hash_num];
        wordlist[hash_num] = entry;
    }
}

/*
 * Prints one entry to stdout on a single line:
 *     :<key>:<accented form> <unflexed 1> <unflexed 2> ...
 * The entry must have a non-null 'str' and a non-null 'accented' node.
 */
static void dump_word(struct WordEntry *word)
{
    struct BareWord *form = word->unflexed;

    printf(":%s:%s", word->str, word->accented->str);

    while (form != 0)
    {
        printf(" %s", form->str);
        form = form->next;
    }

    printf("\n");
}

void dump_wordlist()
{
    int i;
    for(i=0; i < MAXHASH; ++i)
    {
        struct WordEntry *word;
        word = wordlist[i];
        while (word != 0)
        {
            if (word->str)
                dump_word(word);
            word = word->next;
        }
    }
}