zhash.c
author viric <viriketo@gmail.com>
Fri, 30 Mar 2012 18:56:20 +0200
changeset 33 ebbedaa090be
parent 16 b4e251400e36
child 32 6a1a709330bf
permissions -rw-r--r--
Adding what I had in the web for zparsetext (akcentiga)
viric@11
     1
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "dictre.h"
viric@11
     4
viric@11
     5
/* Number of buckets in the word hash table (2^16). */
enum { MAXHASH = 1 << 16 };
viric@11
     9
viric@11
    10
/* Node of a singly linked list of strings. */
struct BareWord
{
    struct BareWord *next;  /* following node, or 0 at the end of the list */
    char *str;              /* string carried by this node */
};
viric@11
    15
viric@11
    16
/* One entry of the word hash table; entries of a bucket are chained. */
struct WordEntry
{
    struct WordEntry *next;     /* next entry of the same bucket, or 0 */
    char *str;                  /* lookup key; 0 in the placeholder entries */
    struct BareWord *accented;  /* accented spelling of the word, or 0 */
    struct BareWord *unflexed;  /* list of unflexed forms, or 0 when empty */
};
viric@11
    23
viric@11
    24
/* The hash table itself: one chain of entries per hash value. */
static struct WordEntry *wordlist[MAXHASH];
viric@11
    25
viric@11
    26
struct WordEntry * new_WordEntry()
viric@11
    27
{
viric@11
    28
    struct WordEntry *tmp;
viric@11
    29
    tmp =  (struct WordEntry *) malloc(sizeof(*tmp));
viric@11
    30
    assert(tmp != 0);
viric@11
    31
    return tmp;
viric@11
    32
}
viric@11
    33
viric@11
    34
struct BareWord * new_BareWord()
viric@11
    35
{
viric@11
    36
    struct BareWord *tmp;
viric@11
    37
    tmp =  (struct BareWord *) malloc(sizeof(*tmp));
viric@11
    38
    assert(tmp != 0);
viric@11
    39
    return tmp;
viric@11
    40
}
viric@11
    41
viric@11
    42
void init_wordlist()
viric@11
    43
{
viric@11
    44
    int i;
viric@11
    45
    for(i=0; i < MAXHASH; ++i)
viric@11
    46
    {
viric@11
    47
        struct WordEntry *nodata;
viric@11
    48
        nodata = new_WordEntry();
viric@11
    49
        assert(nodata != 0);
viric@11
    50
        nodata->str = 0;
viric@11
    51
        nodata->accented = 0;
viric@11
    52
        nodata->unflexed = 0;
viric@11
    53
        nodata->next = 0;
viric@11
    54
        wordlist[i] = nodata;
viric@11
    55
    }
viric@11
    56
}
viric@11
    57
viric@11
    58
/*
 * Compute the bucket index of a word; the result always fits in 16 bits,
 * matching a MAXHASH of 2^16.
 *
 * The string is consumed two bytes at a time. For each of the first four
 * byte pairs, the low four bits of the second byte supply one hex digit of
 * the result, the first pair being the most significant digit.
 *
 * Scanning stops at the string terminator, so no byte beyond it is ever
 * read: the hash of a short word depends only on the word itself and not on
 * whatever happens to follow it in the caller's buffer. Missing pairs
 * contribute zero, and the empty string hashes to 0.
 */
static unsigned int hash_func(const unsigned char *str)
{
    unsigned int v = 0;
    unsigned int pair;

    for (pair = 0; pair < 4; ++pair)
    {
        /* Safe to form: every earlier byte was checked to be non-zero. */
        const unsigned char *p = str + 2 * pair;

        if (p[0] == 0)
            break;
        v += (unsigned int) (p[1] & 15) << (4 * (3 - pair));
        if (p[1] == 0)
            break;
    }

    return v;
}
viric@11
    74
viric@11
    75
/* Word without accent */
viric@11
    76
struct WordEntry * does_word_exist(int hash, const char *word)
viric@11
    77
{
viric@11
    78
    struct WordEntry *tmp;
viric@11
    79
viric@11
    80
    for(tmp = wordlist[hash]; tmp != 0; tmp = tmp->next)
viric@11
    81
    {
viric@11
    82
        if (tmp->str) /* The last item in the linked list will have str=0 */
viric@11
    83
            if (strcmp(word, tmp->str) == 0)
viric@11
    84
                return tmp;
viric@11
    85
    }
viric@11
    86
    return 0;
viric@11
    87
}
viric@11
    88
viric@11
    89
void add_to_unflexed(struct WordEntry *pos, const char *word)
viric@11
    90
{
viric@11
    91
    struct BareWord *tmp;
viric@11
    92
viric@11
    93
    if (pos->unflexed == 0)
viric@11
    94
    {
viric@11
    95
        pos->unflexed = new_BareWord();
viric@11
    96
        tmp = pos->unflexed;
viric@11
    97
        tmp->str = strdup(word);
viric@11
    98
        tmp->next = 0;
viric@11
    99
    } else
viric@11
   100
    {
viric@11
   101
        /* Look for the same word */
viric@11
   102
        for(tmp = pos->unflexed; tmp != 0; tmp = tmp->next)
viric@11
   103
        {
viric@13
   104
            if (strcmp(word, tmp->str) == 0)
viric@11
   105
                break;
viric@11
   106
        }
viric@13
   107
        /* If not found... */
viric@11
   108
        if (tmp == 0)
viric@11
   109
        {
viric@11
   110
            tmp = new_BareWord();
viric@13
   111
            tmp->str = strdup(word);
viric@13
   112
            tmp->next = pos->unflexed;
viric@13
   113
            pos->unflexed = tmp;
viric@11
   114
        }
viric@11
   115
    }
viric@11
   116
}
viric@11
   117
viric@11
   118
void set_accented(struct WordEntry *pos, const char *word)
viric@11
   119
{
viric@11
   120
    if (pos->accented)
viric@11
   121
        /* Will free the first parameter */
viric@11
   122
        pos->accented->str = mix_accents(pos->accented->str, word);
viric@11
   123
    else
viric@11
   124
    {
viric@11
   125
        pos->accented = new_BareWord();
viric@11
   126
        pos->accented->str = strdup(word);
viric@11
   127
        pos->accented->next = 0;
viric@11
   128
    }
viric@11
   129
}
viric@11
   130
viric@11
   131
void insert_word(const char *word, const char *unflexed)
viric@11
   132
{
viric@11
   133
    int hash;
viric@11
   134
    unsigned char word_no_accent[MAXWORD];
viric@11
   135
    struct WordEntry *found;
viric@11
   136
    unsigned int hash_num;
viric@11
   137
viric@11
   138
    remove_accent(word_no_accent, word);
viric@16
   139
    remove_jo(word_no_accent);
viric@11
   140
viric@11
   141
    hash_num = hash_func(word_no_accent);
viric@11
   142
viric@11
   143
    /* Where to insert */
viric@11
   144
    found = does_word_exist(hash_num, word_no_accent);
viric@11
   145
    if (found)
viric@11
   146
    {
viric@11
   147
        set_accented(found, word);
viric@13
   148
        add_to_unflexed(found, unflexed);
viric@11
   149
    } else /* Does not exist */
viric@11
   150
    {
viric@11
   151
        /* new word */
viric@11
   152
        struct WordEntry *new;
viric@11
   153
viric@11
   154
        new = new_WordEntry();
viric@11
   155
        new->str = strdup(word_no_accent);
viric@11
   156
        new->unflexed = 0;
viric@11
   157
        add_to_unflexed(new, unflexed);
viric@11
   158
        new->accented = 0;
viric@11
   159
        set_accented(new, word);
viric@11
   160
        /* Put it on the head of the hash list */
viric@11
   161
        new->next = wordlist[hash_num];
viric@11
   162
        wordlist[hash_num] = new;
viric@11
   163
    }
viric@11
   164
}
viric@11
   165
viric@11
   166
static void dump_word(struct WordEntry *word)
viric@11
   167
{
viric@13
   168
    struct BareWord *tmp;
viric@13
   169
    printf(":%s:%s", word->str, word->accented->str);
viric@13
   170
viric@13
   171
    for(tmp = word->unflexed; tmp != 0; tmp = tmp->next)
viric@13
   172
    {
viric@13
   173
        printf(" %s", tmp->str);
viric@13
   174
    }
viric@13
   175
    printf("\n");
viric@11
   176
}
viric@11
   177
viric@11
   178
void dump_wordlist()
viric@11
   179
{
viric@11
   180
    int i;
viric@11
   181
    for(i=0; i < MAXHASH; ++i)
viric@11
   182
    {
viric@11
   183
        struct WordEntry *word;
viric@11
   184
        word = wordlist[i];
viric@11
   185
        while (word != 0)
viric@11
   186
        {
viric@11
   187
            if (word->str)
viric@11
   188
                dump_word(word);
viric@11
   189
            word = word->next;
viric@11
   190
        }
viric@11
   191
    }
viric@11
   192
}