zhash.c
changeset 11 68ea18fe402c
child 13 f71e89074c62
equal deleted inserted replaced
10:188a0e3b3fb4 11:68ea18fe402c
       
     #include <assert.h>
     #include <stdio.h>
     #include <stdlib.h>
     #include <string.h>

     #include "dictre.h"
       
     4 
       
     /* Number of buckets in the word hash table: 2^16 = 65536. */
     enum
     {
         MAXHASH = 1 << 16
     };
       
     9 
       
    /* Node of a singly linked list of plain strings. */
    struct BareWord
    {
        struct BareWord *next; /* following node, or 0 at the end of the list */
        char *str;             /* string carried by this node */
    };
       
    15 
       
    /*
     * One word of the hash table.  Entries that share a hash value are
     * chained through `next`.
     */
    struct WordEntry
    {
        struct WordEntry *next;    /* next entry of the same bucket, or 0 */
        char *str;                 /* lookup key; 0 in the empty terminator entry */
        struct BareWord *accented; /* accented spelling(s), or 0 when none is set */
        struct BareWord *unflexed; /* list of unflexed forms, or 0 when empty */
    };
       
    23 
       
    24 static struct WordEntry * wordlist[MAXHASH];
       
    25 
       
    26 struct WordEntry * new_WordEntry()
       
    27 {
       
    28     struct WordEntry *tmp;
       
    29     tmp =  (struct WordEntry *) malloc(sizeof(*tmp));
       
    30     assert(tmp != 0);
       
    31     return tmp;
       
    32 }
       
    33 
       
    34 struct BareWord * new_BareWord()
       
    35 {
       
    36     struct BareWord *tmp;
       
    37     tmp =  (struct BareWord *) malloc(sizeof(*tmp));
       
    38     assert(tmp != 0);
       
    39     return tmp;
       
    40 }
       
    41 
       
    42 void init_wordlist()
       
    43 {
       
    44     int i;
       
    45     for(i=0; i < MAXHASH; ++i)
       
    46     {
       
    47         struct WordEntry *nodata;
       
    48         nodata = new_WordEntry();
       
    49         assert(nodata != 0);
       
    50         nodata->str = 0;
       
    51         nodata->accented = 0;
       
    52         nodata->unflexed = 0;
       
    53         nodata->next = 0;
       
    54         wordlist[i] = nodata;
       
    55     }
       
    56 }
       
    57 
       
    /*
     * Hash a NUL-terminated byte string into the range 0..65535.
     *
     * The result is (str[1] << 8) + str[3], where a byte that lies at or
     * beyond the terminator counts as 0.  No byte past the terminating
     * NUL is ever read, so strings of zero, one or two bytes are safe.
     *
     * NOTE(review): presumably the input consists of two-byte UTF-8
     * characters, so bytes 1 and 3 are the distinguishing halves of the
     * first two characters -- confirm against the callers.
     */
    static unsigned int hash_func(const unsigned char *str)
    {
        unsigned int high = 0;
        unsigned int low = 0;

        if (str[0] != 0 && str[1] != 0)
        {
            high = str[1];
            /* str[3] exists (possibly as the terminator) only if str[2] != 0 */
            if (str[2] != 0)
                low = str[3];
        }

        /* Both halves are unsigned bytes, so the sum never exceeds 0xFFFF */
        return (high << 8) + low;
    }
       
    73 
       
    74 /* Word without accent */
       
    75 struct WordEntry * does_word_exist(int hash, const char *word)
       
    76 {
       
    77     struct WordEntry *tmp;
       
    78 
       
    79     for(tmp = wordlist[hash]; tmp != 0; tmp = tmp->next)
       
    80     {
       
    81         if (tmp->str) /* The last item in the linked list will have str=0 */
       
    82             if (strcmp(word, tmp->str) == 0)
       
    83                 return tmp;
       
    84     }
       
    85     return 0;
       
    86 }
       
    87 
       
    88 void add_to_unflexed(struct WordEntry *pos, const char *word)
       
    89 {
       
    90     struct BareWord *tmp;
       
    91 
       
    92     if (pos->unflexed == 0)
       
    93     {
       
    94         pos->unflexed = new_BareWord();
       
    95         tmp = pos->unflexed;
       
    96         tmp->str = strdup(word);
       
    97         tmp->next = 0;
       
    98     } else
       
    99     {
       
   100         /* Look for the same word */
       
   101         for(tmp = pos->unflexed; tmp != 0; tmp = tmp->next)
       
   102         {
       
   103             if (strcmp(word, pos->str) == 0)
       
   104                 break;
       
   105         }
       
   106         if (tmp == 0)
       
   107         {
       
   108             tmp = new_BareWord();
       
   109         } else
       
   110         {
       
   111             struct BareWord *new;
       
   112             new = new_BareWord();
       
   113         }
       
   114     }
       
   115 }
       
   116 
       
   117 void set_accented(struct WordEntry *pos, const char *word)
       
   118 {
       
   119     if (pos->accented)
       
   120         /* Will free the first parameter */
       
   121         pos->accented->str = mix_accents(pos->accented->str, word);
       
   122     else
       
   123     {
       
   124         pos->accented = new_BareWord();
       
   125         pos->accented->str = strdup(word);
       
   126         pos->accented->next = 0;
       
   127     }
       
   128 }
       
   129 
       
   130 void insert_word(const char *word, const char *unflexed)
       
   131 {
       
   132     int hash;
       
   133     unsigned char word_no_accent[MAXWORD];
       
   134     struct WordEntry *found;
       
   135     unsigned int hash_num;
       
   136 
       
   137     remove_accent(word_no_accent, word);
       
   138 
       
   139     hash_num = hash_func(word_no_accent);
       
   140 
       
   141     /* Where to insert */
       
   142     found = does_word_exist(hash_num, word_no_accent);
       
   143     if (found)
       
   144     {
       
   145         set_accented(found, word);
       
   146         /* TODO process word_no_accent */
       
   147     } else /* Does not exist */
       
   148     {
       
   149         /* new word */
       
   150         struct WordEntry *new;
       
   151 
       
   152         new = new_WordEntry();
       
   153         new->str = strdup(word_no_accent);
       
   154         new->unflexed = 0;
       
   155         add_to_unflexed(new, unflexed);
       
   156         new->accented = 0;
       
   157         set_accented(new, word);
       
   158         /* Put it on the head of the hash list */
       
   159         new->next = wordlist[hash_num];
       
   160         wordlist[hash_num] = new;
       
   161     }
       
   162 }
       
   163 
       
   164 static void dump_word(struct WordEntry *word)
       
   165 {
       
   166     printf("%s:%s\n", word->str, word->accented->str);
       
   167 }
       
   168 
       
   169 void dump_wordlist()
       
   170 {
       
   171     int i;
       
   172     for(i=0; i < MAXHASH; ++i)
       
   173     {
       
   174         struct WordEntry *word;
       
   175         word = wordlist[i];
       
   176         while (word != 0)
       
   177         {
       
   178             if (word->str)
       
   179                 dump_word(word);
       
   180             word = word->next;
       
   181         }
       
   182     }
       
   183 }