Adding code for the zprocess, for processing the Zaliznjak dictionary.
author viric@llimona
Tue, 28 Aug 2007 01:03:24 +0200
changeset 11 68ea18fe402c
parent 10 188a0e3b3fb4
child 12 c755c945a96a
Adding code for the zprocess, for processing the Zaliznjak dictionary.
zdefs.c
zhash.c
zload.c
zrus.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/zdefs.c	Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,80 @@
+#include <stdio.h>
+#include "dictre.h"
+
+/* Debug helper: write one word, followed by a newline, to stdout. */
+static void new_word(const char *str)
+{
+    puts(str);
+}
+
+/*
+ * Advance *index through str until it rests on a '\n' or on the
+ * terminating NUL.
+ *
+ * Returns the position of the '\n' found, or -1 when the end of the
+ * string was reached first.  *index is updated in both cases.
+ */
+static int skip_newline(const char *str, int *index)
+{
+    int pos = *index;
+
+    for (; str[pos] != 0; ++pos)
+    {
+        if (str[pos] == '\n')
+            break;
+    }
+    *index = pos;
+
+    return (str[pos] == '\n') ? pos : -1;
+}
+
+/*
+ * Advance *index to the first word delimiter at or after its current
+ * position.  Delimiters are space, '\n', '\r', ',' and the terminating
+ * NUL.
+ *
+ * Returns the delimiter position, or -1 when the delimiter is the NUL.
+ * *index is updated in both cases.
+ */
+static int until_noword(const char *str, int *index)
+{
+    int pos = *index;
+    int scanning = 1;
+
+    while (scanning)
+    {
+        switch (str[pos])
+        {
+            case 0:
+            case ' ':
+            case '\n':
+            case '\r':
+            case ',':
+                scanning = 0;
+                break;
+            default:
+                ++pos;
+                break;
+        }
+    }
+    *index = pos;
+
+    if (str[pos] == 0)
+        return -1;
+    return pos;
+}
+
+/*
+ * Advance *index past ASCII bytes (values below 128) until it rests on
+ * a byte >= 128 or on the terminating NUL.
+ *
+ * Returns the position of the first byte >= 128, or -1 when the end of
+ * the string was reached first.  *index is updated in both cases.
+ */
+static int until_newword(const unsigned char *str, int *index)
+{
+    while(str[*index] != 0 && str[*index] < 128)
+    {
+        ++*index;
+    }
+
+    /* No ';' after the condition: the -1 below must stay reachable. */
+    if (str[*index] != 0)
+        return *index;
+
+    return -1;
+}
+
+/*
+ * Extract the words of one dictionary definition and register each of
+ * them with insert_word(), using root as the unflexed form.
+ *
+ * The first line of def (the index word) is skipped.  After that, every
+ * run starting at a byte >= 128 and ending at a space, '\n', '\r', ','
+ * or the end of the string is treated as one word.  def is modified in
+ * place: the delimiter after each word is overwritten with NUL.
+ *
+ * Ownership: both def and root are released with free() before
+ * returning, on every path.
+ */
+void zprocess_def(const char *root, char *def)
+{
+    int index = 0;
+
+    /* Jump the first line (index word).  Without a '\n' there is no
+     * body to scan, and stepping past the NUL would read out of bounds. */
+    if (skip_newline(def, &index) != -1)
+    {
+        /* Step over the '\n' itself. */
+        ++index;
+
+        /* Mark words */
+        while (until_newword((const unsigned char *) def, &index) != -1)
+        {
+            int end = index;
+            int res = until_noword(def, &end);
+
+            /* Terminate the word in place.  When res is -1 the word
+             * runs up to the end of the string and def[end] is
+             * already NUL. */
+            def[end] = 0;
+            insert_word(&def[index], root);
+            if (res == -1)
+                break;
+            index = end + 1;
+        }
+    }
+
+    free(def);
+    /* The parameter is const for the callee's benefit only; the
+     * storage is released here. */
+    free((void *) root);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/zhash.c	Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,183 @@
+#include <stdio.h>
+#include <assert.h>
+#include "dictre.h"
+
+/* MAXHASH: number of buckets in the wordlist hash table (1<<16 = 65536). */
+enum
+{
+    MAXHASH=1<<16
+};
+
+/* Node of a singly linked list of strings; used for the accented and
+ * unflexed lists hanging off a WordEntry. */
+struct BareWord
+{
+    struct BareWord *next;  /* Next node, or 0 at the end of the list */
+    char *str;              /* The word; filled with strdup() copies */
+};
+
+/* One entry in a hash bucket chain of wordlist. */
+struct WordEntry
+{
+    struct WordEntry *next;      /* Next entry in the same bucket, or 0 */
+    char *str;                   /* Word with accents removed; 0 in the
+                                    placeholder entry that ends each chain */
+    struct BareWord *accented;   /* Accented spelling(s) of str */
+    struct BareWord *unflexed;   /* Unflexed word(s) this form belongs to */
+};
+
+/* Hash table: one chain of WordEntry per bucket, indexed by the value
+ * of hash_func().  init_wordlist() fills every bucket with a
+ * placeholder entry. */
+static struct WordEntry * wordlist[MAXHASH];
+
+/*
+ * Allocate one WordEntry on the heap.  The fields are left
+ * uninitialized; the caller must fill them in.  A failed allocation
+ * trips the assert.
+ */
+struct WordEntry * new_WordEntry()
+{
+    struct WordEntry *entry = (struct WordEntry *) malloc(sizeof(struct WordEntry));
+
+    assert(entry != 0);
+    return entry;
+}
+
+/*
+ * Allocate one BareWord list node on the heap.  The fields are left
+ * uninitialized; the caller must fill them in.  A failed allocation
+ * trips the assert.
+ */
+struct BareWord * new_BareWord()
+{
+    struct BareWord *node = (struct BareWord *) malloc(sizeof(struct BareWord));
+
+    assert(node != 0);
+    return node;
+}
+
+/*
+ * Put an empty placeholder entry (all fields 0) in every bucket of
+ * wordlist.  Real entries are later pushed in front of it, so the
+ * placeholder always ends the chain.
+ */
+void init_wordlist()
+{
+    int bucket = 0;
+
+    while (bucket < MAXHASH)
+    {
+        struct WordEntry *sentinel = new_WordEntry();
+
+        assert(sentinel != 0);
+        sentinel->next = 0;
+        sentinel->str = 0;
+        sentinel->accented = 0;
+        sentinel->unflexed = 0;
+        wordlist[bucket] = sentinel;
+        ++bucket;
+    }
+}
+
+/*
+ * Compute the bucket number for a word.
+ *
+ * The hash combines byte 1 (high 8 bits) and byte 3 (low 8 bits) of the
+ * string, so the result always fits in 16 bits.  Bytes beyond the
+ * terminating NUL are never read: a string too short to have the byte
+ * in question contributes 0 for it.
+ */
+static unsigned int hash_func(const unsigned char *str)
+{
+    unsigned int high = 0;
+    unsigned int low = 0;
+
+    /* Taking only the meaningful utf-8 codes */
+    if (str[0] != 0)
+    {
+        high = str[1];
+        /* str[3] exists (possibly as the NUL) only if str[1] and
+         * str[2] are both non-zero. */
+        if (str[1] != 0 && str[2] != 0)
+            low = str[3];
+    }
+
+    /* Unsigned arithmetic throughout: bytes >= 0x80 must not turn
+     * into negative values and push the result out of range. */
+    return (high << 8) + low;
+}
+
+/* Word without accent */
+/*
+ * Look up an accent-free word in bucket `hash` of wordlist.
+ * Returns the matching entry, or 0 when the word is not present.
+ */
+struct WordEntry * does_word_exist(int hash, const char *word)
+{
+    struct WordEntry *cur = wordlist[hash];
+
+    while (cur != 0)
+    {
+        /* The placeholder at the end of the chain has str == 0 */
+        if (cur->str != 0 && strcmp(word, cur->str) == 0)
+            return cur;
+        cur = cur->next;
+    }
+    return 0;
+}
+
+/*
+ * Add `word` to the unflexed list of `pos`, unless an equal string is
+ * already in the list.  The word is copied with strdup(); the new node
+ * is put at the head of the list.
+ */
+void add_to_unflexed(struct WordEntry *pos, const char *word)
+{
+    struct BareWord *tmp;
+    struct BareWord *node;
+
+    /* Look for the same word among the list nodes themselves */
+    for(tmp = pos->unflexed; tmp != 0; tmp = tmp->next)
+    {
+        if (strcmp(word, tmp->str) == 0)
+            return;
+    }
+
+    node = new_BareWord();
+    node->str = strdup(word);
+    node->next = pos->unflexed;
+    pos->unflexed = node;
+}
+
+/*
+ * Record the accented spelling `word` on entry `pos`.
+ *
+ * The first spelling seen is stored as a strdup() copy.  Later
+ * spellings are merged into the stored one with mix_accents().
+ */
+void set_accented(struct WordEntry *pos, const char *word)
+{
+    struct BareWord *node;
+
+    if (!pos->accented)
+    {
+        node = new_BareWord();
+        node->str = strdup(word);
+        node->next = 0;
+        pos->accented = node;
+        return;
+    }
+
+    /* mix_accents() frees its first argument and returns a new string */
+    pos->accented->str = mix_accents(pos->accented->str, word);
+}
+
+/*
+ * Record one (possibly accented) word form in the hash table.
+ *
+ * word     - the form as found in the definition, accents included.
+ * unflexed - the unflexed word this form belongs to.
+ *
+ * The table is keyed on the accent-free spelling of `word`.  When that
+ * key already exists, only its accented spelling is updated.
+ * Otherwise a new entry is created and pushed at the head of its
+ * bucket.  Both strings are copied; the caller keeps ownership of its
+ * arguments.  Words that do not fit in MAXWORD bytes are reported on
+ * stderr and ignored.
+ */
+void insert_word(const char *word, const char *unflexed)
+{
+    unsigned char word_no_accent[MAXWORD];
+    struct WordEntry *found;
+    unsigned int hash_num;
+
+    /* remove_accent() never writes more bytes than it reads, so
+     * checking the input length is enough to protect the buffer. */
+    if (strlen(word) >= sizeof(word_no_accent))
+    {
+        fprintf(stderr, "Word too long, ignored: %s\n", word);
+        return;
+    }
+
+    remove_accent(word_no_accent, (const unsigned char *) word);
+
+    hash_num = hash_func(word_no_accent);
+
+    /* Where to insert */
+    found = does_word_exist(hash_num, (const char *) word_no_accent);
+    if (found)
+    {
+        set_accented(found, word);
+        /* TODO process word_no_accent */
+    } else /* Does not exist */
+    {
+        /* new word */
+        struct WordEntry *entry;
+
+        entry = new_WordEntry();
+        entry->str = strdup((const char *) word_no_accent);
+        entry->unflexed = 0;
+        add_to_unflexed(entry, unflexed);
+        entry->accented = 0;
+        set_accented(entry, word);
+        /* Put it on the head of the hash list */
+        entry->next = wordlist[hash_num];
+        wordlist[hash_num] = entry;
+    }
+}
+
+/*
+ * Print one entry on stdout as "<plain>:<accented>".  The entry must
+ * have a non-null accented list; no check is made here.
+ */
+static void dump_word(struct WordEntry *word)
+{
+    const char *plain = word->str;
+    const char *accented = word->accented->str;
+
+    printf("%s:%s\n", plain, accented);
+}
+
+/*
+ * Print every real entry of the hash table, bucket by bucket.
+ * Placeholder entries (str == 0) are skipped.
+ */
+void dump_wordlist()
+{
+    int bucket;
+    struct WordEntry *cur;
+
+    for (bucket = 0; bucket < MAXHASH; ++bucket)
+    {
+        for (cur = wordlist[bucket]; cur != 0; cur = cur->next)
+        {
+            if (!cur->str)
+                continue;
+            dump_word(cur);
+        }
+    }
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/zload.c	Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <sys/stat.h>
+#include "dictre.h"
+
+/* Input streams opened by open_files(): the .index file and the
+ * (possibly gunzipped) .dict file. */
+static FILE *index, *dict;
+/* Non-zero when /tmp/tmp.dict was created and must be removed on exit.
+ * The type is spelled out: implicit int is not valid since C99. */
+static int remove_tmp_file = 0;
+
+/*
+ * Debug helper: print a word and its definition on stdout.
+ * Declared void explicitly; it never returned a value, and implicit
+ * int is not valid since C99.
+ */
+static void new_word(const char *w, const char *defstr)
+{
+    printf("'%s': '%s'\n", w, defstr);
+}
+
+/*
+ * Read every entry of the index file and feed its definition to
+ * zprocess_def().
+ *
+ * index - stream positioned at the start of the .index file.
+ * fdefs - stream over the definitions (.dict) file.
+ *
+ * Entries whose word starts with "00database" are skipped.  A progress
+ * line is written to stderr every 1000 entries; the counters are static
+ * and therefore accumulate across calls.
+ */
+void zload_words(FILE *index, FILE *fdefs)
+{
+    int def_avoided = 0;
+    static int dispnwords = 0;
+    static int nwords = 0;
+
+    do {
+        int offset, length;
+        char *defstr;
+        char *word;
+        word = get_word(index);
+        if (word == 0)
+            break;
+        offset = get_int(index);
+        length = get_int(index);
+        defstr = get_def(fdefs, offset, length);
+
+        /* sizeof -1  instead of strlen() */
+        /* If the word is not 00database* ... */
+        if (strncmp(word, "00database", sizeof("00database") - 1) != 0)
+            zprocess_def(word, defstr);
+        else
+        {
+            /* zprocess_def() free()s both strings when it is called,
+             * so release them here when it is not. */
+            free(defstr);
+            free(word);
+        }
+
+        /* stdout Display */
+        dispnwords++;
+        nwords++;
+        if (dispnwords >= 1000)
+        {
+            dispnwords = 0;
+            fprintf(stderr,
+                    "Loaded: %i Repeated definitions avoided: %i\n", nwords,
+                    def_avoided);
+        }
+
+    } while(1);
+}
+
+/*
+ * Close both input streams and, when the dictionary had to be
+ * gunzipped, remove the temporary /tmp/tmp.dict.
+ */
+static void close_files()
+{
+    fclose(index);
+    fclose(dict);
+
+    if (!remove_tmp_file)
+        return;
+    unlink("/tmp/tmp.dict");
+}
+
+/*
+ * Write "<base><ext>" into dst (capacity size).  Exits with an error
+ * message when the result does not fit.
+ */
+static void build_name(char *dst, size_t size, const char *base,
+        const char *ext)
+{
+    int len = snprintf(dst, size, "%s%s", base, ext);
+
+    if (len < 0 || (size_t) len >= size)
+    {
+        fprintf(stderr, "File name too long: %s%s\n", base, ext);
+        exit(-1);
+    }
+}
+
+/*
+ * Open <basename>.index and <basename>.dict, where <basename> is
+ * argv[1], into the file-scope streams `index` and `dict`.
+ *
+ * When the .dict file cannot be opened, <basename>.dict.dz is
+ * decompressed with gzip into /tmp/tmp.dict and that file is opened
+ * instead; remove_tmp_file is then set so close_files() deletes it.
+ * Any failure prints a message on stderr and exits.
+ */
+static void open_files(int argn, char **argv)
+{
+    char tmpname[500];
+    if (argn < 2)
+    {
+        fprintf(stderr, "usage: %s <dict_basename>\n", argv[0]);
+        exit(1);
+    }
+    build_name(tmpname, sizeof(tmpname), argv[1], ".index");
+    index = fopen(tmpname, "r");
+    if(index == NULL)
+    {
+        fprintf(stderr, "File: %s ", tmpname);
+        perror("- cannot open file.");
+        exit(-1);
+    }
+
+    build_name(tmpname, sizeof(tmpname), argv[1], ".dict");
+    dict = fopen(tmpname, "r");
+    if(dict == NULL)
+    {
+        struct stat st;
+        int res;
+        /* Large enough for the fixed command text plus any tmpname */
+        char tmp[sizeof(tmpname) + 64];
+        build_name(tmpname, sizeof(tmpname), argv[1], ".dict.dz");
+        res = stat(tmpname, &st);
+        if (res == -1)
+        {
+            fprintf(stderr, "File: %s ", tmpname);
+            perror("- cannot open file.");
+            exit(-1);
+        }
+        /* NOTE(review): the file name is interpolated unquoted into a
+         * shell command, and /tmp/tmp.dict is a fixed, predictable
+         * path.  Kept as-is; worth revisiting. */
+        snprintf(tmp, sizeof(tmp), "gzip -cd %s > /tmp/tmp.dict",
+                tmpname);
+        printf("Gunzipping...\n");
+        res = system(tmp);
+        dict = fopen("/tmp/tmp.dict", "r");
+        if(dict == NULL || res != 0)
+        {
+            fprintf(stderr, "Error gunzipping file: %s ", tmpname);
+            perror("- something happened to /tmp/tmp.dict.");
+            exit(-1);
+        }
+        remove_tmp_file = 1;
+    }
+}
+
+/*
+ * Entry point: open the dictionary files named by argv[1], load every
+ * word into the hash table, dump the table on stdout and clean up.
+ * Returns 0 on success; failures exit from inside open_files().
+ */
+int main(int argn, char **argv)
+{
+    open_files(argn, argv);
+    init_wordlist();
+    zload_words(index, dict);
+    dump_wordlist();
+    close_files();
+    return 0;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/zrus.c	Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include "dictre.h"
+
+/* Returns 1 when tmp starts with the byte pair 0xcc 0x81, else 0. */
+static int closed_accent(const unsigned char *tmp)
+{
+    return (tmp[0] == 0xcc && tmp[1] == 0x81) ? 1 : 0;
+}
+
+/* Returns 1 when tmp starts with the byte 0x60 (a backquote), else 0. */
+static int open_accent(const unsigned char *tmp)
+{
+    return (tmp[0] == 0x60) ? 1 : 0;
+}
+
+/*
+ * Merge the accent marks of two spellings of the same word.
+ *
+ * a and b are walked in parallel.  An accent mark (the pair 0xcc 0x81,
+ * or the byte 0x60) present in either string is copied to the output;
+ * when both strings carry the same kind of mark at the current
+ * position it is copied once.  Any other byte is taken from a, or from
+ * b once a is exhausted.
+ *
+ * Ownership: a is released with free() and a newly allocated string is
+ * returned.  If the allocation fails, a is returned untouched (and not
+ * freed), so the caller's pointer stays valid either way.
+ */
+char * mix_accents(char *a, const char *b)
+{
+    const unsigned char *ua = (const unsigned char *) a;
+    const unsigned char *ub = (const unsigned char *) b;
+    size_t ia = 0;
+    size_t ib = 0;
+    size_t o = 0;
+    char *out;
+
+    /* Every output byte corresponds to at least one byte consumed from
+     * a or b, so the sum of the lengths bounds the result. */
+    out = (char *) malloc(strlen(a) + strlen(b) + 1);
+    if (out == 0)
+        return a;
+
+    while(ua[ia] != 0 || ub[ib] != 0)
+    {
+        if (closed_accent(&ua[ia]))
+        {
+            out[o++] = a[ia++];
+            out[o++] = a[ia++];
+            if(closed_accent(&ub[ib]))
+                ib+=2;
+        } else if (closed_accent(&ub[ib]))
+        {
+            out[o++] = b[ib++];
+            out[o++] = b[ib++];
+        } else if (open_accent(&ua[ia]))
+        {
+            /* The mark comes from a, indexed by ia */
+            out[o++] = a[ia++];
+            if (open_accent(&ub[ib]))
+                ++ib;
+        } else if (open_accent(&ub[ib]))
+        {
+            out[o++] = b[ib++];
+        }
+        else
+        {
+            /* Letter: fall back to b once a has ended, so that no NUL
+             * is written into the middle of the result. */
+            out[o++] = (a[ia] != 0) ? a[ia] : b[ib];
+            if (a[ia] != 0)
+                ++ia;
+            if (b[ib] != 0)
+                ++ib;
+        }
+    }
+    out[o] = 0;
+    free(a);
+    return out;
+}
+
+/*
+ * Copy `from` into `dest`, dropping every 0xcc 0x81 byte pair and every
+ * 0x60 byte.  The result is NUL-terminated and never longer than the
+ * input; dest must have room for strlen(from) + 1 bytes.
+ */
+void remove_accent(unsigned char *dest, const unsigned char *from)
+{
+    const unsigned char *src = from;
+    unsigned char *out = dest;
+
+    while (*src != 0)
+    {
+        if (src[0] == 0xcc && src[1] == 0x81)
+        {
+            src += 2;
+            continue;
+        }
+        if (*src != 0x60)
+            *out++ = *src;
+        ++src;
+    }
+    *out = 0;
+}