# HG changeset patch
# User viric@llimona
# Date 1188255804 -7200
# Node ID 68ea18fe402c6cff2374fecec884335bf4c61447
# Parent  188a0e3b3fb42a8e92763459165f374e7c0683b8
Adding code for the zprocess, for processing the Zaliznjak dictionary.

diff -r 188a0e3b3fb4 -r 68ea18fe402c zdefs.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/zdefs.c	Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,91 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include "dictre.h"
+
+/* Debug helper: print one extracted word per line. */
+static void new_word(const char *str)
+{
+    printf("%s\n", str);
+}
+
+/* Advance *index to the next '\n' in str.
+ * Returns that index, or -1 when the string ends first (*index is then
+ * left on the terminating NUL). */
+static int skip_newline(const char *str, int *index)
+{
+    while (str[*index] != 0 && str[*index] != '\n')
+        ++*index;
+
+    if (str[*index] == '\n')
+        return *index;
+
+    return -1;
+}
+
+/* Advance *index to the first word delimiter (space, newline, carriage
+ * return or comma). Returns that index, or -1 when the string ends first. */
+static int until_noword(const char *str, int *index)
+{
+    while (str[*index] != 0 &&
+            str[*index] != ' ' &&
+            str[*index] != '\n' &&
+            str[*index] != '\r' &&
+            str[*index] != ',')
+        ++*index;
+
+    if (str[*index] != 0)
+        return *index;
+
+    return -1;
+}
+
+/* Advance *index to the first byte >= 128, i.e. the first non-ASCII byte,
+ * which is where a word is taken to start. Returns that index, or -1 when
+ * the string ends first. */
+static int until_newword(const char *str, int *index)
+{
+    while (str[*index] != 0 && (unsigned char) str[*index] < 128)
+        ++*index;
+
+    if (str[*index] != 0)
+        return *index;
+
+    return -1;
+}
+
+/* Extract every word from the body of a definition and register it with
+ * insert_word() under the index word 'root'.
+ *
+ * def is modified in place (delimiters are overwritten with NUL).
+ * Both def and root are released with free() before returning, on every
+ * path, so the caller must pass heap pointers and must not reuse them. */
+void zprocess_def(const char *root, char *def)
+{
+    int index = 0;
+
+    /* Jump the first line (index word). Without a '\n' there is no body,
+     * and stepping past the NUL would read out of bounds. */
+    if (skip_newline(def, &index) == -1)
+        goto out;
+    ++index;
+
+    /* Mark words */
+    while (until_newword(def, &index) != -1)
+    {
+        int end = index;
+        int at_end = (until_noword(def, &end) == -1);
+
+        /* A word running up to the end of the string is already
+         * NUL-terminated; otherwise cut it at the delimiter. */
+        if (!at_end)
+            def[end] = 0;
+        insert_word(&def[index], root);
+        if (at_end)
+            break;
+        index = end + 1;
+    }
+
+out:
+    free(def);
+    free((void *) root);
+}
diff -r 188a0e3b3fb4 -r 68ea18fe402c zhash.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/zhash.c	Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,202 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "dictre.h"
+
+enum
+{
+    MAXHASH=1<<16
+};
+
+/* Node of a singly linked list of strings. */
+struct BareWord
+{
+    struct BareWord *next;
+    char *str;
+};
+
+/* One hash-table entry, keyed by the word without accent marks. */
+struct WordEntry
+{
+    struct WordEntry *next;
+    char *str;
+    struct BareWord *accented;
+    struct BareWord *unflexed;
+};
+
+static struct WordEntry * wordlist[MAXHASH];
+
+/* Pass a fresh allocation through, stopping the program if it failed.
+ * An assert() alone would be compiled out under NDEBUG. */
+static void *check_alloc(void *ptr)
+{
+    if (ptr == 0)
+    {
+        perror("out of memory");
+        abort();
+    }
+    return ptr;
+}
+
+/* Allocate a zero-filled WordEntry. Never returns 0. */
+struct WordEntry * new_WordEntry()
+{
+    return check_alloc(calloc(1, sizeof(struct WordEntry)));
+}
+
+/* Allocate a zero-filled BareWord. Never returns 0. */
+struct BareWord * new_BareWord()
+{
+    return check_alloc(calloc(1, sizeof(struct BareWord)));
+}
+
+/* Put an empty sentinel entry (str == 0) in every bucket. */
+void init_wordlist()
+{
+    int i;
+    for(i=0; i < MAXHASH; ++i)
+        wordlist[i] = new_WordEntry();
+}
+
+/* Hash built from bytes 1 and 3 of the word.
+ * Always returns a value in [0, MAXHASH). */
+static unsigned int hash_func(const unsigned char *str)
+{
+    unsigned int hi = 0;
+    unsigned int lo = 0;
+
+    /* Taking only the meaningful utf-8 codes. Each byte is read only after
+     * the previous ones proved not to be the terminator, and as unsigned,
+     * so short words cannot read past the end or yield a negative hash. */
+    if (str[0] != 0)
+    {
+        hi = str[1];
+        if (str[1] != 0 && str[2] != 0)
+            lo = str[3];
+    }
+
+    return ((hi << 8) + lo) % MAXHASH;
+}
+
+/* Word without accent.
+ * Returns the entry for 'word' in bucket 'hash', or 0 if there is none. */
+struct WordEntry * does_word_exist(int hash, const char *word)
+{
+    struct WordEntry *tmp;
+
+    if (hash < 0 || hash >= MAXHASH)
+        return 0;
+
+    for(tmp = wordlist[hash]; tmp != 0; tmp = tmp->next)
+    {
+        /* The last item in the linked list will have str=0 */
+        if (tmp->str != 0 && strcmp(word, tmp->str) == 0)
+            return tmp;
+    }
+    return 0;
+}
+
+/* strdup() that never returns 0. */
+static char *dup_str(const char *str)
+{
+    return check_alloc(strdup(str));
+}
+
+/* Record 'word' in pos->unflexed unless it is already there. */
+void add_to_unflexed(struct WordEntry *pos, const char *word)
+{
+    struct BareWord *tmp;
+
+    /* Look for the same word */
+    for(tmp = pos->unflexed; tmp != 0; tmp = tmp->next)
+    {
+        if (strcmp(word, tmp->str) == 0)
+            return;
+    }
+
+    tmp = new_BareWord();
+    tmp->str = dup_str(word);
+    tmp->next = pos->unflexed;
+    pos->unflexed = tmp;
+}
+
+/* Merge the accent marks of 'word' into pos->accented, creating it on
+ * first use. */
+void set_accented(struct WordEntry *pos, const char *word)
+{
+    if (pos->accented)
+        /* Will free the first parameter */
+        pos->accented->str = mix_accents(pos->accented->str, word);
+    else
+    {
+        pos->accented = new_BareWord();
+        pos->accented->str = dup_str(word);
+        pos->accented->next = 0;
+    }
+}
+
+/* Register 'word' (possibly carrying accent marks) as a form of the index
+ * word 'unflexed'. Both strings are copied; the caller keeps ownership.
+ * Words of MAXWORD bytes or more are skipped, because the fixed-size
+ * buffers used for the accent handling could not hold them. */
+void insert_word(const char *word, const char *unflexed)
+{
+    unsigned char word_no_accent[MAXWORD];
+    struct WordEntry *found;
+    unsigned int hash_num;
+
+    /* Removing accents never lengthens the word, so this bounds the copy */
+    if (strlen(word) >= (size_t) MAXWORD)
+    {
+        fprintf(stderr, "Word too long, skipped: %s\n", word);
+        return;
+    }
+
+    remove_accent(word_no_accent, (const unsigned char *) word);
+
+    hash_num = hash_func(word_no_accent);
+
+    /* Where to insert */
+    found = does_word_exist(hash_num, (const char *) word_no_accent);
+    if (found)
+    {
+        set_accented(found, word);
+        /* TODO process word_no_accent */
+    } else /* Does not exist */
+    {
+        /* new word */
+        struct WordEntry *new;
+
+        new = new_WordEntry();
+        new->str = dup_str((const char *) word_no_accent);
+        new->unflexed = 0;
+        add_to_unflexed(new, unflexed);
+        new->accented = 0;
+        set_accented(new, word);
+        /* Put it on the head of the hash list */
+        new->next = wordlist[hash_num];
+        wordlist[hash_num] = new;
+    }
+}
+
+/* Print one entry as "unaccented:accented". */
+static void dump_word(struct WordEntry *word)
+{
+    printf("%s:%s\n", word->str,
+            word->accented ? word->accented->str : "");
+}
+
+/* Print every stored word to stdout, bucket by bucket. */
+void dump_wordlist()
+{
+    int i;
+    for(i=0; i < MAXHASH; ++i)
+    {
+        struct WordEntry *word;
+        for(word = wordlist[i]; word != 0; word = word->next)
+        {
+            if (word->str)
+                dump_word(word);
+        }
+    }
+}
diff -r 188a0e3b3fb4 -r 68ea18fe402c zload.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/zload.c	Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,148 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include "dictre.h"
+
+/* Where a compressed dictionary (.dict.dz) is unpacked to. */
+#define TMP_DICT "/tmp/tmp.dict"
+
+/* Named index_file/dict_file because <string.h> may declare index(). */
+static FILE *index_file, *dict_file;
+static int remove_tmp_file = 0;
+
+/* Debug helper: print a word and its definition. */
+static void new_word(const char *w, const char *defstr)
+{
+    printf("'%s': '%s'\n", w, defstr);
+}
+
+/* Read every (word, offset, length) record from 'index', fetch the
+ * definition from 'fdefs' and hand both to zprocess_def(), except for the
+ * "00database*" entries. Progress goes to stderr every 1000 words. */
+void zload_words(FILE *index, FILE *fdefs)
+{
+    int def_avoided = 0;
+    static int dispnwords = 0;
+    static int nwords = 0;
+
+    do {
+        int offset, length;
+        char *defstr;
+        char *word;
+        word = get_word(index);
+        if (word == 0)
+            break;
+        offset = get_int(index);
+        length = get_int(index);
+        defstr = get_def(fdefs, offset, length);
+
+        /* sizeof -1 instead of strlen() */
+        /* If the word is not 00database* ... */
+        if (defstr != 0 &&
+                strncmp(word, "00database", sizeof("00database") - 1) != 0)
+            zprocess_def(word, defstr); /* frees word and defstr */
+        else
+        {
+            /* Nobody else releases them on this path */
+            free(defstr);
+            free(word);
+        }
+
+        /* stderr Display */
+        dispnwords++;
+        nwords++;
+        if (dispnwords >= 1000)
+        {
+            dispnwords = 0;
+            fprintf(stderr,
+                    "Loaded: %i Repeated definitions avoided: %i\n", nwords,
+                    def_avoided);
+        }
+
+    } while(1);
+}
+
+static void close_files()
+{
+    fclose(index_file);
+    fclose(dict_file);
+
+    if (remove_tmp_file)
+        unlink(TMP_DICT);
+}
+
+/* Write base followed by suffix into dst; exits if it does not fit. */
+static void build_name(char *dst, size_t size, const char *base,
+        const char *suffix)
+{
+    int n = snprintf(dst, size, "%s%s", base, suffix);
+    if (n < 0 || (size_t) n >= size)
+    {
+        fprintf(stderr, "File name too long: %s%s\n", base, suffix);
+        exit(-1);
+    }
+}
+
+/* Open <argv[1]>.index and <argv[1]>.dict. When the .dict is missing,
+ * <argv[1]>.dict.dz is gunzipped to TMP_DICT and that copy is opened.
+ * Exits the process on any failure. */
+static void open_files(int argn, char **argv)
+{
+    char tmpname[500];
+    if (argn < 2)
+    {
+        fprintf(stderr, "usage: %s <dict_basename>\n", argv[0]);
+        exit(1);
+    }
+    build_name(tmpname, sizeof tmpname, argv[1], ".index");
+    index_file = fopen(tmpname, "r");
+    if(index_file == NULL)
+    {
+        fprintf(stderr, "File: %s ", tmpname);
+        perror("- cannot open file.");
+        exit(-1);
+    }
+
+    build_name(tmpname, sizeof tmpname, argv[1], ".dict");
+    dict_file = fopen(tmpname, "r");
+    if(dict_file == NULL)
+    {
+        struct stat st;
+        int res;
+        /* Large enough for the command text plus a full tmpname */
+        char tmp[sizeof tmpname + 64];
+        build_name(tmpname, sizeof tmpname, argv[1], ".dict.dz");
+        res = stat(tmpname, &st);
+        if (res == -1)
+        {
+            fprintf(stderr, "File: %s ", tmpname);
+            perror("- cannot open file.");
+            exit(-1);
+        }
+        /* NOTE: tmpname goes through the shell unquoted, so names with
+         * spaces or shell metacharacters are not supported. */
+        snprintf(tmp, sizeof tmp, "gzip -cd %s > " TMP_DICT, tmpname);
+        printf("Gunzipping...\n");
+        res = system(tmp);
+        dict_file = fopen(TMP_DICT, "r");
+        if(dict_file == NULL || res != 0)
+        {
+            fprintf(stderr, "Error gunzipping file: %s ", tmpname);
+            perror("- something happened to " TMP_DICT ".");
+            exit(-1);
+        }
+        remove_tmp_file = 1;
+    }
+}
+
+int main(int argn, char **argv)
+{
+    open_files(argn, argv);
+    init_wordlist();
+    zload_words(index_file, dict_file);
+    dump_wordlist();
+    close_files();
+    return 0;
+}
diff -r 188a0e3b3fb4 -r 68ea18fe402c zrus.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/zrus.c	Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,110 @@
+#include <stdlib.h>
+#include <string.h>
+#include "dictre.h"
+
+/* 1 if tmp starts with the byte pair cc 81 (UTF-8 for U+0301). */
+static int closed_accent(const char *tmp)
+{
+    /* tmp[1] is only read when tmp[0] is not the terminator */
+    if ((unsigned char) tmp[0] == 0xcc && (unsigned char) tmp[1] == 0x81)
+        return 1;
+    return 0;
+}
+
+/* 1 if tmp starts with the byte 0x60 ('`'). */
+static int open_accent(const char *tmp)
+{
+    if (tmp[0] == 0x60)
+        return 1;
+    return 0;
+}
+
+/* Merge the accent marks of 'a' and 'b', which are expected to be the same
+ * word apart from those marks.
+ *
+ * 'a' must be a heap string: it is freed here and superseded by the newly
+ * allocated return value. 'b' is only read. The result is cut short if it
+ * would not fit in MAXWORD bytes. If the copy cannot be allocated, 'a'
+ * itself is returned, untouched. */
+char * mix_accents(char *a, const char *b)
+{
+    int ia,ib,o;
+    char *out;
+    char tmp[MAXWORD];
+
+    ia = 0;
+    ib = 0;
+    o = 0;
+    /* Each round stores at most two bytes; keep one free for the NUL */
+    while((a[ia] != 0 || b[ib] != 0) && o < MAXWORD - 2)
+    {
+        if (closed_accent(&a[ia]))
+        {
+            tmp[o] = a[ia];
+            tmp[o+1] = a[ia+1];
+            o+=2;
+            ia+=2;
+            if(closed_accent(&b[ib]))
+                ib+=2;
+        } else if (closed_accent(&b[ib]))
+        {
+            tmp[o] = b[ib];
+            tmp[o+1] = b[ib+1];
+            o+=2;
+            ib+=2;
+        } else if (open_accent(&a[ia]))
+        {
+            tmp[o] = a[ia];
+            o+=1;
+            ia+=1;
+            if (open_accent(&b[ib]))
+                ib+=1;
+        } else if (open_accent(&b[ib]))
+        {
+            tmp[o] = b[ib];
+            o+=1;
+            ib+=1;
+        }
+        else
+        {
+            /* Letter. Take it from whichever string still has one, so
+             * that no NUL lands in the middle of the result. */
+            tmp[o] = (a[ia] != 0) ? a[ia] : b[ib];
+            if (a[ia] != 0)
+                ++ia;
+            if (b[ib] != 0)
+                ++ib;
+            ++o;
+        }
+    }
+    tmp[o] = 0;
+    out = strdup(tmp);
+    if (out == 0)
+        return a;
+    free(a);
+    return out;
+}
+
+/* Copy 'from' into 'dest' without the accent marks (cc 81 and 0x60).
+ * 'dest' needs room for strlen(from) + 1 bytes; no size is checked here. */
+void remove_accent(unsigned char *dest, const unsigned char *from)
+{
+    int i,o;
+
+    i = 0;
+    o = 0;
+    while (from[i] != 0)
+    {
+        if (from[i] == 0xcc && from[i+1] == 0x81)
+            i+=2;
+        else if (from[i] == 0x60)
+            ++i;
+        else
+        {
+            dest[o] = from[i];
+            ++o;
+            ++i;
+        }
+    }
+    dest[o] = 0;
+}