Adding code for the zprocess, for processing the Zaliznjak dictionary.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/zdefs.c Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,80 @@
+#include <stdio.h>
+#include "dictre.h"
+
+/* Print one word on its own line of stdout. */
+static void new_word(const char *str) {
+    printf("%s\n", str);
+}
+
+/* Advance *index to the next '\n' in str, stopping at the terminator.
+ * Returns the index of that '\n', or -1 if the string ended first. */
+static int skip_newline(const char *str, int *index)
+{
+    int pos = *index;
+
+    while (str[pos] != '\0' && str[pos] != '\n')
+        pos++;
+    *index = pos;
+
+    return (str[pos] == '\n') ? pos : -1;
+}
+
+/* Advance *index past a word: stop at the first space, '\n', '\r' or ','
+ * or at the terminating NUL.
+ * Returns the index of the delimiter, or -1 when the NUL was reached. */
+static int until_noword(const char *str, int *index)
+{
+    for (;; ++*index) {
+        switch (str[*index]) {
+        case '\0':
+            return -1;
+        case ' ': case '\n': case '\r': case ',':
+            return *index;
+        default:
+            break;
+        }
+    }
+}
+
+/* Advance *index over ASCII bytes (< 128) until the first byte >= 128,
+ * i.e. the start of a non-ASCII sequence, or until the terminator.
+ * Returns the index of that byte, or -1 when the string ended first. */
+static int until_newword(const unsigned char *str, int *index)
+{
+    while (str[*index] != 0 && str[*index] < 128)
+        ++*index;
+
+    if (str[*index] != 0)   /* no ';' here: the -1 path must be reachable */
+        return *index;
+    return -1;
+}
+
+/* Scan a dictionary definition and register each word found after its
+ * first line under `root`, using insert_word().
+ * def is modified in place (word delimiters become NULs); this function
+ * takes ownership of both root and def and frees them on every path. */
+void zprocess_def(const char *root, char *def)
+{
+    int index = 0;
+
+    /* Skip the first line (the index word).  Without a '\n' there is no
+     * body, and stepping past the terminator would read out of bounds. */
+    if (skip_newline(def, &index) == -1)
+        goto done;
+    ++index;
+
+    /* Each word starts at a byte >= 128 and ends at a delimiter. */
+    while (until_newword((const unsigned char *) def, &index) != -1) {
+        int end = index;
+
+        if (until_noword(def, &end) == -1)
+            break;
+        def[end] = 0;
+        insert_word(&def[index], root);
+        index = end + 1;
+    }
+
+done:
+    free(def);
+    free((void *) root);
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/zhash.c Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,183 @@
+#include <stdio.h>
+#include <assert.h>
+#include "dictre.h"
+
+/* Number of buckets in the word hash table: 2^16 = 65536. */
+enum {
+    MAXHASH = 1 << 16
+};
+
+/* Node of a singly linked list of strings. */
+struct BareWord {
+    struct BareWord *next;  /* following node, 0 at the end of the list */
+    char *str;              /* string held by this node */
+};
+
+/* One hash-table entry, keyed by the word with accent marks removed. */
+struct WordEntry {
+    struct WordEntry *next;      /* next entry in the same bucket */
+    char *str;                   /* key; 0 in the per-bucket sentinel */
+    struct BareWord *accented;   /* accented spelling of str */
+    struct BareWord *unflexed;   /* root word(s) this form came from */
+};
+
+static struct WordEntry *wordlist[MAXHASH];   /* bucket heads, chained via ->next */
+
+/* Allocate one WordEntry, fields unset; assert()s that malloc succeeded. */
+struct WordEntry * new_WordEntry()
+{
+    struct WordEntry *entry = malloc(sizeof *entry);
+    assert(entry != 0);
+    return entry;
+}
+
+/* Allocate one BareWord, fields unset; assert()s that malloc succeeded. */
+struct BareWord * new_BareWord()
+{
+    struct BareWord *node = malloc(sizeof *node);
+    assert(node != 0);
+    return node;
+}
+
+/* Put a sentinel entry (str == 0, empty lists, no successor) in every
+ * bucket of wordlist; it marks the end of the bucket's chain. */
+void init_wordlist()
+{
+    int bucket;
+    for (bucket = 0; bucket < MAXHASH; ++bucket) {
+        struct WordEntry *sentinel = new_WordEntry();
+        assert(sentinel != 0);
+        sentinel->next = 0;
+        sentinel->str = 0;
+        sentinel->accented = 0;
+        sentinel->unflexed = 0;
+        wordlist[bucket] = sentinel;
+    }
+}
+
+/* Hash a NUL-terminated byte string into a bucket index in [0, 65535].
+ * Uses bytes 1 and 3: the trailing bytes of the first two characters when
+ * those are 2-byte UTF-8 sequences.  Short strings are never read past
+ * their terminator, and both bytes stay unsigned so they only ever add. */
+static unsigned int hash_func(const unsigned char *str)
+{
+    unsigned int high = 0, low = 0;
+
+    if (str[0] != 0) {
+        high = str[1];
+        if (str[1] != 0 && str[2] != 0)
+            low = str[3];
+    }
+    return (high << 8) + low;
+}
+
+/* Look up `word` (already stripped of accents) in bucket `hash`.
+ * Returns the matching entry, or 0 when the bucket does not hold it. */
+struct WordEntry * does_word_exist(int hash, const char *word)
+{
+    struct WordEntry *entry = wordlist[hash];
+    while (entry != 0) {
+        /* The last item of every chain has str == 0: never compare it. */
+        if (entry->str != 0 && strcmp(word, entry->str) == 0)
+            return entry;
+        entry = entry->next;
+    }
+    return 0;
+}
+
+/*
+ * Record `word` as one of the unflexed (root) forms of entry `pos`.
+ *
+ * pos->unflexed is a singly linked list of BareWord nodes.  `word` is
+ * copied with strdup() and appended as a new last node, unless an equal
+ * string is already in the list, in which case nothing changes.
+ *
+ * The duplicate search compares against the string held by each list
+ * node (not pos->str, the hash key), and a node is only allocated when
+ * it is actually filled in and linked, so nothing is leaked or lost.
+ */
+void add_to_unflexed(struct WordEntry *pos, const char *word)
+{
+    struct BareWord **link;
+    struct BareWord *node;
+
+    /* Walking the links themselves also covers the empty list. */
+    for (link = &pos->unflexed; *link != 0; link = &(*link)->next) {
+        if (strcmp(word, (*link)->str) == 0)
+            return;
+    }
+
+    /* Not found: *link is the terminating 0, put the new node there. */
+    node = new_BareWord();
+    node->str = strdup(word);
+    node->next = 0;
+    *link = node;
+}
+
+/* Attach the accented spelling `word` to `pos`: the first one is copied
+ * with strdup(), later ones are merged in by mix_accents(). */
+void set_accented(struct WordEntry *pos, const char *word)
+{
+    if (pos->accented == 0) {
+        pos->accented = new_BareWord();
+        pos->accented->str = strdup(word);
+        pos->accented->next = 0;
+        return;
+    }
+    pos->accented->str = mix_accents(pos->accented->str, word);
+}
+
+/* Register `word` (may carry accent marks) as a form of the root word
+ * `unflexed`.  Entries are keyed by the accent-free spelling: a new key is
+ * added at the head of its bucket, a known key only gets accents merged.
+ * Words too long for the MAXWORD scratch buffer are skipped with a warning. */
+void insert_word(const char *word, const char *unflexed)
+{
+    unsigned char word_no_accent[MAXWORD];
+    const char *key = (const char *) word_no_accent;
+    struct WordEntry *found;
+    unsigned int hash_num;
+
+    /* remove_accent() writes at most as many bytes as it reads. */
+    if (strlen(word) >= sizeof word_no_accent) {
+        fprintf(stderr, "Word too long, skipped: %s\n", word);
+        return;
+    }
+    remove_accent(word_no_accent, (const unsigned char *) word);
+    hash_num = hash_func(word_no_accent);
+    found = does_word_exist(hash_num, key);
+    if (found) {
+        set_accented(found, word);
+        /* TODO process word_no_accent */
+    } else {
+        struct WordEntry *entry = new_WordEntry();
+        entry->str = strdup(key);
+        entry->unflexed = 0;
+        add_to_unflexed(entry, unflexed);
+        entry->accented = 0;
+        set_accented(entry, word);
+        entry->next = wordlist[hash_num];
+        wordlist[hash_num] = entry;
+    }
+}
+
+/* Print one entry to stdout as "<key>:<accented spelling>". */
+static void dump_word(struct WordEntry *word) {
+    printf("%s:%s\n", word->str, word->accented->str);
+}
+
+/* Print every stored word to stdout through dump_word(), walking the
+ * buckets in index order.  Entries whose str is 0 (the end-of-chain
+ * sentinels) are skipped. */
+void dump_wordlist()
+{
+    int bucket;
+    struct WordEntry *entry;
+
+    for (bucket = 0; bucket < MAXHASH; ++bucket) {
+        for (entry = wordlist[bucket]; entry != 0; entry = entry->next) {
+            if (entry->str)
+                dump_word(entry);
+        }
+    }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/zload.c Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,119 @@
+#include <stdio.h>
+#include <sys/stat.h>
+#include "dictre.h"
+
+static FILE *index, *dict;        /* the opened .index and .dict streams */
+static int remove_tmp_file = 0;   /* set once /tmp/tmp.dict was created */
+
+/* Print a word and its definition to stdout, each in single quotes. */
+static void new_word(const char *w, const char *defstr) {
+    printf("'%s': '%s'\n", w, defstr);
+}
+
+/* Read each (word, offset, length) record from `index`, fetch its
+ * definition from `fdefs` and pass both to zprocess_def(), which frees
+ * them.  Records whose word starts with "00database" are not processed,
+ * so they are freed here instead of being leaked.  A progress line goes
+ * to stderr after every 1000 records. */
+void zload_words(FILE *index, FILE *fdefs)
+{
+    int def_avoided = 0;            /* only reported, never updated here */
+    static int dispnwords = 0;      /* records since the last report */
+    static int nwords = 0;          /* records seen so far */
+
+    for (;;) {
+        int offset, length;
+        char *defstr;
+        char *word = get_word(index);
+
+        if (word == 0)
+            break;
+        offset = get_int(index);
+        length = get_int(index);
+        defstr = get_def(fdefs, offset, length);
+
+        /* sizeof - 1 is the prefix length, without calling strlen(). */
+        if (strncmp(word, "00database", sizeof("00database") - 1) != 0) {
+            zprocess_def(word, defstr);
+        } else {
+            free(defstr);
+            free(word);
+        }
+
+        dispnwords++;
+        nwords++;
+        if (dispnwords >= 1000) {
+            dispnwords = 0;
+            fprintf(stderr,
+                "Loaded: %i Repeated definitions avoided: %i\n", nwords,
+                def_avoided);
+        }
+    }
+}
+
+/* Close both input streams; delete /tmp/tmp.dict when it was flagged. */
+static void close_files()
+{
+    fclose(index);
+    fclose(dict);
+    if (remove_tmp_file != 0)
+        unlink("/tmp/tmp.dict");
+}
+
+/* Report that `name` could not be opened, then terminate. */
+static void open_failed(const char *name)
+{
+    fprintf(stderr, "File: %s ", name);
+    perror("- cannot open file.");
+    exit(-1);
+}
+
+/* Open <base>.index and <base>.dict, <base> being argv[1].  If the .dict
+ * is missing, <base>.dict.dz is gunzipped to /tmp/tmp.dict, which is then
+ * opened and flagged for removal.  Failures print a message and exit; a
+ * base name that cannot take ".dict.dz" within the buffer is rejected. */
+static void open_files(int argn, char **argv)
+{
+    char tmpname[500];
+    char cmd[sizeof tmpname + 32];  /* name plus the gzip command text */
+    struct stat st;
+    int res;
+
+    if (argn < 2) {
+        fprintf(stderr, "usage: %s <dict_basename>\n", argv[0]);
+        exit(1);
+    }
+    if (strlen(argv[1]) + sizeof(".dict.dz") > sizeof tmpname) {
+        fprintf(stderr, "File name too long: %s\n", argv[1]);
+        exit(-1);
+    }
+    snprintf(tmpname, sizeof tmpname, "%s.index", argv[1]);
+    if ((index = fopen(tmpname, "r")) == NULL)
+        open_failed(tmpname);
+    snprintf(tmpname, sizeof tmpname, "%s.dict", argv[1]);
+    if ((dict = fopen(tmpname, "r")) != NULL)
+        return;
+    snprintf(tmpname, sizeof tmpname, "%s.dict.dz", argv[1]);
+    if (stat(tmpname, &st) == -1)
+        open_failed(tmpname);
+    /* NOTE(review): the name reaches the shell unquoted. */
+    snprintf(cmd, sizeof cmd, "gzip -cd %s > /tmp/tmp.dict", tmpname);
+    printf("Gunzipping...\n");
+    res = system(cmd);
+    dict = fopen("/tmp/tmp.dict", "r");
+    if (dict == NULL || res != 0) {
+        fprintf(stderr, "Error gunzipping file: %s ", tmpname);
+        perror("- something happened to /tmp/tmp.dict.");
+        exit(-1);
+    }
+    remove_tmp_file = 1;
+}
+
+int main(int argn, char **argv) {  /* load <dict_basename>, dump its words */
+    open_files(argn, argv);
+    init_wordlist();
+    zload_words(index, dict);
+    dump_wordlist();
+    close_files();
+    return 0;   /* explicit status: C89 leaves it undefined otherwise */
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/zrus.c Tue Aug 28 01:03:24 2007 +0200
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include "dictre.h"
+
+/* Return 1 if tmp starts with the bytes 0xCC 0x81 (UTF-8 for U+0301,
+ * the combining acute accent), 0 otherwise. */
+static int closed_accent(const unsigned char *tmp)
+{
+    return tmp[0] == 0xcc && tmp[1] == 0x81;
+}
+
+/* Return 1 if tmp starts with the byte 0x60 (ASCII '`', the grave accent
+ * character), 0 otherwise. */
+static int open_accent(const unsigned char *tmp)
+{
+    return tmp[0] == 0x60;
+}
+
+/*
+ * Merge the accent marks of two spellings of one word.
+ *
+ * a and b are expected to contain the same letters and to differ only in
+ * their accent marks: the byte pair 0xCC 0x81 or the single byte 0x60.
+ * The result holds each letter once and a mark wherever either input has
+ * one; a mark present at the same place in both is emitted once.
+ *
+ * a is freed.  b is only read.  The result is freshly allocated (0 if
+ * the allocation fails) and belongs to the caller.
+ */
+char * mix_accents(char *a, const char *b)
+{
+    const unsigned char *ua = (const unsigned char *) a;
+    const unsigned char *ub = (const unsigned char *) b;
+    size_t ia = 0, ib = 0, o = 0;
+    char *out;
+
+    /* Each step below emits no more bytes than it consumes, so the two
+     * input lengths bound the output; no fixed-size buffer to overrun. */
+    out = malloc(strlen(a) + strlen(b) + 1);
+    if (out == 0) {
+        free(a);
+        return 0;
+    }
+    while (ua[ia] != 0 || ub[ib] != 0) {
+        if (closed_accent(&ua[ia])) {
+            out[o++] = a[ia];
+            out[o++] = a[ia + 1];
+            ia += 2;
+            if (closed_accent(&ub[ib]))
+                ib += 2;
+        } else if (closed_accent(&ub[ib])) {
+            out[o++] = b[ib];
+            out[o++] = b[ib + 1];
+            ib += 2;
+        } else if (open_accent(&ua[ia])) {
+            /* Copy the mark from a (not from b at a's position). */
+            out[o++] = a[ia];
+            ia += 1;
+            if (open_accent(&ub[ib]))
+                ib += 1;
+        } else if (open_accent(&ub[ib])) {
+            out[o++] = b[ib];
+            ib += 1;
+        } else {
+            /* Letter byte: take it from a, or from b once a has ended,
+             * so that no NUL lands in the middle of the result. */
+            out[o++] = (ua[ia] != 0) ? a[ia] : b[ib];
+            if (ua[ia] != 0)
+                ++ia;
+            if (ub[ib] != 0)
+                ++ib;
+        }
+    }
+    out[o] = 0;
+    free(a);
+    return out;
+}
+
+/* Copy `from` to `dest` without accent marks: every 0xCC 0x81 pair (the
+ * UTF-8 combining acute accent) and every 0x60 byte ('`') is dropped,
+ * all other bytes are copied unchanged.  dest is NUL-terminated; the
+ * output is never longer than the input. */
+void remove_accent(unsigned char *dest, const unsigned char *from)
+{
+    const unsigned char *src = from;
+    unsigned char *dst = dest;
+
+    while (*src != 0) {
+        if (src[0] == 0xcc && src[1] == 0x81) {
+            src += 2;           /* two-byte acute accent */
+        } else if (src[0] == 0x60) {
+            src += 1;           /* one-byte grave accent */
+        } else {
+            *dst++ = *src++;    /* ordinary byte */
+        }
+    }
+
+    *dst = 0;
+}