first 'zparsetext'.
author viric@llimona
Wed, 29 Aug 2007 00:19:14 +0200
changeset 14 a961bb8806b9
parent 13 f71e89074c62
child 15 17a66ceb774a
first 'zparsetext'.
Makefile
dict.c
dictre.h
errors.txt
find.c
make-akcentiga.sh
parse_text.c
zdefs.c
zrus.c
--- a/Makefile	Tue Aug 28 08:40:49 2007 +0200
+++ b/Makefile	Wed Aug 29 00:19:14 2007 +0200
@@ -1,7 +1,8 @@
-CFLAGS=-O2 -g
+CFLAGS=-g
 CC=gcc
 
-all: dictre idx2index trim-nou8 ia5 asciiigi-utf8-akcenton zprocess
+all: dictre idx2index trim-nou8 ia5 asciiigi-utf8-akcenton zprocess \
+	zparsetext
 
 idx2index: idx2index.o dict.o
 trim-nou8: trim-nou8.c
@@ -12,7 +13,10 @@
 dictre: load.o dict.o write.o sort.o filter.o main.o fastmalloc.o repeated.o
 	$(CC) -o $@ $^
 
-zprocess: zload.o dict.o zdefs.o zhash.o zrus.o fastmalloc.o
+zprocess: zload.o dict.o zdefs.o zhash.o zrus.o
+	$(CC) -o $@ $^
+
+zparsetext: parse_text.o zrus.o find.o dict.o  # main() is in parse_text.c
 	$(CC) -o $@ $^
 
 dict.c: dictre.h
@@ -28,3 +32,4 @@
 zdefs.c: dictre.h
 zhash.c: dictre.h
 zrus.c: dictre.h
+find.c: dictre.h
--- a/dict.c	Tue Aug 28 08:40:49 2007 +0200
+++ b/dict.c	Wed Aug 29 00:19:14 2007 +0200
@@ -113,14 +113,11 @@
     }
 }
 
-int str2int(const char *str)
+int str2int_len(const char *str, int length)
 {
     int i = 0;
-    int length;
     int val = 0;
 
-    length = strlen(str);
-
     while (i < length)
     {
         val = char2val(str[i]) + val * 64;
@@ -130,6 +127,12 @@
     return val;
 }
 
+int str2int(const char *str)
+{
+    /* Whole-string convenience wrapper around str2int_len(). */
+    return str2int_len(str, strlen(str));
+}
+
 int get_int(FILE *index)
 {
     char buffer[500];
--- a/dictre.h	Tue Aug 28 08:40:49 2007 +0200
+++ b/dictre.h	Wed Aug 29 00:19:14 2007 +0200
@@ -1,5 +1,6 @@
 enum {
-    MAXWORD=200
+    MAXWORD=200,   /* size of the buffer that holds one word (parse_text.c) */
+    MAXDEF=10000   /* size of the buffer that holds one definition */
 };
 
 struct Words
@@ -64,3 +65,15 @@
 /* zrus.c */
 char * mix_accents(char *a, const char *b);
 void remove_accent(unsigned char *dest, const unsigned char *from);
+/* Scanners: each one advances *index through str in place. */
+int until_newword(const unsigned char *str, int *index);
+int until_noword(const char *str, int *index);
+int skip_newline(const char *str, int *index);
+int is_ASCII(unsigned char c);
+
+/* find.c */
+/* find_def() reads the files that init_dictionary() opens, so call it only
+ * between init_dictionary() and end_dictionary(). */
+void init_dictionary(void);
+void end_dictionary(void);
+void find_def(const char *word, char * def);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/errors.txt	Wed Aug 29 00:19:14 2007 +0200
@@ -0,0 +1,3 @@
+* La vorto 'телеканал' ĉeestas en Zaliznjak, sed ne en 'akcentiga.index'.
+* La majusklojn ĝi ne pritraktas.
+* Mankas получить en Zaliznjak.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/find.c	Wed Aug 29 00:19:14 2007 +0200
@@ -0,0 +1,175 @@
+#include <stdio.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include "dictre.h"
+
+static unsigned char *index;   /* start of the mmap()ed index file */
+static int indexfd;            /* descriptor the mapping is made from */
+static int indexsize;          /* size of the index file, in bytes */
+static FILE *defs;             /* definitions file, opened for reading */
+const static char indexname[] = "akcentiga.index";
+const static char dictname[] = "akcentiga.dict";
+
+/* Size in bytes of the file 'fname', taken from stat(); if stat() fails,
+ * report it on stderr and terminate with exit(-1). */
+int get_filesize(const char *fname)
+{
+    struct stat info;
+
+    if (stat(fname, &info) == -1) {
+        fprintf(stderr, "Problem stating the file %s\n", fname);
+        perror("Error:");
+        exit(-1);
+    }
+    /* st_size is narrowed to int here, as the return type requires. */
+    return info.st_size;
+}
+
+/* Map the index file read-only and open the definitions file. Any failure
+ * is reported on stderr and ends the program with exit(-1). */
+void init_dictionary()
+{
+    indexsize = get_filesize(indexname);
+    if ((indexfd = open(indexname, O_RDONLY)) == -1) {
+        fprintf(stderr, "Problem opening the file %s\n", indexname);
+        perror("Error:");
+        exit(-1);
+    }
+    index = mmap(0, indexsize, PROT_READ, MAP_SHARED, indexfd, 0);
+    if (index == MAP_FAILED) { /* mmap() reports errors this way, not as 0 */
+        perror("Problem mapping the index");
+        exit(-1);
+    }
+    if ((defs = fopen(dictname, "r")) == 0) {
+        fprintf(stderr, "Problem opening the file %s\n", dictname);
+        perror("Error:");
+        exit(-1);
+    }
+}
+
+void end_dictionary()
+{
+    fclose(defs);             /* definitions stream */
+    munmap(index, indexsize); /* index mapping */
+    close(indexfd);           /* descriptor the mapping was made from */
+}
+
+static void fill_def(int offset, int length, char * def)
+{
+    fseek(defs, offset, SEEK_SET); /* position in the definitions file */
+    fread(def, 1, length, defs);   /* raw copy: no NUL is appended to def */
+}
+
+/* 1 when ptr has reached or passed the end of the mapped index
+ * (index + indexsize), 0 while it still points inside the mapping. */
+static int pointer_at_end(unsigned char *ptr)
+{
+    return ptr >= index + indexsize;
+}
+
+/* Scan from 'from' to the first '\n' or NUL byte and return its address,
+ * or 0 if the end of the index is reached before one is found. */
+static char * skip_until_newline(char *from)
+{
+    for (;; ++from)
+    {
+        if (pointer_at_end(from)) /* checked before every dereference */
+            return 0;
+        if (*from == '\n' || *from == 0)
+            return from;
+    }
+}
+
+/* Order the NUL-terminated 'word' against the index entry at 'test', whose
+ * key runs up to a '\t'. Bytes are compared as unsigned values.
+ * Returns:
+ *    0  word equals the key,
+ *   -1  word sorts before the key (or is a proper prefix of it),
+ *    1  word sorts after the key (or the key is a proper prefix of word). */
+static int compare(const unsigned char *word, const unsigned char *test)
+{
+    int i = 0;
+
+    /* Skip the prefix the two have in common. */
+    while (word[i] != 0 && test[i] != 0 && word[i] == test[i])
+        ++i;
+
+    if (word[i] == 0)
+    {
+        /* word is used up: it matches only if the key ends here as well */
+        return (test[i] == '\t') ? 0 : -1;
+    }
+    if (test[i] == '\t')
+        return 1; /* the key ended first */
+
+    /* Ordinary byte comparison. Equal bytes should never get this far;
+     * they would give -1. */
+    return (word[i] > test[i]) ? 1 : -1;
+}
+
+/* Binary-search the mapped index (lines ending in '\n', assumed sorted by
+ * key in compare() order) for 'word'. Returns a pointer to the field after
+ * the key and its '\t', or 0 when no line has that key. [lo, hi) always
+ * starts and ends on line boundaries, so the first line is reachable and
+ * compare() never starts past the end of the mapping. */
+static char * bin_search(const char *word)
+{
+    int lo = 0;
+    int hi = indexsize;
+
+    while (lo < hi) {
+        int start = lo + (hi - lo) / 2;
+        int next;
+        int comparision;
+        /* Back up to the beginning of the line holding the midpoint. */
+        while (start > lo && index[start - 1] != '\n')
+            --start;
+        /* Locate the beginning of the line after it. */
+        for (next = start; next < hi && index[next] != '\n'; ++next)
+            ;
+        if (next < hi)
+            ++next; /* step over the '\n' itself */
+
+        comparision = compare((const unsigned char *) word, index + start);
+        if (comparision == 0)
+            return (char *) index + start + strlen(word) + 1; /* word, '\t' */
+        if (comparision < 0)
+            hi = start;
+        else
+            lo = next;
+    }
+    return 0;
+}
+
+/* Decode the field at *pos, which runs up to the next '\t' or '\n', with
+ * str2int_len() and advance *pos to the byte after that delimiter. */
+static int my_get_int(char **pos)
+{
+    const char *field = *pos;
+    int width = 0;
+
+    while (field[width] != '\t' && field[width] != '\n')
+        ++width;
+
+    *pos += width + 1; /* step over the field and its delimiter */
+    return str2int_len(field, width);
+}
+
+/* Copy the definition of 'word' into 'def' (which must provide MAXDEF bytes)
+ * as a NUL-terminated string; def is left empty when the word is not indexed. */
+void find_def(const char *word, char * def)
+{
+    int offset, len;
+    char *pos = bin_search(word); /* at the offset field, or 0 if not found */
+
+    def[0] = 0;
+    if (pos == 0)
+        return;
+    offset = my_get_int(&pos); /* each call advances pos past one field */
+    len = my_get_int(&pos);
+    len = (len < 0) ? 0 : (len < MAXDEF) ? len : MAXDEF - 1;
+    def[len] = 0; /* fill_def() reads raw bytes and appends no terminator */
+    fill_def(offset, len, def);
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make-akcentiga.sh	Wed Aug 29 00:19:14 2007 +0200
@@ -0,0 +1,3 @@
+#!/bin/sh
+# Feeds akcentiga.txt to dictfmt; presumably this creates the akcentiga.index/.dict files find.c opens (TODO confirm).
+dictfmt -s "Полная акцентуированная парадигма по А. А.  Зализняку" -j --locale ca_ES.UTF-8 --without-headword akcentiga < akcentiga.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parse_text.c	Wed Aug 29 00:19:14 2007 +0200
@@ -0,0 +1,64 @@
+#include <stdio.h>
+#include "dictre.h"
+
+/* Print the accented form of 'tmp' if the dictionary has one, otherwise
+ * print 'tmp' itself. A definition has the form
+ *    ACCENTED_WORD NOMINATIVE1 NOMINATIVE2 ... \n
+ * so the accented form is the text before its first space. */
+static void give_accent_to_word(const char *tmp)
+{
+    char def[MAXDEF] = {0}; /* zeroed: find_def() may not terminate it */
+    int end = 0;
+
+    find_def(tmp, def);
+    while (end < MAXDEF && def[end] != 0 && def[end] != ' ')
+        ++end;
+
+    /* Not found, or a definition without any space: print the word as it
+     * came in instead of dropping it from the output. */
+    if (end == MAXDEF || def[end] != ' ') {
+        printf("%s", tmp);
+        return;
+    }
+
+    fwrite(def, 1, end, stdout);
+}
+
+/* Copy 'in' to stdout, replacing every run of non-ASCII bytes (taken to be
+ * a Russian word) by its accented form. Stops at EOF, or once 'pos' reaches
+ * 'length' when length >= 0. */
+static void process_text(FILE *in, int pos, int length)
+{
+    unsigned char tmp[MAXWORD];
+    int wordpos = 0;
+
+    for (; length < 0 || pos < length; ++pos) {
+        int c = fgetc(in);
+        if (c == EOF)
+            break;
+        /* A word ends at an ASCII byte; it is also cut when tmp is full, so
+         * that an over-long run cannot overflow the buffer. */
+        if (wordpos != 0 && (is_ASCII(c) || wordpos == MAXWORD - 1)) {
+            tmp[wordpos] = 0;
+            give_accent_to_word((const char *) tmp);
+            wordpos = 0;
+        }
+        if (is_ASCII(c))
+            putchar(c);
+        else /* non-ASCII - we consider it russian */
+            tmp[wordpos++] = c;
+    }
+
+    /* The input may end in the middle of a word: emit it too. */
+    if (wordpos != 0) {
+        tmp[wordpos] = 0;
+        give_accent_to_word((const char *) tmp);
+    }
+}
+
+int main() { /* stdin -> stdout filter; -1 below means "no length limit" */
+    init_dictionary();
+    process_text(stdin, 0, -1);
+    end_dictionary();
+    return 0;
+}
--- a/zdefs.c	Tue Aug 28 08:40:49 2007 +0200
+++ b/zdefs.c	Wed Aug 29 00:19:14 2007 +0200
@@ -6,48 +6,6 @@
     printf("%s\n", str);
 }
 
-static int skip_newline(const char *str, int *index)
-{
-    while(str[*index] != 0 && str[*index] != '\n')
-    {
-        ++*index;
-    }
-
-    if (str[*index] == '\n')
-        return *index;
-
-    return -1;
-}
-
-static int until_noword(const char *str, int *index)
-{
-    while(str[*index] != 0 &&
-            str[*index] != ' ' &&
-            str[*index] != '\n' &&
-            str[*index] != '\r' &&
-            str[*index] != ',')
-    {
-        ++*index;
-    }
-
-    if (str[*index] != 0)
-        return *index;
-
-    return -1;
-}
-
-static int until_newword(const unsigned char *str, int *index)
-{
-    while(str[*index] != 0 && str[*index] < 128)
-    {
-        ++*index;
-    }
-
-    if (str[*index] != 0);
-        return *index;
-
-    return -1;
-}
 
 void zprocess_def(const char *root, char *def)
 {
--- a/zrus.c	Tue Aug 28 08:40:49 2007 +0200
+++ b/zrus.c	Wed Aug 29 00:19:14 2007 +0200
@@ -96,3 +96,53 @@
     }
     dest[o] = 0;
 }
+
+/* Advance *index to the next '\n' in str (or to the terminating NUL).
+ * Returns the position of that '\n', or -1 if the string ended first. */
+int skip_newline(const char *str, int *index)
+{
+    int i = *index;
+
+    while (str[i] != 0 && str[i] != '\n')
+        ++i;
+
+    *index = i;
+    return (str[i] == '\n') ? i : -1;
+}
+
+/* Advance *index to the first word delimiter in str: a space, '\n', '\r'
+ * or ','. Returns that position, or -1 if the terminating NUL is reached
+ * first (*index is then left on the NUL). */
+int until_noword(const char *str, int *index)
+{
+    int i;
+
+    for (i = *index; str[i] != 0; ++i)
+    {
+        char c = str[i];
+        if (c == ' ' || c == '\n' || c == '\r' || c == ',')
+            break;
+    }
+    *index = i;
+    return (str[i] != 0) ? i : -1;
+}
+
+/* 1 for a 7-bit ASCII byte (c < 128),
+ * 0 for any byte that has its top bit set. */
+int is_ASCII(unsigned char c)
+{
+    return (c & 0x80) == 0;
+}
+
+/* Advance *index past the ASCII bytes of str, i.e. up to the first byte of
+ * the next non-ASCII word. Returns the new position, or -1 when the end of
+ * the string is reached without finding one. */
+int until_newword(const unsigned char *str, int *index)
+{
+    while (str[*index] != 0 && is_ASCII(str[*index]))
+        ++*index;
+
+    if (str[*index] != 0) /* no ';' here: it made the return unconditional */
+        return *index;
+    return -1;
+}