# HG changeset patch
# User viric@llimona
# Date 1188339554 -7200
# Node ID a961bb8806b9958114dec840607606316101ebfe
# Parent  f71e89074c62312c9202fda2e14fc7acd3f66dcf
first 'zparsetext'.

diff -r f71e89074c62 -r a961bb8806b9 Makefile
--- a/Makefile	Tue Aug 28 08:40:49 2007 +0200
+++ b/Makefile	Wed Aug 29 00:19:14 2007 +0200
@@ -1,7 +1,8 @@
-CFLAGS=-O2 -g
+CFLAGS=-g
 CC=gcc
 
-all: dictre idx2index trim-nou8 ia5 asciiigi-utf8-akcenton zprocess
+all: dictre idx2index trim-nou8 ia5 asciiigi-utf8-akcenton zprocess \
+	zparsetext
 
 idx2index: idx2index.o dict.o
 trim-nou8: trim-nou8.c
@@ -12,7 +13,10 @@
 dictre: load.o dict.o write.o sort.o filter.o main.o fastmalloc.o repeated.o
 	$(CC) -o $@ $^
 
-zprocess: zload.o dict.o zdefs.o zhash.o zrus.o fastmalloc.o
+zprocess: zload.o dict.o zdefs.o zhash.o zrus.o
+	$(CC) -o $@ $^
+
+zparsetext: parse_text.o zrus.o find.o dict.o
 	$(CC) -o $@ $^
 
 dict.c: dictre.h
@@ -28,3 +32,5 @@
 zdefs.c: dictre.h
 zhash.c: dictre.h
 zrus.c: dictre.h
+find.c: dictre.h
+parse_text.c: dictre.h
diff -r f71e89074c62 -r a961bb8806b9 dict.c
--- a/dict.c	Tue Aug 28 08:40:49 2007 +0200
+++ b/dict.c	Wed Aug 29 00:19:14 2007 +0200
@@ -113,14 +113,13 @@
     }
 }
 
-int str2int(const char *str)
+/* Decodes the first 'length' characters of 'str' as a base-64 number
+ * (most significant digit first, digits as given by char2val()). */
+int str2int_len(const char *str, int length)
 {
     int i = 0;
-    int length;
     int val = 0;
 
-    length = strlen(str);
-
     while (i < length)
     {
         val = char2val(str[i]) + val * 64;
@@ -130,6 +129,13 @@
     return val;
 }
 
+/* Same as str2int_len() for a whole NUL-terminated string. */
+int str2int(const char *str)
+{
+    int length = strlen(str);
+    return str2int_len(str, length);
+}
+
 int get_int(FILE *index)
 {
     char buffer[500];
diff -r f71e89074c62 -r a961bb8806b9 dictre.h
--- a/dictre.h	Tue Aug 28 08:40:49 2007 +0200
+++ b/dictre.h	Wed Aug 29 00:19:14 2007 +0200
@@ -1,5 +1,6 @@
 enum {
-    MAXWORD=200
+    MAXWORD=200,
+    MAXDEF=10000
 };
 
 struct Words
@@ -64,3 +65,15 @@
 /* zrus.c */
 char * mix_accents(char *a, const char *b);
 void remove_accent(unsigned char *dest, const unsigned char *from);
+int until_newword(const unsigned char *str, int *index);
+int until_noword(const char *str, int *index);
+int skip_newline(const char *str, int *index);
+int is_ASCII(unsigned char c);
+
+/* dict.c */
+int str2int_len(const char *str, int length);
+
+/* find.c */
+void init_dictionary(void);
+void end_dictionary(void);
+void find_def(const char *word, char * def);
diff -r f71e89074c62 -r a961bb8806b9 errors.txt
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/errors.txt	Wed Aug 29 00:19:14 2007 +0200
@@ -0,0 +1,3 @@
+* La vorto 'телеканал' cxeestas en Zaliznjak, sed ne en 'akcentiga.index'.
+* La majusklojn ĝi ne pritraktas.
+* Mankas получить en Zaliznjak.
diff -r f71e89074c62 -r a961bb8806b9 find.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/find.c	Wed Aug 29 00:19:14 2007 +0200
@@ -0,0 +1,196 @@
+/* Word lookup in the 'akcentiga' dictionary: the index file is mapped into
+ * memory and binary-searched, and the matching definition is then read from
+ * the dictionary file. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include "dictre.h"
+
+/* Named index_map rather than 'index': <string.h> may declare the legacy
+ * BSD function index(), which a file-scope variable of that name clashes
+ * with. */
+static unsigned char *index_map;
+static int indexfd;
+static int indexsize;
+static FILE *defs;
+static const char indexname[] = "akcentiga.index";
+static const char dictname[] = "akcentiga.dict";
+
+/* Returns the size in bytes of the file 'fname'.  Exits on error. */
+int get_filesize(const char *fname)
+{
+    struct stat st;
+
+    if (stat(fname, &st) == -1)
+    {
+        fprintf(stderr, "Problem stating the file %s\n", fname);
+        perror("Error:");
+        exit(-1);
+    }
+
+    return st.st_size;
+}
+
+/* Maps the index file and opens the dictionary file.  Must be called before
+ * find_def().  Exits on error. */
+void init_dictionary(void)
+{
+    indexsize = get_filesize(indexname);
+    indexfd = open(indexname, O_RDONLY);
+    if (indexfd == -1)
+    {
+        fprintf(stderr, "Problem opening the file %s\n", indexname);
+        perror("Error:");
+        exit(-1);
+    }
+
+    index_map = mmap(0, indexsize, PROT_READ, MAP_SHARED, indexfd, 0);
+    if (index_map == MAP_FAILED)
+    {
+        fprintf(stderr, "Problem mapping the file %s\n", indexname);
+        perror("Error:");
+        exit(-1);
+    }
+
+    defs = fopen(dictname, "r");
+    if (defs == 0)
+    {
+        fprintf(stderr, "Problem opening the file %s\n", dictname);
+        perror("Error:");
+        exit(-1);
+    }
+}
+
+/* Releases everything acquired by init_dictionary(). */
+void end_dictionary(void)
+{
+    munmap(index_map, indexsize);
+    close(indexfd);
+    fclose(defs);
+}
+
+/* Reads 'length' bytes at 'offset' of the dictionary file into 'def' and
+ * NUL-terminates them.  'def' must have room for MAXDEF bytes; a longer
+ * definition is truncated.  On a seek or read problem 'def' holds whatever
+ * could be read, possibly the empty string. */
+static void fill_def(int offset, int length, char *def)
+{
+    size_t got;
+
+    def[0] = 0;
+    if (offset < 0 || length <= 0)
+        return;
+    if (length > MAXDEF - 1)
+        length = MAXDEF - 1;
+    if (fseek(defs, offset, SEEK_SET) != 0)
+        return;
+
+    got = fread(def, 1, length, defs);
+    def[got] = 0;
+}
+
+/* Compares 'word' with the headword of the index entry starting at 'test'
+ * (the headword ends at a tab).  Never reads at or beyond 'end'.
+ * Returns 0 if they are equal, a negative value if 'word' sorts before the
+ * entry and a positive value if it sorts after it. */
+static int compare(const unsigned char *word, const unsigned char *test,
+        const unsigned char *end)
+{
+    int i;
+    int t; /* current byte of the entry; 0 stands for "past the end" */
+
+    for(i = 0; word[i] != 0; ++i)
+    {
+        if (test + i >= end || test[i] == 0 || word[i] != test[i])
+            break;
+    }
+    t = (test + i < end) ? test[i] : 0;
+
+    if (word[i] == 0 && t == '\t')
+        return 0;
+    if (word[i] == 0) /* 'word' is a prefix of the headword */
+        return -1;
+    if (t == '\t') /* the headword is a prefix of 'word' */
+        return 1;
+    if (word[i] > t)
+        return 1;
+    return -1;
+}
+
+/* Binary search of 'word' among the lines of the index, which must be
+ * sorted in the byte order compare() uses.
+ * Returns a pointer just past the headword and its tab, or 0 if the word is
+ * not in the index. */
+static const unsigned char *bin_search(const char *word)
+{
+    const unsigned char *end = index_map + indexsize;
+    int lo = 0;          /* always the offset of a line start */
+    int hi = indexsize;  /* a line start, or the end of the index */
+
+    while (lo < hi)
+    {
+        int start = lo + (hi - lo) / 2;
+        int comparision;
+
+        /* Back up to the beginning of the line the midpoint falls in. */
+        while (start > lo && index_map[start - 1] != '\n')
+            --start;
+
+        comparision = compare((const unsigned char *) word,
+                index_map + start, end);
+        if (comparision == 0)
+            return index_map + start + strlen(word) + 1; /* word and \t */
+
+        if (comparision < 0)
+        {
+            hi = start;
+        }
+        else
+        {
+            /* Continue with the lines after this one. */
+            while (start < hi && index_map[start] != '\n')
+                ++start;
+            lo = (start < hi) ? start + 1 : hi;
+        }
+    }
+    return 0;
+}
+
+/* Parses the base-64 number at *pos, which ends at a tab, a newline or the
+ * end of the index ('end'), and advances *pos past the delimiter. */
+static int my_get_int(const unsigned char **pos, const unsigned char *end)
+{
+    const unsigned char *start = *pos;
+    int i = 0;
+
+    while (start + i < end && start[i] != '\t' && start[i] != '\n')
+        ++i;
+
+    *pos = (start + i < end) ? start + i + 1 : end;
+    return str2int_len((const char *) start, i);
+}
+
+/* Looks 'word' up and copies its NUL-terminated definition into 'def',
+ * which must have room for MAXDEF bytes.  'def' is set to the empty string
+ * when the word is not found.  init_dictionary() must have been called. */
+void find_def(const char *word, char * def)
+{
+    const unsigned char *end = index_map + indexsize;
+    const unsigned char *pos;
+    int offset, len;
+
+    pos = bin_search(word); /* points at the offset field already */
+    if (pos == 0)
+    {
+        def[0] = 0;
+        return;
+    }
+
+    offset = my_get_int(&pos, end); /* advances pos */
+    len = my_get_int(&pos, end);
+    fill_def(offset, len, def);
+}
diff -r f71e89074c62 -r a961bb8806b9 make-akcentiga.sh
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/make-akcentiga.sh	Wed Aug 29 00:19:14 2007 +0200
@@ -0,0 +1,3 @@
+#!/bin/sh
+
+dictfmt -s "Полная акцентуированная парадигма по А. А. Зализняку" -j --locale ca_ES.UTF-8 --without-headword akcentiga < akcentiga.txt
diff -r f71e89074c62 -r a961bb8806b9 parse_text.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/parse_text.c	Wed Aug 29 00:19:14 2007 +0200
@@ -0,0 +1,86 @@
+/* zparsetext: copies standard input to standard output, replacing every
+ * word found in the 'akcentiga' dictionary by its accented form. */
+#include <stdio.h>
+#include <string.h>
+#include "dictre.h"
+
+/* Prints the accented form of 'tmp', or 'tmp' itself when the dictionary
+ * has no usable definition for it. */
+static void give_accent_to_word(const char *tmp)
+{
+    char def[MAXDEF];
+    const char *first_space = 0;
+
+    find_def(tmp, def);
+    /* A definition has the form:
+     * ACCENTED_WORD NOMINATIVE1 NOMINATIVE2 ... \n
+     * so the accented word is everything before the first space. */
+    if (def[0] != 0)
+        first_space = strchr(def, ' ');
+
+    if (first_space == 0)
+    {
+        /* Not found, or no space in the definition: keep the word as is. */
+        printf("%s", tmp);
+        return;
+    }
+
+    fwrite(def, 1, first_space - def, stdout);
+}
+
+/* Looks up and prints the word collected in tmp[0..*wordpos), if any, and
+ * empties the buffer.  tmp must have room for the terminating NUL. */
+static void flush_word(unsigned char *tmp, int *wordpos)
+{
+    if (*wordpos == 0)
+        return;
+
+    tmp[*wordpos] = 0;
+    give_accent_to_word((const char *) tmp);
+    *wordpos = 0;
+}
+
+/* Copies 'in' to standard output, accenting the words.  Runs of non-ASCII
+ * bytes are taken to be Russian words; ASCII bytes are copied unchanged.
+ * 'pos' is the position the input starts at and 'length' the position to
+ * stop at; a negative 'length' means "until end of file". */
+static void process_text(FILE *in, int pos, int length)
+{
+    unsigned char tmp[MAXWORD];
+    int wordpos = 0;
+
+    while (length < 0 || pos < length)
+    {
+        int c = fgetc(in);
+
+        if (c == EOF)
+            break;
+
+        if (is_ASCII(c))
+        {
+            flush_word(tmp, &wordpos);
+            putchar(c);
+        }
+        else /* non-ASCII - we consider it russian */
+        {
+            /* A run too long for tmp is looked up in pieces rather than
+             * overflowing the buffer. */
+            if (wordpos == MAXWORD - 1)
+                flush_word(tmp, &wordpos);
+            tmp[wordpos++] = c;
+        }
+
+        pos += 1;
+    }
+
+    /* The input may end in the middle of a word. */
+    flush_word(tmp, &wordpos);
+}
+
+int main(void)
+{
+    init_dictionary();
+    process_text(stdin, 0, -1);
+    end_dictionary();
+    return 0;
+}
diff -r f71e89074c62 -r a961bb8806b9 zdefs.c
--- a/zdefs.c	Tue Aug 28 08:40:49 2007 +0200
+++ b/zdefs.c	Wed Aug 29 00:19:14 2007 +0200
@@ -6,48 +6,6 @@
     printf("%s\n", str);
 }
 
-static int skip_newline(const char *str, int *index)
-{
-    while(str[*index] != 0 && str[*index] != '\n')
-    {
-        ++*index;
-    }
-
-    if (str[*index] == '\n')
-        return *index;
-
-    return -1;
-}
-
-static int until_noword(const char *str, int *index)
-{
-    while(str[*index] != 0 &&
-            str[*index] != ' ' &&
-            str[*index] != '\n' &&
-            str[*index] != '\r' &&
-            str[*index] != ',')
-    {
-        ++*index;
-    }
-
-    if (str[*index] != 0)
-        return *index;
-
-    return -1;
-}
-
-static int until_newword(const unsigned char *str, int *index)
-{
-    while(str[*index] != 0 && str[*index] < 128)
-    {
-        ++*index;
-    }
-
-    if (str[*index] != 0);
-        return *index;
-
-    return -1;
-}
 
 void zprocess_def(const char *root, char *def)
 {
diff -r f71e89074c62 -r a961bb8806b9 zrus.c
--- a/zrus.c	Tue Aug 28 08:40:49 2007 +0200
+++ b/zrus.c	Wed Aug 29 00:19:14 2007 +0200
@@ -96,3 +96,62 @@
     }
     dest[o] = 0;
 }
+
+/* Advances *index to the next '\n' in str.
+ * Returns the index of that newline, or -1 if the string ends first. */
+int skip_newline(const char *str, int *index)
+{
+    while(str[*index] != 0 && str[*index] != '\n')
+    {
+        ++*index;
+    }
+
+    if (str[*index] == '\n')
+        return *index;
+
+    return -1;
+}
+
+/* Advances *index to the first word delimiter (space, newline, carriage
+ * return or comma) in str.
+ * Returns the index of the delimiter, or -1 if the string ends first. */
+int until_noword(const char *str, int *index)
+{
+    while(str[*index] != 0 &&
+            str[*index] != ' ' &&
+            str[*index] != '\n' &&
+            str[*index] != '\r' &&
+            str[*index] != ',')
+    {
+        ++*index;
+    }
+
+    if (str[*index] != 0)
+        return *index;
+
+    return -1;
+}
+
+/* Returns 1 if c is a 7-bit ASCII byte, 0 otherwise. */
+int is_ASCII(unsigned char c)
+{
+    if (c < 128)
+        return 1;
+    return 0;
+}
+
+/* Advances *index past any ASCII bytes of str, up to the first non-ASCII
+ * byte (taken to be the start of a word).
+ * Returns the index of that byte, or -1 if the string ends first. */
+int until_newword(const unsigned char *str, int *index)
+{
+    while(str[*index] != 0 && is_ASCII(str[*index]))
+    {
+        ++*index;
+    }
+
+    if (str[*index] != 0)
+        return *index;
+
+    return -1;
+}