Pritraktado de majuskloj per ICU.
author viric@llimona
Sat, 01 Sep 2007 00:50:11 +0200
changeset 15 17a66ceb774a
parent 14 a961bb8806b9
child 16 b4e251400e36
Pritraktado de majuskloj per ICU.
Makefile
dictre.h
errors.txt
parse_text.c
zrus.c
zrustest
--- a/Makefile	Wed Aug 29 00:19:14 2007 +0200
+++ b/Makefile	Sat Sep 01 00:50:11 2007 +0200
@@ -2,7 +2,7 @@
 CC=gcc
 
 all: dictre idx2index trim-nou8 ia5 asciiigi-utf8-akcenton zprocess \
-	zparsetext
+	zparsetext zrustest
 
 idx2index: idx2index.o dict.o
 trim-nou8: trim-nou8.c
@@ -14,10 +14,13 @@
 	$(CC) -o $@ $^
 
 zprocess: zload.o dict.o zdefs.o zhash.o zrus.o
-	$(CC) -o $@ $^
+	$(CC) -o $@ $^ -licui18n -licuuc -licudata
 
 zparsetext: parse_text.o zrus.o find.o dict.o
-	$(CC) -o $@ $^
+	$(CC) -o $@ $^ -licui18n -licuuc -licudata
+
+zrustest: zrustest.o zrus.o
+	$(CC) -o $@ $^ -licui18n -licuuc -licudata
 
 dict.c: dictre.h
 write.c: dictre.h
@@ -33,3 +36,4 @@
 zhash.c: dictre.h
 zrus.c: dictre.h
 find.c: dictre.h
+zrustest.c: dictre.h
--- a/dictre.h	Wed Aug 29 00:19:14 2007 +0200
+++ b/dictre.h	Sat Sep 01 00:50:11 2007 +0200
@@ -3,6 +3,12 @@
     MAXDEF=10000
 };
 
+enum Case /* case flag for one code point, as stored by get_case() */
+{
+    LCASE = 0, /* u_islower() returned true for the code point */
+    UCASE = 1 /* every other code point, cased or not */
+};
+
 struct Words
 {
     struct Word *first;
@@ -69,6 +75,9 @@
 int until_noword(const char *str, int *index);
 int skip_newline(const char *str, int *index);
 int is_ASCII(unsigned char c);
+int get_case(enum Case *vcase, const char *str);
+void get_lowcase_str(char *out, const char *str); /* out: MAXWORD-byte buffer, only written */
+void reapply_case(char *out, const char *in, const enum Case *vcase);
 
 /* find.c */
 void find_def(const char *word, char * def);
--- a/errors.txt	Wed Aug 29 00:19:14 2007 +0200
+++ b/errors.txt	Sat Sep 01 00:50:11 2007 +0200
@@ -1,3 +1,2 @@
 * La vorto 'телеканал' cxeestas en Zaliznjak, sed ne en 'akcentiga.index'.
-* La majusklojn ĝi ne pritraktas.
 * Mankas получить en Zaliznjak.
--- a/parse_text.c	Wed Aug 29 00:19:14 2007 +0200
+++ b/parse_text.c	Sat Sep 01 00:50:11 2007 +0200
@@ -1,27 +1,41 @@
 #include <stdio.h>
 #include "dictre.h"
 
-static void give_accent_to_word(const char *tmp)
+/* Print the accented form of `word` that find_def() returns, re-cased like
+ * `word`; print `word` itself when no usable definition is found. */
+static void give_accent_to_word(const char *word)
 {
     char def[MAXDEF];
+    char low[MAXWORD];
+    char recased[MAXWORD];
+    enum Case vcase[MAXWORD];
 
-    find_def(tmp, def);
+    /* Get case */
+    get_case(vcase, word);
+
+    /* Get lowercase version */
+    get_lowcase_str(low, word);
+
+    /* Find the lowercase version */
+    find_def(low, def);
     if (def[0] != 0) /* found */
     {
         /* Print the word UNTIL a space.
          * the definition will have the form:
          *    ACCENTED_WORD NOMINATIVE1 NOMINATIVE2 ... \n */
         char *first_space;
-        char *pos;
         first_space = strchr(def, ' ');
         if (first_space != 0) /* Space found */
-            for(pos = def; pos < first_space; ++pos)
-                putchar(*pos);
-        return;
+        {
+            *first_space = 0; /* Mark an end of string */
+            reapply_case(recased, def, vcase);
+            printf("%s", recased);
+            return;
+        }
     }
 
     /* if first_space == 0 or word not found */
-    printf("%s", tmp);
+    printf("%s", word);
 }
 
 static void process_text(FILE *in, int pos, int length)
--- a/zrus.c	Wed Aug 29 00:19:14 2007 +0200
+++ b/zrus.c	Sat Sep 01 00:50:11 2007 +0200
@@ -1,4 +1,7 @@
 #include <stdio.h>
+#include <unicode/uchar.h>
+#include <unicode/ustring.h>
+#include <unicode/utypes.h>
 #include "dictre.h"
 
 static int closed_accent(const unsigned char *tmp)
@@ -146,3 +149,99 @@
 
     return -1;
 }
+
+/* Store in `vcase` one entry per code point of the UTF-8 string `str`:
+ * LCASE when u_islower() is true, UCASE otherwise (non-letters and malformed
+ * bytes included). At most MAXWORD entries are written, matching the
+ * vcase[MAXWORD] array the caller in parse_text.c provides.
+ * Returns the number of entries stored. */
+int get_case(enum Case *vcase, const char *str)
+{
+    UChar32 c;
+    int i = 0;
+    int o = 0;
+    const int len = (int) strlen(str);
+
+    /* U8_NEXT requires i < len, so stop on the length rather than by
+     * decoding the terminating NUL; also never overrun the caller's array. */
+    while (i < len && o < MAXWORD)
+    {
+        U8_NEXT(str, i, len, c);
+        if (u_islower(c))
+            vcase[o] = LCASE;
+        else
+            vcase[o] = UCASE;
+        ++o;
+    }
+
+    return o;
+}
+
+/* Store in `out` (MAXWORD bytes) the UTF-8 string `str` with every code point
+ * passed through u_tolower(). `out` is always NUL-terminated; conversion stops
+ * early on malformed input or when the next code point would not fit. */
+void get_lowcase_str(char *out, const char *str)
+{
+    UChar32 c;
+    int i = 0;
+    int o = 0;
+    const int len = (int) strlen(str);
+    UBool iserror = 0; /* U8_APPEND expects a UBool flag */
+
+    /* Keep the last byte for the NUL; U8_APPEND does not bound ASCII writes. */
+    while (i < len && o < MAXWORD - 1)
+    {
+        U8_NEXT(str, i, len, c);
+        if (c < 0) /* malformed UTF-8 */
+            break;
+        c = u_tolower(c);
+        U8_APPEND(out, o, MAXWORD - 1, c, iserror);
+        if (iserror)
+            break;
+    }
+    out[o] = 0;
+}
+
+/* Copy the UTF-8 string `in` to `out` (capacity MAXWORD bytes), giving each
+ * code point the case recorded in `vcase` (as filled by get_case()).
+ * A '`' or a code point with the UCHAR_DIACRITIC property is copied verbatim
+ * and consumes no `vcase` entry: such accent marks are the only characters
+ * expected to have been _added_ with respect to the word `vcase` came from.
+ * `out` is always NUL-terminated; copying stops early on malformed input or
+ * when the next code point would not fit.
+ * NOTE(review): `vcase` carries no length, so `in` must not hold more
+ * non-accent code points than get_case() recorded -- confirm at call sites. */
+void reapply_case(char *out, const char *in, const enum Case *vcase)
+{
+    UChar32 c;
+    int i = 0;
+    int o = 0;
+    int vcasepos = 0;
+    const int len = (int) strlen(in);
+    UBool iserror = 0; /* U8_APPEND expects a UBool flag */
+    const UChar32 inverted = '`';
+
+    /* Stop on the length (U8_NEXT requires i < len) and keep the last byte of
+     * `out` for the terminator; U8_APPEND does not bounds-check ASCII. */
+    while (i < len && o < MAXWORD - 1)
+    {
+        U8_NEXT(in, i, len, c);
+        if (c < 0) /* malformed UTF-8 */
+            break;
+
+        if (c != inverted && !u_hasBinaryProperty(c, UCHAR_DIACRITIC))
+        {
+            /* Ordinary character: recase it and move to the next entry. */
+            if (vcase[vcasepos] == LCASE)
+                c = u_tolower(c);
+            else
+                c = u_toupper(c);
+            vcasepos += 1;
+        }
+
+        U8_APPEND(out, o, MAXWORD - 1, c, iserror);
+        if (iserror)
+            break;
+    }
+    out[o] = 0;
+}
Binary file zrustest has changed