Improved hash on zprocess, and added parsing for "jo".
author viric@llimona
Sat, 01 Sep 2007 01:19:18 +0200
changeset 16 b4e251400e36
parent 15 17a66ceb774a
child 17 d95d9e7a2b81
Improved hash on zprocess, and added parsing for "jo".
dictre.h
errors.txt
zhash.c
zrus.c
zrustest
zrustest.c
--- a/dictre.h	Sat Sep 01 00:50:11 2007 +0200
+++ b/dictre.h	Sat Sep 01 01:19:18 2007 +0200
@@ -78,6 +78,7 @@
 int get_case(enum Case *vcase, const char *str);
 void get_lowcase_str(char *inout, const char *str);
 void reapply_case(char *out, const char *in, const enum Case *vcase);
+void remove_jo(char *str); /* rewrites str in place, replacing code point U+0451 with U+0435 */
 
 /* find.c */
 void find_def(const char *word, char * def);
--- a/errors.txt	Sat Sep 01 00:50:11 2007 +0200
+++ b/errors.txt	Sat Sep 01 01:19:18 2007 +0200
@@ -1,2 +1,4 @@
-* La vorto 'телеканал' cxeestas en Zaliznjak, sed ne en 'akcentiga.index'.
 * Mankas получить en Zaliznjak.
+
+Optimigo:
+- Ne cxiam sangxi al minusklaj literoj por sercxi. Nur kaze de netrovo.
--- a/zhash.c	Sat Sep 01 00:50:11 2007 +0200
+++ b/zhash.c	Sat Sep 01 01:19:18 2007 +0200
@@ -57,18 +57,19 @@
 
 static unsigned int hash_func(const unsigned char *str)
 {
-    int res;
-    char v;
+    unsigned int v; /* hash value being assembled */
 
-    v = 0;
+    /* Packs four 4-bit fields into one 16-bit value, for a hashmax of 2^16 */
 
-    /* Taking only the meaningful utf-8 codes */
+    v = (str[1] & 15) << 4*3; /* low 4 bits of byte 1 -> bits 12..15; NOTE(review): str[0] is not checked first */
     if (str[2] != 0)
-        v = str[3];
+        v += (str[3] & 15) << 4*2; /* low 4 bits of byte 3 -> bits 8..11 */
+    if (str[4] != 0) /* NOTE(review): tested even if str[2] was 0, i.e. past the terminator - confirm callers zero-fill the buffer */
+        v += (str[5] & 15) << 4; /* low 4 bits of byte 5 -> bits 4..7 */
+    if (str[6] != 0) /* same caveat as the str[4] test */
+        v += (str[7] & 15); /* low 4 bits of byte 7 -> bits 0..3 */
 
-    res = (str[1] << 8) + v;
-
-    return res;
+    return v; /* presumably odd indices are the second bytes of two-byte UTF-8 sequences - TODO confirm */
 }
 
 /* Word without accent */
@@ -135,6 +136,7 @@
     unsigned int hash_num;
 
     remove_accent(word_no_accent, word);
+    remove_jo(word_no_accent);
 
     hash_num = hash_func(word_no_accent);
 
--- a/zrus.c	Sat Sep 01 00:50:11 2007 +0200
+++ b/zrus.c	Sat Sep 01 01:19:18 2007 +0200
@@ -245,3 +245,27 @@
             break;
     } while(1);
 }
+
+void remove_jo(char *str) /* Rewrites str in place: every U+0451 becomes U+0435; only this lowercase code point is handled */
+{
+    int i, o; /* i: read offset; o: offset where the code point being examined starts */
+    UChar32 c; /* code point decoded in the current iteration */
+    char iserror = 0; /* error flag handed to U8_APPEND */
+
+    i=0;
+    o=0;
+    do
+    {
+        U8_NEXT(str, i, MAXWORD, c); /* presumably ICU's macro: decodes one code point into c and advances i - confirm */
+        if (c == 0x0451) /* U+0451 CYRILLIC SMALL LETTER IO */
+        {
+            c = 0x0435; /* U+0435 CYRILLIC SMALL LETTER IE; both are two bytes in UTF-8, so nothing has to shift */
+            U8_APPEND(str, o, MAXWORD, c, iserror); /* overwrite starting at o, where this code point began */
+            if (iserror)
+                break;
+        }
+        o = i; /* the next code point starts where decoding stopped */
+        if (c == 0) /* decoded the terminating NUL: done */
+            break;
+    } while(1);
+}
Binary file zrustest has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/zrustest.c	Sat Sep 01 01:19:18 2007 +0200
@@ -0,0 +1,32 @@
+#include <stdio.h>
+#include "dictre.h"
+
+int main() /* Manual test driver: reads one line and prints its case vector, its lowercased form after remove_jo(), and the re-cased form */
+{
+    char in[MAXWORD]; /* line read from stdin */
+    enum Case vcase[MAXWORD]; /* filled in by get_case() */
+    char tmp[MAXWORD]; /* output of get_lowcase_str(), then modified by remove_jo() */
+    char recased[MAXWORD]; /* output of reapply_case() */
+    int len; /* return value of get_case(); used as the number of vcase entries to print */
+    int i;
+
+    printf("Insert string: ");
+    fgets(in, MAXWORD, stdin); /* NOTE(review): return value unchecked, so `in` is uninitialized on EOF or error */
+    in[strlen(in)-1] = 0; /* Remove last '\n'. NOTE(review): an empty string indexes in[-1]; <string.h> is not included directly here - confirm dictre.h provides it */
+    len = get_case(vcase, in);
+    printf("Case: ");
+    for(i=0; i < len; ++i) /* print each entry as an integer */
+    {
+        printf("%i ", (int) vcase[i]);
+    }
+    putchar('\n');
+
+    printf("Lower case: ");
+    get_lowcase_str(tmp, in);
+    remove_jo(tmp); /* applied to the lowercased copy before printing */
+    printf("%s\n", tmp);
+
+    printf("Reconstructed case: ");
+    reapply_case(recased, tmp, vcase); /* note: tmp has already been through remove_jo() */
+    printf("%s\n", recased);
+}