Improved hash on zprocess, and added parsing for "jo".
--- a/dictre.h Sat Sep 01 00:50:11 2007 +0200
+++ b/dictre.h Sat Sep 01 01:19:18 2007 +0200
@@ -78,6 +78,7 @@
int get_case(enum Case *vcase, const char *str);
void get_lowcase_str(char *inout, const char *str);
void reapply_case(char *out, const char *in, const enum Case *vcase);
+void remove_jo(char *str);
/* find.c */
void find_def(const char *word, char * def);
--- a/errors.txt Sat Sep 01 00:50:11 2007 +0200
+++ b/errors.txt Sat Sep 01 01:19:18 2007 +0200
@@ -1,2 +1,4 @@
-* La vorto 'телеканал' cxeestas en Zaliznjak, sed ne en 'akcentiga.index'.
* Mankas получить en Zaliznjak.
+
+Optimigo:
+- Ne cxiam sangxi al minusklaj literoj por sercxi. Nur kaze de netrovo.
--- a/zhash.c Sat Sep 01 00:50:11 2007 +0200
+++ b/zhash.c Sat Sep 01 01:19:18 2007 +0200
@@ -57,18 +57,19 @@
+/* Hash a word to 16 bits: one nibble per leading byte pair (up to four),
+ * taken from the low 4 bits of the pair's second byte, first pair highest.
+ * NOTE(review): presumably each pair is one two-byte UTF-8 letter - confirm. */
static unsigned int hash_func(const unsigned char *str)
{
- int res;
- char v;
+    unsigned int v = 0;
+    int i;
- v = 0;
+    /* for hashmax of 2^16 */
- /* Taking only the meaningful utf-8 codes */
- if (str[2] != 0)
- v = str[3];
+    /* Stop at the first NUL so bytes past the end of a short word are never read. */
+    for (i = 0; i < 4 && str[2*i] != 0 && str[2*i+1] != 0; ++i)
+        v += (str[2*i+1] & 15u) << 4*(3-i);
- res = (str[1] << 8) + v;
-
- return res;
+    return v;
}
/* Word without accent */
@@ -135,6 +136,7 @@
unsigned int hash_num;
remove_accent(word_no_accent, word);
+ remove_jo(word_no_accent);
hash_num = hash_func(word_no_accent);
--- a/zrus.c Sat Sep 01 00:50:11 2007 +0200
+++ b/zrus.c Sat Sep 01 01:19:18 2007 +0200
@@ -245,3 +245,27 @@
break;
} while(1);
}
+
+/* Replace each Cyrillic small letter io (U+0451) in str with ie (U+0435),
+ * in place. str is read as UTF-8 and at most MAXWORD bytes are visited. */
+void remove_jo(char *str)
+{
+    int i = 0; /* read offset, advanced by U8_NEXT */
+    int o = 0; /* write offset: start of the code point just read */
+    UChar32 c;
+    char iserror = 0;
+    do
+    {
+        U8_NEXT(str, i, MAXWORD, c);
+        if (c == 0x0451)
+        {
+            c = 0x0435; /* also two bytes in UTF-8, so it overwrites exactly */
+            U8_APPEND(str, o, MAXWORD, c, iserror);
+            if (iserror)
+                break;
+        }
+        o = i;
+        if (c == 0)
+            break;
+    } while(i < MAXWORD); /* U8_NEXT reads the lead byte unchecked: bound it here */
+}
Binary file zrustest has changed
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/zrustest.c Sat Sep 01 01:19:18 2007 +0200
@@ -0,0 +1,41 @@
+#include <stdio.h>
+#include <string.h>
+#include "dictre.h"
+
+/* Manual test driver: reads one line from stdin, prints the case vector from
+ * get_case(), the get_lowcase_str() result after remove_jo(), and the string
+ * rebuilt by reapply_case(). Returns 1 if no input could be read, else 0. */
+int main()
+{
+    char in[MAXWORD];
+    enum Case vcase[MAXWORD];
+    char tmp[MAXWORD];
+    char recased[MAXWORD];
+    int len;
+    int i;
+
+    printf("Insert string: ");
+    if (fgets(in, MAXWORD, stdin) == NULL)
+        return 1; /* EOF or read error: in[] was never filled */
+    /* Cut at the newline if present; safe on an empty string and keeps the
+     * last character when the input has no trailing newline. */
+    in[strcspn(in, "\n")] = 0;
+    len = get_case(vcase, in);
+    printf("Case: ");
+    for(i=0; i < len; ++i)
+    {
+        printf("%i ", (int) vcase[i]);
+    }
+    putchar('\n');
+
+    printf("Lower case: ");
+    get_lowcase_str(tmp, in);
+    remove_jo(tmp);
+    printf("%s\n", tmp);
+
+    printf("Reconstructed case: ");
+    reapply_case(recased, tmp, vcase);
+    printf("%s\n", recased);
+
+    return 0;
+}