Pritraktado de majuskloj per ICU.
--- a/Makefile Wed Aug 29 00:19:14 2007 +0200
+++ b/Makefile Sat Sep 01 00:50:11 2007 +0200
@@ -2,7 +2,7 @@
CC=gcc
all: dictre idx2index trim-nou8 ia5 asciiigi-utf8-akcenton zprocess \
- zparsetext
+ zparsetext zrustest
idx2index: idx2index.o dict.o
trim-nou8: trim-nou8.c
@@ -14,10 +14,13 @@
$(CC) -o $@ $^
zprocess: zload.o dict.o zdefs.o zhash.o zrus.o
- $(CC) -o $@ $^
+ $(CC) -o $@ $^ -licui18n -licuuc -licudata
zparsetext: parse_text.o zrus.o find.o dict.o
- $(CC) -o $@ $^
+ $(CC) -o $@ $^ -licui18n -licuuc -licudata
+
+zrustest: zrustest.o zrus.o
+ $(CC) -o $@ $^ -licui18n -licuuc -licudata
dict.c: dictre.h
write.c: dictre.h
@@ -33,3 +36,4 @@
zhash.c: dictre.h
zrus.c: dictre.h
find.c: dictre.h
+zrustest.c: dictre.h
--- a/dictre.h Wed Aug 29 00:19:14 2007 +0200
+++ b/dictre.h Sat Sep 01 00:50:11 2007 +0200
@@ -3,6 +3,12 @@
MAXDEF=10000
};
+enum Case /* Case flag for one code point, as stored by get_case() */
+{
+ LCASE = 0, /* code point for which u_islower() is true */
+ UCASE = 1 /* every other code point (not only uppercase letters) */
+};
+
struct Words
{
struct Word *first;
@@ -69,6 +75,9 @@
int until_noword(const char *str, int *index);
int skip_newline(const char *str, int *index);
int is_ASCII(unsigned char c);
+/* Store the case of each code point of the UTF-8 string str in vcase[];
+ * returns the number of entries written. */
+int get_case(enum Case *vcase, const char *str);
+/* Write a lowercase copy of the UTF-8 string str into out
+ * (out is only written, never read). */
+void get_lowcase_str(char *out, const char *str);
+/* Copy in to out, giving every code point that is not '`' or a diacritic
+ * the case recorded at the matching position of vcase[]. */
+void reapply_case(char *out, const char *in, const enum Case *vcase);
/* find.c */
void find_def(const char *word, char * def);
--- a/errors.txt Wed Aug 29 00:19:14 2007 +0200
+++ b/errors.txt Sat Sep 01 00:50:11 2007 +0200
@@ -1,3 +1,2 @@
* La vorto 'телеканал' cxeestas en Zaliznjak, sed ne en 'akcentiga.index'.
-* La majusklojn ĝi ne pritraktas.
* Mankas получить en Zaliznjak.
--- a/parse_text.c Wed Aug 29 00:19:14 2007 +0200
+++ b/parse_text.c Sat Sep 01 00:50:11 2007 +0200
@@ -1,27 +1,41 @@
#include <stdio.h>
#include "dictre.h"
-static void give_accent_to_word(const char *tmp)
+/*
+ * Print `word` to stdout with its accent mark added.
+ *
+ * The word is looked up in lowercase; the accented form found in the
+ * definition is then printed with the original per-character case put
+ * back. If no usable definition is found, `word` is printed unchanged.
+ */
+static void give_accent_to_word(const char *word)
{
char def[MAXDEF];
+    /* Zero-initialised so that a helper bailing out early can never leave
+     * indeterminate data behind for printf() or reapply_case() to read. */
+    char low[MAXWORD] = "";
+    char recased[MAXWORD] = "";
+    enum Case vcase[MAXWORD] = { LCASE };
- find_def(tmp, def);
+
+    /* Remember the case of every character of the original spelling */
+    get_case(vcase, word);
+
+    /* Look the word up by its lowercase form */
+    get_lowcase_str(low, word);
+    find_def(low, def);
if (def[0] != 0) /* found */
{
/* Print the word UNTIL a space.
* the definition will have the form:
* ACCENTED_WORD NOMINATIVE1 NOMINATIVE2 ... \n */
char *first_space;
- char *pos;
first_space = strchr(def, ' ');
if (first_space != 0) /* Space found */
- for(pos = def; pos < first_space; ++pos)
- putchar(*pos);
+    {
+        /* Cut the definition at the space. Writing through the pointer
+         * avoids squeezing an offset of up to MAXDEF into a char. */
+        *first_space = 0;
+        reapply_case(recased, def, vcase);
+        printf("%s", recased);
+    }
+    else
+        /* Definition without a space: fall back to the original word
+         * instead of silently printing nothing. */
+        printf("%s", word);
return;
}
/* if first_space == 0 or word not found */
- printf("%s", tmp);
+ printf("%s", word);
}
static void process_text(FILE *in, int pos, int length)
--- a/zrus.c Wed Aug 29 00:19:14 2007 +0200
+++ b/zrus.c Sat Sep 01 00:50:11 2007 +0200
@@ -1,4 +1,7 @@
#include <stdio.h>
+#include <unicode/uchar.h>
+#include <unicode/ustring.h>
+#include <unicode/utypes.h>
#include "dictre.h"
static int closed_accent(const unsigned char *tmp)
@@ -146,3 +149,99 @@
return -1;
}
+
+/*
+ * Record the case of every code point of the UTF-8 string `str`.
+ *
+ * vcase[k] is set to LCASE when u_islower() is true for the k-th code
+ * point and to UCASE otherwise (so digits, punctuation and ill-formed
+ * bytes, which U8_NEXT reports as a negative value, all count as UCASE).
+ *
+ * At most MAXWORD entries are written, which is the array size the
+ * caller in parse_text.c provides.
+ *
+ * Returns the number of entries stored in vcase[].
+ */
+int get_case(enum Case *vcase, const char *str)
+{
+    UChar32 c;
+    int i = 0;
+    int o = 0;
+    int len = strlen(str);
+
+    /* Stopping at len means the terminating NUL is never decoded */
+    while (i < len && o < MAXWORD)
+    {
+        U8_NEXT(str, i, len, c);
+        vcase[o++] = u_islower(c) ? LCASE : UCASE;
+    }
+
+    return o;
+}
+
+/*
+ * Write a lowercase copy of the UTF-8 string `str` into `out`.
+ *
+ * `out` is treated as a buffer of MAXWORD bytes. The result is always
+ * NUL-terminated: if `str` contains ill-formed UTF-8 or does not fit,
+ * the copy stops at the last complete code point that did.
+ */
+void get_lowcase_str(char *out, const char *str)
+{
+    UChar32 c;
+    int i = 0;
+    int o = 0;
+    int len = strlen(str);
+    UBool iserror = 0;
+
+    while (i < len)
+    {
+        U8_NEXT(str, i, len, c);
+        if (c < 0) /* ill-formed UTF-8 */
+            break;
+        c = u_tolower(c);
+
+        /* Explicit check: U8_APPEND's single-byte path does not look at
+         * the capacity. One byte stays reserved for the terminator. */
+        if (o >= MAXWORD - 1)
+            break;
+        U8_APPEND(out, o, MAXWORD - 1, c, iserror);
+        if (iserror) /* multi-byte sequence did not fit; o is unchanged */
+            break;
+    }
+
+    out[o] = 0;
+}
+
+/*
+ * Copy the UTF-8 string `in` to `out`, restoring the case pattern `vcase`.
+ *
+ * Every code point of `in` that is neither '`' nor has the Unicode
+ * Diacritic property consumes the next entry of vcase[] and is lowercased
+ * (LCASE) or uppercased (any other value). A '`' or diacritic is copied
+ * unchanged and consumes no entry, so it is the only kind of character
+ * `in` may contain in addition to the word vcase[] was computed from.
+ *
+ * `out` is treated as a buffer of MAXWORD bytes and is always
+ * NUL-terminated; on ill-formed UTF-8 or lack of space the copy stops at
+ * the last complete code point that fit.
+ */
+void reapply_case(char *out, const char *in, const enum Case *vcase)
+{
+    const UChar32 inverted = '`';
+    UChar32 c;
+    int i = 0;
+    int o = 0;
+    int vcasepos = 0;
+    int len = strlen(in);
+    UBool iserror = 0;
+
+    /* Stopping at len means no vcase[] entry is read for the final NUL */
+    while (i < len)
+    {
+        U8_NEXT(in, i, len, c);
+        if (c < 0) /* ill-formed UTF-8 */
+            break;
+
+        /* Keep one byte for the terminator. U8_APPEND's single-byte path
+         * does not check the capacity itself. Since vcasepos <= o, this
+         * also keeps the vcase[] index below MAXWORD. */
+        if (o >= MAXWORD - 1)
+            break;
+
+        if (c != inverted && !u_hasBinaryProperty(c, UCHAR_DIACRITIC))
+        {
+            if (vcase[vcasepos] == LCASE)
+                c = u_tolower(c);
+            else
+                c = u_toupper(c);
+            vcasepos += 1;
+        }
+
+        U8_APPEND(out, o, MAXWORD - 1, c, iserror);
+        if (iserror) /* multi-byte sequence did not fit; o is unchanged */
+            break;
+    }
+
+    out[o] = 0;
+}
Binary file zrustest has changed