Checks for repeated definitions.
--- a/Makefile Sat Aug 11 16:12:27 2007 +0200
+++ b/Makefile Mon Aug 13 16:27:41 2007 +0200
@@ -1,11 +1,12 @@
CFLAGS=-O2 -g
CC=gcc
-all: dictre idx2index
+all: dictre idx2index trim-nou8
idx2index: idx2index.c
+trim-nou8: trim-nou8.c
-dictre: load.o dict.o write.o sort.o filter.o main.o fastmalloc.o
+dictre: load.o dict.o write.o sort.o filter.o main.o fastmalloc.o repeated.o
$(CC) -o $@ $^
dict.c: dictre.h
@@ -15,3 +16,4 @@
filter.c: dictre.h
main.c: dictre.h
fastmalloc.c: dictre.h
+repeated.c: dictre.h
--- a/dictre.h Sat Aug 11 16:12:27 2007 +0200
+++ b/dictre.h Mon Aug 13 16:27:41 2007 +0200
@@ -30,7 +30,6 @@
/* sort.c */
int sort_words();
-
/* load.c */
void load_init();
void load_dictionary(FILE *index, FILE *fdefs);
@@ -38,3 +37,9 @@
/* fastmalloc */
void * fastmalloc(int newsize);
+
+/* repeated.c */
+void new_hashdef(struct Def *ptr, int index);
+int def_repeated(struct Def *ptr);
+void init_repeated();
+void remove_def(int i);
--- a/fastmalloc.c Sat Aug 11 16:12:27 2007 +0200
+++ b/fastmalloc.c Mon Aug 13 16:27:41 2007 +0200
@@ -13,8 +13,7 @@
void * fastmalloc(int newsize)
{
void *outptr;
-
- outptr = base + given;
+ int old_given = given;
given += newsize;
@@ -32,5 +31,7 @@
}
}
+ outptr = base + old_given;
+
return outptr;
}
--- a/load.c Sat Aug 11 16:12:27 2007 +0200
+++ b/load.c Mon Aug 13 16:27:41 2007 +0200
@@ -14,7 +14,7 @@
int dont_touch[20];
int ndont_touch;
-void load_init()
+void init_load()
{
ndefs = 0;
nwords = 0;
@@ -69,6 +69,7 @@
{
struct Word w;
int last_offset = 0;
+ int def_avoided = 0;
do {
int offset, length;
@@ -76,6 +77,7 @@
w.w = get_word(index);
if (w.w == 0)
break;
+ /*printf("Word: %s\n", w.w);*/
offset = get_int(index);
length = get_int(index);
if (offset > last_offset)
@@ -85,10 +87,26 @@
}
else
w.def = search_def(offset, length);
- if (w.def == -1)
+ if (w.def == -1)
{
+ /* New definition */
+ int newindex, repindex;
defstr = get_def(fdefs, offset, length);
- w.def = new_def(defstr, offset, length);
+ newindex = new_def(defstr, offset, length);
+
+ /* Store it in the hash for repeated defs */
+ repindex = def_repeated(&defs[newindex]);
+ if (repindex != -1)
+ {
+ def_avoided += 1;
+ printf("Repeated def avoided %i (word %s)\n", def_avoided, w.w);
+ remove_def(newindex);
+ newindex = repindex;
+ } else
+ new_hashdef(&defs[newindex], newindex);
+
+ /* Store the final index */
+ w.def = newindex;
}
/* sizeof -1 instead of strlen() */
if (strncmp(w.w, "00database", sizeof("00database") - 1) == 0)
--- a/main.c Sat Aug 11 16:12:27 2007 +0200
+++ b/main.c Mon Aug 13 16:27:41 2007 +0200
@@ -1,4 +1,5 @@
#include <stdio.h>
+#include <sys/stat.h>
#include "dictre.h"
@@ -9,11 +10,12 @@
{
char tmpname[500];
FILE *i, *d;
+ int remove_tmp_data = 0;
- if (argn < 4)
+ if (argn < 3)
{
fprintf(stderr, "usage: %s <dict_basename> "
- "<dict_basename_out> <filter>\n",
+ "<dict_basename_out> [filter]\n",
argv[0]);
return 1;
}
@@ -32,24 +34,51 @@
d = fopen(tmpname, "r");
if(d == NULL)
{
- fprintf(stderr, "File: %s ", tmpname);
- perror("- cannot open file.");
- exit(-1);
+ struct stat st;
+ int res;
+ char tmp[500];
+ strcat(tmpname, ".dz");
+ res = stat(tmpname, &st);
+ if (res == -1)
+ {
+ fprintf(stderr, "File: %s ", tmpname);
+ perror("- cannot open file.");
+ exit(-1);
+ }
+ sprintf(tmp, "gzip -cd %s > /tmp/tmp.dict",
+ tmpname);
+ printf("Gunzipping...\n");
+ res = system(tmp);
+ d = fopen("/tmp/tmp.dict", "r");
+ if(d == NULL || res != 0)
+ {
+ fprintf(stderr, "Error gunzipping file: %s ", tmpname);
+ perror("- something happened to /tmp/tmp.dict.");
+ exit(-1);
+ }
+ remove_tmp_data = 1;
}
- load_init();
+ init_load();
+ init_repeated();
+ /* Always line buffered on stdout, for 'ts' */
+ setlinebuf(stdout);
load_dictionary(i, d);
fclose(i);
fclose(d);
+ if (remove_tmp_data)
+ unlink("/tmp/tmp.dict");
+
sort_words();
if (0)
print_words();
- filter_all(argv[3]);
+ if (argn >= 4)
+ filter_all(argv[3]);
write_dictionary(argv[2]);
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/no-difinoj.sed Mon Aug 13 16:27:41 2007 +0200
@@ -0,0 +1,4 @@
+#!/bin/sed -f
+# Delete lines that start with "----" and lines that are empty.
+/^----.*$/d
+/^$/d
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/nofont.sh Mon Aug 13 16:27:41 2007 +0200
@@ -0,0 +1,3 @@
+#!/bin/sh
+# Run ./trim-nou8 on stdin, then delete <fon...> and </fon...> tags (GNU sed \?).
+./trim-nou8 | sed 's/<\/\?fon[^>]*>//g'
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/repeated.c Mon Aug 13 16:27:41 2007 +0200
@@ -0,0 +1,107 @@
+#include <stdio.h>
+#include "dictre.h"
+
+extern struct Def defs[];
+extern int ndefs;
+
+enum
+{
+ MAXLEN = 200 /* number of hash buckets; calc_hash() reduces lengths modulo this */
+};
+
+struct HashElement /* one node of a bucket's singly linked chain */
+{
+ struct HashElement *next; /* following node in the same bucket; 0 at the tail */
+ struct Def *def; /* definition this node refers to */
+ int index; /* value def_repeated() returns when this node matches */
+};
+
+static struct HashElement *dhash[MAXLEN]; /* first node of each bucket's chain */
+static int ndhash[MAXLEN]; /* number of nodes currently in each bucket */
+static struct HashElement *dhash_last[MAXLEN]; /* last node of each bucket; new_hashdef() appends here */
+
+/* Mark every bucket as empty by zeroing its node count.  Only ndhash[] is
+ * touched here; the dhash[] and dhash_last[] pointers are left as they are. */
+void init_repeated()
+{
+    int bucket = MAXLEN;
+    while (bucket-- > 0)
+        ndhash[bucket] = 0;
+}
+
+/* Drop defs[i]: shrink ndefs by one and shift every later entry down a slot. */
+void remove_def(int i)
+{
+    for (--ndefs; i < ndefs; ++i)
+        defs[i] = defs[i + 1];
+}
+
+/*
+static void remove_hashdef(struct Def *ptr, int hash)
+{
+ int i;
+ struct HashElement *root = dhash[hash];
+
+ for(i=0; i<ndhash[hash]; ++i)
+ if (root[i] == ptr)
+ break;
+
+ ndhash[hash]--;
+ for(; i<ndhash[hash]; ++i)
+ root[i] = root[i+1];
+}
+*/
+
+/*
+ * Bucket number for a definition: its length reduced modulo MAXLEN.
+ */
+static int calc_hash(struct Def *ptr)
+{
+    return ptr->length % MAXLEN;
+}
+
+/* Register ptr in the table used by def_repeated(), remembering index as the
+ * value to report on a match.  The node is appended to its bucket's chain. */
+void new_hashdef(struct Def *ptr, int index)
+{
+    int bucket = calc_hash(ptr);
+    struct HashElement *node;
+
+    node = (struct HashElement *) fastmalloc(sizeof(*node));
+    node->next = 0;
+    node->def = ptr;
+    node->index = index;
+
+    if (ndhash[bucket] == 0)
+    {
+        /* First node of this bucket becomes the chain head */
+        dhash[bucket] = node;
+    } else
+    {
+        dhash_last[bucket]->next = node;
+    }
+    dhash_last[bucket] = node;
+    ndhash[bucket] += 1;
+}
+
+/* Look up a previously registered definition with the same text as ptr.
+ * Returns the index stored by new_hashdef() for it, or -1 if none matches. */
+int def_repeated(struct Def *ptr)
+{
+    int hash = calc_hash(ptr);
+    int i;
+    struct HashElement *h;
+
+    /* Visit every node of the bucket: h must advance on each iteration,
+     * otherwise only the chain head would ever be compared. */
+    for (i = 0, h = dhash[hash]; i < ndhash[hash]; ++i, h = h->next)
+    {
+        struct Def *hdef = h->def;
+        /* Same length and same text: this definition is a repeat */
+        if (hdef->length == ptr->length
+                && strncmp(hdef->d, ptr->d, ptr->length) == 0)
+            return h->index;
+    }
+
+    return -1;  /* Not found */
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/trim-nou8.c Mon Aug 13 16:27:41 2007 +0200
@@ -0,0 +1,17 @@
+#include <stdio.h>
+
+/* Filter stdin to stdout: newlines and bytes in the range 0x20..0xfd pass
+ * through unchanged; every other byte is dropped. */
+int main()
+{
+    int c;
+
+    /* getchar() yields each byte as an unsigned char widened to int */
+    while ((c = getchar()) != EOF)
+    {
+        if (c != '\n' && (c < ' ' || c >= 0xfe))
+            continue;
+        putchar(c);
+    }
+    return 0;
+}