Checks for repeated definitions.
author viric@mandarina
Mon, 13 Aug 2007 16:27:41 +0200
changeset 5 c87681fff7d3
parent 4 b2dfe3374454
child 6 bc41369f4587
Checks for repeated definitions.
Makefile
dictre.h
fastmalloc.c
load.c
main.c
no-difinoj.sed
nofont.sh
repeated.c
trim-nou8.c
--- a/Makefile	Sat Aug 11 16:12:27 2007 +0200
+++ b/Makefile	Mon Aug 13 16:27:41 2007 +0200
@@ -1,11 +1,12 @@
 CFLAGS=-O2 -g
 CC=gcc
 
-all: dictre idx2index
+all: dictre idx2index trim-nou8
 
 idx2index: idx2index.c
+trim-nou8: trim-nou8.c
 
-dictre: load.o dict.o write.o sort.o filter.o main.o fastmalloc.o
+dictre: load.o dict.o write.o sort.o filter.o main.o fastmalloc.o repeated.o
 	$(CC) -o $@ $^
 
 dict.c: dictre.h
@@ -15,3 +16,4 @@
 filter.c: dictre.h
 main.c: dictre.h
 fastmalloc.c: dictre.h
+repeated.c: dictre.h
--- a/dictre.h	Sat Aug 11 16:12:27 2007 +0200
+++ b/dictre.h	Mon Aug 13 16:27:41 2007 +0200
@@ -30,7 +30,6 @@
 
 /* sort.c */
 int sort_words();
-
 /* load.c */
 void load_init();
 void load_dictionary(FILE *index, FILE *fdefs);
@@ -38,3 +37,9 @@
 
 /* fastmalloc */
 void * fastmalloc(int newsize);
+
+/* repeated.c */
+void new_hashdef(struct Def *ptr, int index);
+int def_repeated(struct Def *ptr);
+void init_repeated();
+void remove_def(int i);
--- a/fastmalloc.c	Sat Aug 11 16:12:27 2007 +0200
+++ b/fastmalloc.c	Mon Aug 13 16:27:41 2007 +0200
@@ -13,8 +13,7 @@
 void * fastmalloc(int newsize)
 {
     void *outptr;
-
-    outptr = base + given;
+    int old_given = given;
 
     given += newsize;
 
@@ -32,5 +31,7 @@
         }
     }
 
+    outptr = base + old_given;
+
     return outptr;
 }
--- a/load.c	Sat Aug 11 16:12:27 2007 +0200
+++ b/load.c	Mon Aug 13 16:27:41 2007 +0200
@@ -14,7 +14,7 @@
 int dont_touch[20];
 int ndont_touch;
 
-void load_init()
+void init_load()
 {
     ndefs = 0;
     nwords = 0;
@@ -69,6 +69,7 @@
 {
     struct Word w;
     int last_offset = 0;
+    int def_avoided = 0;
 
     do {
         int offset, length;
@@ -76,6 +77,7 @@
         w.w = get_word(index);
         if (w.w == 0)
             break;
+        /*printf("Word: %s\n", w.w);*/
         offset = get_int(index);
         length = get_int(index);
         if (offset > last_offset)
@@ -85,10 +87,26 @@
         }
         else
             w.def = search_def(offset, length);
-        if (w.def == -1)
+        if (w.def == -1) 
         {
+            /* New definition */
+            int newindex, repindex;
             defstr = get_def(fdefs, offset, length);
-            w.def = new_def(defstr, offset, length);
+            newindex = new_def(defstr, offset, length);
+            
+            /* Store it in the hash for repeated defs */
+            repindex = def_repeated(&defs[newindex]);
+            if (repindex != -1) 
+            {
+                def_avoided += 1;
+                printf("Repeated def avoided %i (word %s)\n", def_avoided, w.w);
+                remove_def(newindex);
+                newindex = repindex;
+            } else
+                new_hashdef(&defs[newindex], newindex);
+
+            /* Store the final index */
+            w.def = newindex;
         }
         /* sizeof -1  instead of strlen() */
         if (strncmp(w.w, "00database", sizeof("00database") - 1) == 0)
--- a/main.c	Sat Aug 11 16:12:27 2007 +0200
+++ b/main.c	Mon Aug 13 16:27:41 2007 +0200
@@ -1,4 +1,5 @@
 #include <stdio.h>
+#include <sys/stat.h>
 
 #include "dictre.h"
 
@@ -9,11 +10,12 @@
 {
     char tmpname[500];
     FILE *i, *d;
+    int remove_tmp_data = 0;
 
-    if (argn < 4)
+    if (argn < 3)
     {
         fprintf(stderr, "usage: %s <dict_basename> "
-                "<dict_basename_out> <filter>\n",
+                "<dict_basename_out> [filter]\n",
                 argv[0]);
         return 1;
     }
@@ -32,24 +34,51 @@
     d = fopen(tmpname, "r");
     if(d == NULL)
     {
-        fprintf(stderr, "File: %s ", tmpname);
-        perror("- cannot open file.");
-        exit(-1);
+        struct stat st;
+        int res;
+        char tmp[500];
+        strcat(tmpname, ".dz");
+        res = stat(tmpname, &st);
+        if (res == -1)
+        {
+            fprintf(stderr, "File: %s ", tmpname);
+            perror("- cannot open file.");
+            exit(-1);
+        }
+        sprintf(tmp, "gzip -cd %s > /tmp/tmp.dict",
+                tmpname);
+        printf("Gunzipping...\n");
+        res = system(tmp);
+        d = fopen("/tmp/tmp.dict", "r");
+        if(d == NULL || res != 0)
+        {
+            fprintf(stderr, "Error gunzipping file: %s ", tmpname);
+            perror("- something happened to /tmp/tmp.dict.");
+            exit(-1);
+        }
+        remove_tmp_data = 1;
     }
 
-    load_init();
+    init_load();
+    init_repeated();
+    /* Always line buffered on stdout, for 'ts' */
+    setlinebuf(stdout);
 
     load_dictionary(i, d);
 
     fclose(i);
     fclose(d);
 
+    if (remove_tmp_data)
+        unlink("/tmp/tmp.dict");
+
     sort_words();
 
     if (0)
         print_words();
 
-    filter_all(argv[3]);
+    if (argn >= 4)
+        filter_all(argv[3]);
 
     write_dictionary(argv[2]);
 
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/no-difinoj.sed	Mon Aug 13 16:27:41 2007 +0200
@@ -0,0 +1,4 @@
+#!/bin/sed -f
+# Delete every line that starts with "----" and every empty line.
+/^----.*$/d
+/^$/d
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/nofont.sh	Mon Aug 13 16:27:41 2007 +0200
@@ -0,0 +1,3 @@
+#!/bin/sh
+# Filter stdin through ./trim-nou8, then delete every <fon...> and </fon...> tag.
+./trim-nou8 | sed 's/<\/\?fon[^>]*>//g'
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/repeated.c	Mon Aug 13 16:27:41 2007 +0200
@@ -0,0 +1,107 @@
+#include <stdio.h>
+#include "dictre.h"
+
+extern struct Def defs[];
+extern int ndefs;
+
+enum
+{
+    MAXLEN = 200    /* number of hash rows; calc_hash() reduces lengths modulo this */
+};
+
+struct HashElement
+{
+    struct HashElement *next;   /* following element of the same row; 0 at the tail */
+    struct Def *def;            /* definition this element refers to (not a copy) */
+    int index;                  /* value def_repeated() returns when def matches */
+};
+
+static struct HashElement *dhash[MAXLEN];       /* first element of each row */
+static int ndhash[MAXLEN];                      /* number of elements in each row */
+static struct HashElement *dhash_last[MAXLEN];  /* last element of each row */
+
+/* Mark every hash row as empty. new_hashdef() and def_repeated() check these
+ * counts before dereferencing a row pointer, so the pointers are left as is. */
+void init_repeated()
+{
+    int row = MAXLEN;
+    while (row-- > 0)
+        ndhash[row] = 0;
+}
+
+/* Drop defs[i]: shrink the table by one and slide later entries down a slot. */
+void remove_def(int i)
+{
+    for(--ndefs; i<ndefs; ++i)
+        defs[i] = defs[i+1];
+}
+
+/*
+static void remove_hashdef(struct Def *ptr, int hash)
+{
+    int i;
+    struct HashElement *root = dhash[hash];
+
+    for(i=0; i<ndhash[hash]; ++i)
+        if (root[i] == ptr)
+            break;
+
+    ndhash[hash]--;
+    for(; i<ndhash[hash]; ++i)
+        root[i] = root[i+1];
+}
+*/
+
+/* Hash row for a definition: its length modulo MAXLEN, so definitions of
+ * equal length share a row. NOTE(review): a negative length would yield a
+ * negative row - confirm lengths are never negative. */
+static int calc_hash(struct Def *ptr)
+{
+    return ptr->length % MAXLEN;
+}
+
+/* Append a definition to the tail of its hash row.
+ *   ptr   - definition to remember; the pointer itself is stored, not a copy
+ *   index - value def_repeated() hands back when this definition matches
+ * The element comes from fastmalloc(); its result is used unchecked. */
+void new_hashdef(struct Def *ptr, int index)
+{
+    const int row = calc_hash(ptr);
+    struct HashElement *node = (struct HashElement *) fastmalloc(sizeof(*node));
+
+    node->next = 0;
+    node->def = ptr;
+    node->index = index;
+
+    if (ndhash[row] == 0)
+        /* Empty row: the new element becomes its head */
+        dhash[row] = node;
+    else
+        /* Otherwise chain it after the current tail */
+        dhash_last[row]->next = node;
+
+    dhash_last[row] = node;
+    ndhash[row] += 1;
+}
+
+/* Return the index recorded for an already-hashed definition whose length
+ * equals ptr's and whose text matches under strncmp(), or -1 if none does. */
+int def_repeated(struct Def *ptr)
+{
+    int hash = calc_hash(ptr);
+    int i;
+    struct HashElement *h = dhash[hash];
+
+    /* Step h along the row on every pass; without this only the row's first
+     * element was ever compared, so later repeats went undetected. */
+    for(i=0; i < ndhash[hash]; ++i, h = h->next)
+    {
+        struct Def *hdef = h->def;
+        if (hdef->length == ptr->length
+                && (strncmp(hdef->d, ptr->d, ptr->length) == 0))
+            /* Repeated found! */
+            return h->index;
+    }
+    /* Not found */
+    return -1;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/trim-nou8.c	Mon Aug 13 16:27:41 2007 +0200
@@ -0,0 +1,17 @@
+#include <stdio.h>
+
+/* Filter stdin to stdout, passing only '\n' and bytes in the range
+ * ' '..0xfd; every other byte is dropped. */
+int main()
+{
+    int c;
+
+    /* getchar() yields the byte as an unsigned char value, or EOF */
+    while ((c = getchar()) != EOF)
+    {
+        if (c != '\n' && (c < ' ' || c >= 0xfe))
+            continue;
+        putchar(c);
+    }
+    return 0;
+}