find.c
author viric@llimona
Sun, 02 Sep 2007 16:01:27 +0200
changeset 27 153c479aa0bc
parent 24 026a2ba0ce16
permissions -rw-r--r--
LEGUMIN kaj aliaj gxisdatigoj.

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include "dictre.h"

/* Suffixes appended to the dictionary base name to build its two file names. */
static const char indexext[] = ".index";
static const char dictext[] = ".dict";

/*
 * Return the size in bytes of the file named fname.
 *
 * This function never reports an error to the caller: if the file cannot
 * be stat'ed, or its size does not fit in the int return type, a message
 * is written to stderr and the process terminates with exit(-1).
 */
int get_filesize(const char *fname)
{
    struct stat st;

    if (stat(fname, &st) == -1)
    {
        /* fprintf may overwrite errno, so keep stat's value for perror. */
        int saved_errno = errno;

        fprintf(stderr, "Problem stating the file %s\n", fname);
        errno = saved_errno;
        perror("Error:");
        exit(-1);
    }

    /* st_size is an off_t, which can be wider than int: refuse to
     * truncate silently. */
    if (st.st_size > INT_MAX)
    {
        fprintf(stderr, "The file %s is too big to be handled\n", fname);
        exit(-1);
    }

    return (int) st.st_size;
}

/*
 * Report a failed operation on one of the dictionary files and terminate.
 * action is the gerund used in the message ("opening", "mapping"); errno
 * must still hold the error of the failed call when this is entered.
 */
static void dictionary_file_failure(const char *action, const char *filename)
{
    /* fprintf may overwrite errno, so keep the original value for perror. */
    int saved_errno = errno;

    fprintf(stderr, "Problem %s the file %s\n", action, filename);
    errno = saved_errno;
    perror("Error:");
    exit(-1);
}

/*
 * Open the dictionary whose files are "<base>.index" and "<base>.dict".
 *
 * The index file is mapped read-only into memory (d->index, d->indexsize,
 * d->indexfd) and the definitions file is opened as a stdio stream
 * (d->defs). Both trimming options of d are switched off.
 *
 * Any failure prints a message to stderr and terminates the process with
 * exit(-1); the function only returns on success.
 */
void init_dictionary(struct Dict *d, const char *base)
{
    char *filename;
    void *mapping;
    /* sizeof counts the terminating NUL, so the longer suffix gives the
     * room needed after base for either file name. */
    size_t suffix_room = (sizeof indexext > sizeof dictext)
        ? sizeof indexext : sizeof dictext;

    filename = malloc(strlen(base) + suffix_room);
    if (filename == NULL)
    {
        fprintf(stderr, "Out of memory preparing the file names for %s\n",
                base);
        exit(-1);
    }

    /* Prepare .index filename and open it */
    strcpy(filename, base);
    strcat(filename, indexext);

    d->indexsize = get_filesize(filename);
    d->indexfd = open(filename, O_RDONLY);
    if (d->indexfd == -1)
        dictionary_file_failure("opening", filename);

    /* mmap signals failure with MAP_FAILED rather than a null pointer. */
    mapping = mmap(0, d->indexsize, PROT_READ, MAP_SHARED, d->indexfd, 0);
    if (mapping == MAP_FAILED)
        dictionary_file_failure("mapping", filename);
    d->index = (unsigned char *) mapping;

    /* Prepare .dict filename and open it */
    strcpy(filename, base);
    strcat(filename, dictext);
    d->defs = fopen(filename, "r");
    if (d->defs == 0)
        dictionary_file_failure("opening", filename);

    d->trim_first_line = 0;
    d->trim_last_newlines = 0;

    free(filename);
}

/*
 * Release what a dictionary holds: the mapping of the index file, the
 * descriptor of the index file and the stream of the definitions file.
 * The results of the three calls are deliberately not inspected.
 */
void end_dictionary(struct Dict *d)
{
    (void) munmap(d->index, d->indexsize);
    (void) close(d->indexfd);
    (void) fclose(d->defs);
}

/*
 * Remove the first line of def, together with the spaces that directly
 * follow its newline, by shifting the remaining text to the start of the
 * buffer.
 *
 * def: buffer holding the text; def[len-1] is expected to be the NUL.
 * len: number of bytes in def, terminating NUL included.
 *
 * Returns the new number of bytes, terminating NUL included. When def
 * contains no newline the buffer is left untouched and len is returned.
 */
static int trim_first_line(char *def, int len)
{
    int i;
    int kept;

    for(i=0; i < len; ++i)
    {
        if (def[i] != '\n')
            continue;

        /* Skip the newline and the spaces after it, but never read past
         * the len bytes we were given. */
        for(i+=1; i < len && def[i] == ' '; ++i)
            ;

        /* Characters that survive, not counting the final NUL. This can
         * only go negative when the buffer was not NUL terminated. */
        kept = len - i - 1;
        if (kept < 0)
            kept = 0;

        memmove(def, def + i, kept);
        def[kept] = 0;
        return kept + 1/*\0*/;
    }
    return len;
}

/*
 * Cut the trailing '\n' and '\r' characters off def.
 *
 * def: buffer holding the text; def[len-1] is expected to be the NUL.
 * len: number of bytes in def, terminating NUL included.
 *
 * Returns the new number of bytes, terminating NUL included (the caller
 * subtracts one to get the string length). A text made only of line
 * breaks becomes the empty string.
 */
static int trim_last_newlines(char *def, int len)
{
    int end;

    if (len < 2)
        return len;

    /* end is the index where the terminating NUL belongs; walk it back
     * over every trailing line break. */
    end = len - 1;
    while (end > 0 && (def[end - 1] == '\n' || def[end - 1] == '\r'))
        --end;

    def[end] = '\0';
    return end + 1/*\0*/;
}

/*
 * Read one definition from the definitions file into def.
 *
 * d:      dictionary whose defs stream is read; its trim_first_line and
 *         trim_last_newlines flags select the optional trimming.
 * offset: byte position of the definition inside the file.
 * length: number of bytes to read.
 * def:    destination; it must have room for length + 1 bytes.
 *
 * def always ends up NUL terminated. It is left as the empty string when
 * offset or length is negative or when the seek fails, and it holds only
 * the bytes actually read when the file is shorter than requested.
 */
static void fill_def(struct Dict *d, int offset, int length, char * def)
{
    size_t got;

    def[0] = 0;
    if (offset < 0 || length < 0)
        return;
    if (fseek(d->defs, offset, SEEK_SET) != 0)
        return;

    /* Terminate after what was really read, so a short read never leaves
     * uninitialised bytes inside the string. */
    got = fread(def, 1, (size_t) length, d->defs);
    def[got] = 0;
    length = (int) got;

    if (d->trim_first_line)
    {
        length = trim_first_line(def, length + 1/*\0*/) - 1 /*\0*/;
    }
    if (d->trim_last_newlines)
    {
        /* The resulting length is not needed any more. */
        (void) trim_last_newlines(def, length + 1/*\0*/);
    }
}

/*
 * Tell whether ptr lies at or beyond the end of the index of d, that is
 * at or after d->index + d->indexsize. Returns 1 in that case, 0 while
 * ptr is still before that address.
 */
static int pointer_at_end(struct Dict *d, const unsigned char *ptr)
{
    return (ptr < (d->index + d->indexsize)) ? 0 : 1;
}

/*
 * Walk forward from 'from' to the first '\n' or NUL byte of the index.
 *
 * Returns a pointer to that byte, or 0 when the end of the index of d is
 * reached first (including when 'from' already is at the end).
 */
static const char * skip_until_newline(struct Dict *d, const char *from)
{
    const char *cursor;

    /* The end check runs before every byte is looked at. */
    for(cursor = from; !pointer_at_end(d, cursor); ++cursor)
    {
        if (*cursor == '\n' || *cursor == 0)
            return cursor;
    }
    return 0;
}

/*
 * Compare the NUL terminated word with the key at the start of test,
 * where the key of test is terminated by a tab.
 *
 * Returns  0 when word ends exactly where test has its tab,
 *         -1 when word ends first, or holds the smaller byte at the
 *            first difference,
 *          1 when test reaches its tab first, or word holds the larger
 *            byte at the first difference.
 */
static int compare(const unsigned char *word, const unsigned char *test)
{
    /* Advance over the common prefix; stop at the end of either string. */
    while (*word != 0 && *test != 0 && *word == *test)
    {
        ++word;
        ++test;
    }

    if (*word == 0)
        return (*test == '\t') ? 0 : -1;
    if (*test == '\t')
        return 1;

    /* Equal bytes cannot happen here; they would give -1 as well. */
    return (*word > *test) ? 1 : -1;
}

/*
 * Check whether the index line following the one 'from' points into is
 * another entry for word.
 *
 * Returns a pointer to the start of that next line when its key equals
 * word, and 0 when there is no next line or its key differs.
 */
static const char * search_next(struct Dict *d, const char *word, const char *from)
{
    const char *newline;
    const char *next;

    /* Test for "not found" before doing pointer arithmetic: adding to a
     * null pointer is undefined. */
    newline = skip_until_newline(d, from);
    if (newline == 0)
        return 0;

    /* The line break may be the very last byte of the index; there is
     * nothing to compare beyond it. */
    next = newline + 1;
    if (pointer_at_end(d, (const unsigned char *) next))
        return 0;

    if (compare((const unsigned char *) word,
                (const unsigned char *) next) == 0)
        return next;
    return 0;
}

/*
 * Return the offset of the first line of the index that starts at or
 * after pos, or d->indexsize when no line starts there. Offset 0 counts
 * as a line start; every other line starts right after a byte at which
 * skip_until_newline stops.
 */
static int line_start_from(struct Dict *d, int pos)
{
    const char *base = (const char *) d->index;
    const char *newline;

    if (pos <= 0)
        return 0;

    /* Start one byte earlier, so that pos itself is recognised as a line
     * start when the previous byte is the line break. */
    newline = skip_until_newline(d, base + pos - 1);
    if (newline == 0)
        return d->indexsize;
    return (int) (newline - base) + 1;
}

/*
 * Find the first entry of the index whose key equals word.
 *
 * The index is searched by bisection over byte offsets, which relies on
 * its lines being sorted in the order compare() defines. Unlike a plain
 * halving of the step, the bounds are kept exact, so the line at offset 0
 * is considered too, the earliest of several equal keys is the one
 * returned, and the candidate is checked before being returned.
 *
 * Returns a pointer to the start of the matching line, or 0 when word is
 * not in the index.
 *
 * NOTE(review): compare() has no length limit; a final line without a
 * trailing newline could still make it read past the mapping.
 */
static const char * bin_search(struct Dict *d, const char *word)
{
    const char *base = (const char *) d->index;
    int low = 0;
    int high = d->indexsize;
    int start;

    /* Invariant: every line starting before low is smaller than word,
     * and every line starting at or after high is not smaller. */
    while (low < high)
    {
        int mid = low + (high - low) / 2;

        start = line_start_from(d, mid);
        if (start < high
            && compare((const unsigned char *) word,
                       (const unsigned char *) base + start) > 0)
        {
            /* The line at start, and all before it, are too small. */
            low = start + 1;
        } else
        {
            /* Either no line starts in [mid, high), or the first one
             * there is already not smaller than word. */
            high = mid;
        }
    }

    /* The first line starting at or after low is the only candidate. */
    start = line_start_from(d, low);
    if (start >= d->indexsize)
        return 0;
    if (compare((const unsigned char *) word,
                (const unsigned char *) base + start) != 0)
        return 0;
    return base + start;
}

/*
 * Decode the field *pos points at and step over it.
 *
 * The field runs up to the next tab or newline; its characters are
 * handed to str2int_len together with their count. On return *pos points
 * just past the delimiter that ended the field.
 */
static int my_get_int(const char **pos)
{
    const char *field = *pos;
    int width = 0;
    int val;

    while (field[width] != '\t' && field[width] != '\n')
        ++width;

    val = str2int_len(field, width);
    *pos += width + 1; /* the field plus its delimiter */
    return val;
}

/*
 * Look word up in the dictionary d and store its definition(s) in def.
 *
 * def is set to the empty string when word is not found. When several
 * consecutive index entries carry the same key, their definitions are
 * written one after another, separated by ", ". No buffer size is
 * passed in, so def must be big enough for everything that is written.
 */
void find_def(struct Dict *d, const char *word, char * def)
{
    const char *entry;
    const char *cursor;
    int skip;
    int offset, len;

    def[0] = 0;
    entry = bin_search(d, word);
    if (entry == 0)
        return;

    /* Distance from the start of an entry to its first number: the key
     * and the tab that follows it. */
    skip = strlen(word) + 1;

    for(;;)
    {
        entry += skip;
        cursor = entry;
        offset = my_get_int(&cursor); /* moves cursor to the next field */
        len = my_get_int(&cursor);
        fill_def(d, offset, len, def);

        entry = search_next(d, word, entry);
        if (entry == 0)
            return;

        /* Another entry follows: append the separator and continue
         * writing after it. */
        strcat(def, ", ");
        def += strlen(def);
    }
}