find.c
author viric@llimona
Sat, 08 Sep 2007 09:12:50 +0200
changeset 28 75b6d5659a19
parent 24 026a2ba0ce16
permissions -rw-r--r--
Better browser support.
viric@14
     1
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>
#include "dictre.h"
viric@14
     7
viric@17
     8
/* Suffixes appended to the dictionary base name to build the two file names
 * opened by init_dictionary(). The storage-class specifier comes first:
 * placing it after a qualifier is an obsolescent form in ISO C. */
static const char indexext[] = ".index";
static const char dictext[] = ".dict";
viric@14
    10
viric@14
    11
/*
 * Return the size in bytes of the file named fname.
 *
 * If stat() fails, or if the size cannot be represented in the int return
 * type, a diagnostic is written to stderr and the process exits with -1.
 */
int get_filesize(const char *fname)
{
    struct stat st;

    if (stat(fname, &st) == -1)
    {
        /* fprintf() is allowed to modify errno, so remember the value that
         * stat() left behind and restore it for perror(). */
        int saved_errno = errno;

        fprintf(stderr, "Problem stating the file %s\n", fname);
        errno = saved_errno;
        perror("Error:");
        exit(-1);
    }

    /* st_size is an off_t and may be wider than int: refuse to truncate. */
    if (st.st_size > INT_MAX)
    {
        fprintf(stderr, "The file %s is too big\n", fname);
        exit(-1);
    }

    return (int) st.st_size;
}
viric@14
    25
viric@17
    26
void init_dictionary(struct Dict *d, const char *base)
viric@14
    27
{
viric@17
    28
    char *filename;
viric@17
    29
viric@17
    30
    filename = (char *) malloc(strlen(base) + 10);
viric@17
    31
viric@17
    32
    /* Prepare .index filename and open it*/
viric@17
    33
    strcpy(filename, base);
viric@17
    34
    strcat(filename, indexext);
viric@17
    35
viric@17
    36
    d->indexsize = get_filesize(filename);
viric@17
    37
    d->indexfd = open(filename, O_RDONLY);
viric@17
    38
    if (d->indexfd == -1)
viric@14
    39
    {
viric@17
    40
        fprintf(stderr, "Problem opening the file %s\n", filename);
viric@14
    41
        perror("Error:");
viric@14
    42
        exit(-1);
viric@14
    43
    }
viric@17
    44
    d->index = (unsigned char *) mmap(0, d->indexsize, PROT_READ, MAP_SHARED,
viric@17
    45
            d->indexfd, 0);
viric@14
    46
viric@17
    47
    /* Prepare .dict filename and open it*/
viric@17
    48
    strcpy(filename, base);
viric@17
    49
    strcat(filename, dictext);
viric@17
    50
    d->defs = fopen(filename, "r");
viric@17
    51
    if (d->defs == 0)
viric@14
    52
    {
viric@17
    53
        fprintf(stderr, "Problem opening the file %s\n", filename);
viric@14
    54
        perror("Error:");
viric@14
    55
        exit(-1);
viric@14
    56
    }
viric@17
    57
viric@21
    58
    d->trim_first_line = 0;
viric@21
    59
    d->trim_last_newlines = 0;
viric@21
    60
viric@17
    61
    free(filename);
viric@14
    62
}
viric@14
    63
viric@17
    64
void end_dictionary(struct Dict *d)
viric@14
    65
{
viric@17
    66
    munmap(d->index, d->indexsize);
viric@17
    67
    close(d->indexfd);
viric@17
    68
    fclose(d->defs);
viric@14
    69
}
viric@14
    70
viric@21
    71
/*
 * Remove the first line of def, together with any spaces that directly
 * follow its newline.
 *
 * len is the number of bytes in def INCLUDING the terminating '\0'. The text
 * after the first line is moved to the start of def and terminated again.
 *
 * Returns the new byte count, again including the '\0'. If def contains no
 * newline it is left untouched and len is returned.
 */
static int trim_first_line(char *def, int len)
{
    int i;

    for (i = 0; i < len; ++i)
    {
        int start;
        int rest;

        if (def[i] != '\n')
            continue;

        /* Skip the newline and the spaces after it, but never look at or
         * beyond the last byte, which belongs to the terminator. */
        start = i + 1;
        while (start < len - 1 && def[start] == ' ')
            ++start;
        if (start > len - 1)
            start = len - 1;

        /* Bytes of text left, not counting the terminator. */
        rest = len - 1 - start;
        memmove(def, def + start, rest);
        def[rest] = 0;
        return rest + 1 /*\0*/;
    }
    return len;
}
viric@21
    92
viric@21
    93
/*
 * Strip every trailing '\n' and '\r' from def.
 *
 * len is the number of bytes in def INCLUDING the terminating '\0'.
 *
 * Returns the new byte count, again including the '\0', so the result uses
 * the same convention whether or not anything was removed. A text made only
 * of newline characters becomes the empty string (return value 1).
 */
static int trim_last_newlines(char *def, int len)
{
    int last;

    if (len < 2)
        return len;

    /* def[len - 1] is the terminator, so the text ends at len - 2. */
    last = len - 2;
    while (last >= 0 && (def[last] == '\n' || def[last] == '\r'))
        --last;

    /* last is now the index of the final kept byte, or -1 if none is kept. */
    def[last + 1] = '\0';
    return last + 2;
}
viric@21
   110
viric@17
   111
static void fill_def(struct Dict *d, int offset, int length, char * def)
viric@14
   112
{
viric@17
   113
    fseek(d->defs, offset, SEEK_SET);
viric@17
   114
    fread(def, 1, length, d->defs);
viric@21
   115
    def[length] = 0;
viric@21
   116
    if (d->trim_first_line)
viric@21
   117
    {
viric@21
   118
        length = trim_first_line(def, length + 1/*\0*/) - 1 /*\0*/;
viric@21
   119
    }
viric@21
   120
    if (d->trim_last_newlines)
viric@21
   121
    {
viric@21
   122
        length = trim_last_newlines(def, length+1) - 1; /* math as above*/
viric@21
   123
    }
viric@14
   124
}
viric@14
   125
viric@21
   126
static int pointer_at_end(struct Dict *d, const unsigned char *ptr)
viric@14
   127
{
viric@17
   128
    if (ptr >= (d->index + d->indexsize))
viric@14
   129
        return 1;
viric@14
   130
    return 0;
viric@14
   131
}
viric@14
   132
viric@21
   133
/*
 * Starting at from, find the first '\n' (or NUL byte) in the index.
 *
 * Returns a pointer to that byte, or NULL if the end of the index is reached
 * first (including when from is already at the end).
 */
static const char * skip_until_newline(struct Dict *d, const char *from)
{
    for (;; ++from)
    {
        /* Check the bound before every dereference. The cast is needed
         * because pointer_at_end() takes an unsigned char pointer. */
        if (pointer_at_end(d, (const unsigned char *) from))
            return NULL;
        if (*from == '\n' || *from == 0)
            return from;
    }
}
viric@14
   145
viric@14
   146
/*
 * Compare the NUL-terminated word with the headword that starts at test.
 * The headword in test is ended by a tab character.
 *
 * Returns 0 when word equals the headword, 1 when word sorts after it and
 * -1 when word sorts before it (a word that is a proper prefix of the
 * headword sorts before it).
 */
static int compare(const unsigned char *word, const unsigned char *test)
{
    /* Walk over the part both strings have in common. */
    while (*word != 0 && *test != 0 && *word == *test)
    {
        ++word;
        ++test;
    }

    if (*word == 0)
    {
        /* word is used up: equal only if the headword ends here too. */
        return (*test == '\t') ? 0 : -1;
    }

    /* The headword ended first, so word is the longer one. */
    if (*test == '\t')
        return 1;

    /* Plain byte comparison at the first difference. */
    if (*word > *test)
        return 1;
    return -1;
}
viric@14
   172
viric@21
   173
/*
 * Look at the index line that follows the one containing from.
 *
 * Returns a pointer to the start of that line if its headword equals word,
 * or NULL if there is no following line or its headword differs.
 */
static const char * search_next(struct Dict *d, const char *word, const char *from)
{
    const char *next;

    next = skip_until_newline(d, from);
    if (next == NULL) /* pointer at end */
        return NULL;

    /* Step over the newline. If it was the last byte of the index there is
     * no further line, and reading it would run past the mapping. */
    next += 1;
    if (pointer_at_end(d, (const unsigned char *) next))
        return NULL;

    if (compare((const unsigned char *) word,
                (const unsigned char *) next) == 0)
        return next;
    return NULL;
}
viric@21
   183
viric@21
   184
static const char * bin_search(struct Dict *d, const char *word)
viric@14
   185
{
viric@14
   186
    int step, pivot;
viric@21
   187
    const char *ret;
viric@21
   188
    const char *test;
viric@21
   189
    int comparision;
viric@21
   190
    int found_once = 0;
viric@14
   191
viric@17
   192
    pivot = d->indexsize / 2;
viric@17
   193
    step = d->indexsize / 2;
viric@14
   194
viric@14
   195
    do
viric@14
   196
    {
viric@17
   197
        test = d->index + pivot;
viric@17
   198
        test = skip_until_newline(d, test);
viric@14
   199
        if (test == 0)
viric@14
   200
            return 0;
viric@14
   201
        test += 1; /* skip exactly the new line */
viric@14
   202
viric@14
   203
        comparision = compare(word, test);
viric@21
   204
        if (comparision <= 0)
viric@14
   205
        {
viric@21
   206
            if (comparision == 0)
viric@21
   207
                found_once = 1;
viric@21
   208
            /* If == 0, we don't know that it's the FIRST
viric@21
   209
             * match possible in the dictionary.
viric@21
   210
             * We want all possible matches. */
viric@14
   211
            step = step / 2;
viric@14
   212
            pivot = pivot - step;
viric@14
   213
        } else if (comparision > 0)
viric@14
   214
        {
viric@14
   215
            step = step / 2;
viric@14
   216
            pivot = pivot + step;
viric@14
   217
        }
viric@14
   218
    } while(step > 0);
viric@21
   219
viric@21
   220
    if (!found_once)
viric@21
   221
        return 0;
viric@21
   222
viric@21
   223
    if (comparision == 0) /* last comparision */
viric@21
   224
    {
viric@21
   225
        ret = skip_until_newline(d, d->index + pivot) + 1;
viric@21
   226
    } else
viric@21
   227
    {
viric@21
   228
        ret = skip_until_newline(d, test) + 1;
viric@21
   229
    }
viric@21
   230
    return ret;
viric@14
   231
}
viric@14
   232
viric@21
   233
/*
 * Read the integer field that starts at *pos and ends at the next tab or
 * newline, then advance *pos to the byte just after that delimiter.
 *
 * The conversion is done by str2int_len(), which receives the field start
 * and its length in bytes (presumably declared in dictre.h -- not visible
 * here). The scan for the delimiter is not bounded.
 */
static int my_get_int(const char **pos)
{
    const char *field = *pos;
    int width = 0;
    int val;

    /* Measure the field up to, but not including, its delimiter. */
    while (field[width] != '\t' && field[width] != '\n')
        ++width;

    val = str2int_len(field, width);
    *pos += width + 1;
    return val;
}
viric@14
   246
viric@17
   247
/*
 * Look word up in dictionary d and write its definition(s) into def.
 *
 * def becomes the empty string when word is not found. When several
 * consecutive index lines carry the same headword, their definitions are
 * written one after another, separated by ", ".
 *
 * def has no size parameter: the caller must supply a buffer large enough
 * for everything that gets written.
 */
void find_def(struct Dict *d, const char *word, char * def)
{
    const char *entry;
    const char *cursor;
    int offset, len;
    int skip;

    def[0] = 0;

    /* entry points at the start of the first matching index line. */
    entry = bin_search(d, word);
    if (entry == 0)
        return;

    /* Distance from the line start to the first number: word plus tab. */
    skip = strlen(word);
    skip += 1;

    for (;;)
    {
        entry += skip;
        cursor = entry;
        offset = my_get_int(&cursor); /* increments cursor */
        len = my_get_int(&cursor); /* increments cursor */
        fill_def(d, offset, len, def);

        /* Continue only while the following line has the same headword. */
        entry = search_next(d, word, entry);
        if (!entry)
            break;

        /* Append the separator and write the next definition after it. */
        strcat(def, ", ");
        def += strlen(def);
    }
}