find.c
author viric@llimona
Sat, 01 Sep 2007 01:19:18 +0200
changeset 16 b4e251400e36
parent 14 a961bb8806b9
child 17 d95d9e7a2b81
permissions -rw-r--r--
Improved hash on zprocess, and added parsing for "jo".
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
14
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
     1
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include "dictre.h"
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
     7
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
     8
/* Start of the memory-mapped index file; set by init_dictionary(). */
static unsigned char *index;
/* File descriptor the index mapping was created from. */
static int indexfd;
/* Size in bytes of the index file, and therefore of the mapping. */
static int indexsize;
/* Stream open on the definitions file; fill_def() reads from it. */
static FILE *defs;
/* File names, opened relative to the current working directory. */
static const char indexname[] = "akcentiga.index";
static const char dictname[] = "akcentiga.dict";
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    14
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    15
/*
 * Return the size in bytes of the file named fname.
 *
 * The function only returns on success.  If stat() fails, or if the file
 * size does not fit in the int return type, a diagnostic is written to
 * stderr and the process exits with status -1.
 */
int get_filesize(const char *fname)
{
    struct stat st;

    if (stat(fname, &st) == -1)
    {
        /* fprintf() may overwrite errno; keep stat()'s value for perror(). */
        int saved_errno = errno;

        fprintf(stderr, "Problem stating the file %s\n", fname);
        errno = saved_errno;
        perror("Error:");
        exit(-1);
    }

    /* st_size is an off_t and may be wider than int; refuse to truncate. */
    if (st.st_size > INT_MAX)
    {
        fprintf(stderr, "The file %s is too big\n", fname);
        exit(-1);
    }

    return (int) st.st_size;
}
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    29
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    30
void init_dictionary()
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    31
{
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    32
    indexsize = get_filesize(indexname);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    33
    indexfd = open(indexname, O_RDONLY);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    34
    if (indexfd == -1)
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    35
    {
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    36
        fprintf(stderr, "Problem opening the file %s\n", indexname);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    37
        perror("Error:");
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    38
        exit(-1);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    39
    }
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    40
    index = (unsigned char *) mmap(0, indexsize, PROT_READ, MAP_SHARED,
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    41
            indexfd, 0);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    42
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    43
    defs = fopen(dictname, "r");
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    44
    if (defs == 0)
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    45
    {
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    46
        fprintf(stderr, "Problem opening the file %s\n", dictname);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    47
        perror("Error:");
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    48
        exit(-1);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    49
    }
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    50
}
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    51
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    52
void end_dictionary()
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    53
{
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    54
    munmap(index, indexsize);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    55
    close(indexfd);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    56
    fclose(defs);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    57
}
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    58
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    59
static void fill_def(int offset, int length, char * def)
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    60
{
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    61
    fseek(defs, offset, SEEK_SET);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    62
    fread(def, 1, length, defs);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    63
}
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    64
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    65
static int pointer_at_end(unsigned char *ptr)
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    66
{
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    67
    if (ptr >= (index + indexsize))
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    68
        return 1;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    69
    return 0;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    70
}
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    71
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    72
/*
 * Starting at `from`, find the first newline or NUL byte inside the
 * mapped index.
 *
 * Returns a pointer to that byte, or NULL when the end of the index is
 * reached first (including when `from` is already at the end).
 */
static char * skip_until_newline(char *from)
{
    char *cur;

    /* pointer_at_end() takes unsigned char *; convert explicitly, since
     * passing a char * directly is an incompatible pointer type. */
    for (cur = from; !pointer_at_end((unsigned char *) cur); ++cur)
    {
        if (*cur == '\n' || *cur == 0)
            return cur;
    }
    return NULL;
}
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    84
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
    85
/*
 * Three-way comparison of a NUL-terminated word against an index entry.
 *
 * `test` points at the start of an entry whose key is ended by a tab.
 * Bytes are compared as unsigned values.
 *
 * Returns  0 when word equals the key (word ends exactly at the tab),
 *         -1 when word sorts before the key (smaller byte at the first
 *            difference, or word ends while test continues with
 *            something other than a tab),
 *          1 when word sorts after the key (larger byte at the first
 *            difference, or the key's tab is reached first).
 */
static int compare(const unsigned char *word, const unsigned char *test)
{
    const unsigned char *w = word;
    const unsigned char *t = test;

    /* Advance over the common prefix. */
    while (*w != 0 && *t != 0 && *w == *t)
    {
        ++w;
        ++t;
    }

    if (*w == 0)
        return (*t == '\t') ? 0 : -1;
    if (*t == '\t')
        return 1;
    return (*w > *t) ? 1 : -1;
}
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   111
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   112
static char * bin_search(const char *word)
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   113
{
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   114
    int step, pivot;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   115
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   116
    pivot = indexsize / 2;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   117
    step = indexsize / 2;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   118
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   119
    do
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   120
    {
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   121
        char *test;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   122
        int comparision;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   123
        test = index + pivot;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   124
        test = skip_until_newline(test);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   125
        if (test == 0)
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   126
            return 0;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   127
        test += 1; /* skip exactly the new line */
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   128
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   129
        comparision = compare(word, test);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   130
        if (comparision == 0)
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   131
        {
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   132
            return test + strlen(word) + 1; /* skip word and \n */
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   133
        } else if (comparision < 0)
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   134
        {
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   135
            step = step / 2;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   136
            pivot = pivot - step;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   137
        } else if (comparision > 0)
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   138
        {
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   139
            step = step / 2;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   140
            pivot = pivot + step;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   141
        }
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   142
    } while(step > 0);
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   143
    return 0;
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   144
}
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   145
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   146
/*
 * Parse the integer field starting at *pos and advance *pos past it.
 *
 * The field runs up to the next tab or newline, or up to the end of the
 * mapped index if neither appears first.  Its text is converted with
 * str2int_len().  On return *pos points just after the delimiter, or at
 * the end of the index when the field was cut short.
 *
 * Assumes *pos points into the mapped index, which is what find_def()
 * passes.
 */
static int my_get_int(char **pos)
{
    char *start;
    int i;
    int val;

    start = *pos;
    /* Stop at the end of the mapping too, so a truncated index cannot
     * make the scan run into unmapped memory. */
    i = 0;
    while (!pointer_at_end((unsigned char *) (start + i))
            && start[i] != '\t' && start[i] != '\n')
        ++i;

    val = str2int_len(start, i);

    *pos = start + i;
    /* Step over the delimiter only if there is one. */
    if (!pointer_at_end((unsigned char *) *pos))
        *pos += 1;
    return val;
}
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   159
a961bb8806b9 first 'zparsetext'.
viric@llimona
parents:
diff changeset
   160
/*
 * Look up `word` and copy its definition into `def`.
 *
 * When the word is in the index, the two integer fields that follow its
 * key (an offset and a length in the definitions file) are parsed and
 * handed to fill_def().  When it is not, def is set to the empty string.
 */
void find_def(const char *word, char * def)
{
    /* On a hit, bin_search() already points at the offset field. */
    char *cursor = bin_search(word);

    if (cursor != 0)
    {
        int where;
        int size;

        /* Each call advances cursor, so the order matters. */
        where = my_get_int(&cursor);
        size = my_get_int(&cursor);
        fill_def(where, size, def);
        return;
    }

    def[0] = 0;
}