/*
 * parse_text.c
 * author viric@llimona
 * Sat, 01 Sep 2007 21:49:41 +0200
 * changeset 21 01fe372188ac
 * parent 18 64ed4238657f
 * child 24 026a2ba0ce16
 * permissions -rw-r--r--
 * Added capabilities to the dictionary finder.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "dictre.h"

/* Nonzero once main() has found CONTENT_LENGTH in the environment;
 * makes my_fgetc() read through http_getc() instead of fgetc(). */
static int is_http = 0;
/* Value of CONTENT_LENGTH as parsed by main(); stays -1 when the
 * variable is not set. */
static int content_length = -1;
/* Dictionary opened by main() under the name "akcentiga" and searched
 * by give_accent_to_word(). */
static struct Dict dakcentiga;

/*
 * Print 'word' to stdout, replaced by its accented form when the
 * 'dakcentiga' dictionary has a definition for its lowercase version.
 *
 * A definition is expected to have the form:
 *    ACCENTED_WORD NOMINATIVE1 NOMINATIVE2 ... \n
 * Only the text before the first space is used, and the case pattern
 * taken from 'word' is reapplied to it before printing.
 *
 * If the word is not found, or its definition contains no space, 'word'
 * is printed unchanged, so the input text is never silently dropped.
 */
static void give_accent_to_word(const char *word)
{
    char def[MAXDEF];
    char low[MAXWORD];
    char recased[MAXWORD];
    enum Case vcase[MAXWORD];

    /* Remember the case of the original word */
    get_case(vcase, word);

    /* The dictionary is searched with the lowercase version */
    get_lowcase_str(low, word);

    find_def(&dakcentiga, low, def);
    if (def[0] != 0) /* found */
    {
        char *first_space = strchr(def, ' ');
        if (first_space != NULL)
        {
            /* Cut the definition right after the accented word.  Writing
             * through the pointer avoids squeezing the offset into a
             * narrow integer that could overflow for long definitions. */
            *first_space = 0;
            reapply_case(recased, def, vcase);
            printf("%s", recased);
            return;
        }
    }

    /* Word not found, or definition without a space */
    printf("%s", word);
}

/* Read one character from 'f': plain fgetc() normally, http_getc()
 * when the program is running in HTTP mode (is_http set). */
static int my_fgetc(FILE *f)
{
    if (!is_http)
        return fgetc(f);

    return http_getc(f);
}

/*
 * Copy the text read from 'in' to stdout, passing every run of
 * non-ASCII bytes (considered a russian word) through
 * give_accent_to_word().
 *
 * in     - stream read through my_fgetc().
 * pos    - starting position counter, advanced once per byte read.
 * length - reading stops once pos reaches it; a negative value
 *          disables the check.
 *
 * Reading also stops at EOF or END_OF_URL.  A non-ASCII run that would
 * not fit in MAXWORD bytes is echoed unchanged up to the next ASCII byte.
 */
static void process_text(FILE *in, int pos, int length)
{
    unsigned char word[MAXWORD];
    int nbytes = 0;   /* bytes of the current word collected so far */
    int dumping = 0;  /* set while echoing an over-long non-ASCII run */

    for (;; ++pos)
    {
        int ch;
        int ascii;

        /* The position limit only applies when a length was given */
        if (length >= 0 && pos >= length)
            break;

        ch = my_fgetc(in);
        if (ch == EOF || ch == END_OF_URL)
            break;

        ascii = is_ASCII(ch);

        if (!dumping && !ascii)
        {
            /* One more byte of a non-ASCII word */
            word[nbytes++] = ch;
            if (nbytes >= MAXWORD)
            {
                /* The word cannot fit in the buffer: write out what was
                 * collected and echo the rest of the run as it comes */
                int k = 0;
                while (k < nbytes)
                    putchar(word[k++]);
                nbytes = 0;
                dumping = 1;
            }
            continue;
        }

        /* An ASCII byte, or any byte while dumping: a pending word
         * ends here */
        if (nbytes != 0)
        {
            word[nbytes] = 0;
            give_accent_to_word(word);
            nbytes = 0;
        }
        putchar(ch);

        /* The first ASCII byte ends the dumping mode */
        if (ascii)
            dumping = 0;
    }

    /* Input ended in the middle of a word */
    if (nbytes != 0)
    {
        word[nbytes] = 0;
        give_accent_to_word(word);
        nbytes = 0;
    }
}

/* Write the response header (HTML content type, UTF-8 charset) followed
 * by the blank line that ends the header section, to stdout.
 * Declared void: the function produces no value, and the implicit-int
 * return type is not valid C since C99. */
static void print_http_header(void)
{
    printf("Content-Type:text/html;charset=utf-8\r\n\r\n");
}

/*
 * Consume the form field prefix "teksto=" from stdin.
 *
 * Returns 1 when the next bytes of stdin are exactly "teksto=" (they
 * are consumed), and 0 otherwise, including when stdin ends before the
 * whole prefix could be read.
 */
int eat_form_ok(void)
{
    const char mask[] = "teksto=";
    char tmp[sizeof(mask)];
    const size_t want = sizeof(mask) - 1;

    /* A short read cannot match, and comparing it would look at bytes
     * of 'tmp' that were never written */
    if (fread(tmp, 1, want, stdin) != want)
        return 0;

    tmp[want] = 0;
    return strcmp(mask, tmp) == 0;
}

int main()
{
    char *c;

    init_dictionary(&dakcentiga, "akcentiga");

    if (c = getenv("CONTENT_LENGTH"))
    {
        content_length = atoi(c);
        is_http = 1;
    }
    if (is_http)
    {
        print_http_header();
        if (!eat_form_ok())
            return -1;
    }
    process_text(stdin, 0, -1);
    end_dictionary(&dakcentiga);

    return 0;
}