dictre: zrus.c@6a1a709330bf


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unicode/uchar.h>
#include <unicode/ustring.h>
#include <unicode/utypes.h>
#include "dictre.h"

/* Return 1 when the two bytes at tmp are 0xCC 0x81 (the UTF-8 encoding of
 * U+0301, the combining acute accent), 0 otherwise.  The second byte is only
 * read when the first one matches, so tmp may point at a terminating NUL. */
static int closed_accent(const unsigned char *tmp)
{
    return tmp[0] == 0xcc && tmp[1] == 0x81;
}

/* Return 1 when the byte at tmp is 0x60 ('`'), 0 otherwise. */
static int open_accent(const unsigned char *tmp)
{
    return tmp[0] == 0x60;
}

/* Merge the accent marks found in two spellings of a word.
 *
 * a and b are walked in parallel.  An accent mark (0xCC 0x81 or '`') found
 * at the current position of either string is copied to the result; when
 * both strings carry the same kind of mark there, it is emitted only once.
 * Any other byte is taken from a, while b is advanced by one byte to stay
 * in step.  The result ends where a ends, followed by any accent marks b
 * carries at that point.
 *
 * Ownership: a is always freed by this function.  The returned string is
 * heap-allocated and must be released by the caller with free().
 * Returns NULL if the allocation fails (a is still freed).
 */
char * mix_accents(char *a, const char *b)
{
    /* Unsigned views, so the byte comparisons in the helpers see 0xCC/0x81
     * regardless of the signedness of plain char. */
    const unsigned char *ua = (const unsigned char *) a;
    const unsigned char *ub = (const unsigned char *) b;
    size_t ia = 0;
    size_t ib = 0;
    size_t o = 0;
    char *out;

    /* Every byte written below consumes at least one byte from a or b, so
     * the result can never be longer than both inputs together.  Sizing the
     * buffer from the inputs removes the overflow a fixed-size array had. */
    out = malloc(strlen(a) + strlen(b) + 1);
    if (out == NULL)
    {
        free(a);
        return NULL;
    }

    while (ua[ia] != 0 || ub[ib] != 0)
    {
        if (closed_accent(&ua[ia]))
        {
            out[o++] = a[ia++];
            out[o++] = a[ia++];
            /* Same mark in b: consume it without emitting it twice */
            if (closed_accent(&ub[ib]))
                ib += 2;
        }
        else if (closed_accent(&ub[ib]))
        {
            out[o++] = b[ib++];
            out[o++] = b[ib++];
        }
        else if (open_accent(&ua[ia]))
        {
            out[o++] = a[ia++];
            if (open_accent(&ub[ib]))
                ib += 1;
        }
        else if (open_accent(&ub[ib]))
        {
            out[o++] = b[ib++];
        }
        else if (ua[ia] == 0)
        {
            /* a is exhausted and b continues with a plain letter:
             * the result ends here. */
            break;
        }
        else
        {
            /* Letter: keep a's byte, step over the matching byte of b */
            out[o++] = a[ia++];
            if (ub[ib] != 0)
                ++ib;
        }
    }
    out[o] = 0;
    free(a);
    return out;
}

/* Copy the NUL-terminated string from into dest, leaving out every
 * 0xCC 0x81 byte pair and every 0x60 ('`') byte.  All other bytes are
 * copied unchanged and dest is NUL-terminated.  No size is passed in, so
 * dest must be able to hold the whole of from. */
void remove_accent(unsigned char *dest, const unsigned char *from)
{
    const unsigned char *src = from;
    unsigned char *dst = dest;

    while (*src != 0)
    {
        if (src[0] == 0xcc && src[1] == 0x81)
        {
            /* two-byte accent mark: skip it */
            src += 2;
            continue;
        }
        if (*src == 0x60)
        {
            /* one-byte accent mark: skip it */
            src += 1;
            continue;
        }
        *dst++ = *src++;
    }
    *dst = 0;
}

/* Move *index forward to the next '\n' in str, or to the terminating NUL
 * when no newline follows.  Returns the position of the newline, or -1 if
 * the end of the string was reached first. */
int skip_newline(const char *str, int *index)
{
    int pos = *index;

    for (; str[pos] != 0; ++pos)
    {
        if (str[pos] == '\n')
            break;
    }
    *index = pos;

    return (str[pos] == '\n') ? pos : -1;
}

/* Move *index forward to the first word delimiter in str: a space, '\n',
 * '\r', ',' or the terminating NUL.  Returns the position of the delimiter,
 * or -1 if the scan stopped at the end of the string. */
int until_noword(const char *str, int *index)
{
    int pos = *index;

    for (;;)
    {
        const char ch = str[pos];

        if (ch == 0 || ch == ' ' || ch == '\n' || ch == '\r' || ch == ',')
            break;
        ++pos;
    }
    *index = pos;

    return (str[pos] == 0) ? -1 : pos;
}

/* Return 1 when c is below 128 (a 7-bit ASCII value), 0 otherwise. */
int is_ASCII(unsigned char c)
{
    return c <= 0x7f;
}

/* Move *index forward over ASCII bytes (as judged by is_ASCII) until the
 * first non-ASCII byte or the terminating NUL.  Returns the position of the
 * non-ASCII byte, or -1 if the end of the string was reached first. */
int until_newword(const unsigned char *str, int *index)
{
    while (str[*index] != 0 && is_ASCII(str[*index]))
    {
        ++*index;
    }

    /* No ';' after the condition: the -1 below must be reachable when the
     * scan stops on the terminating NUL. */
    if (str[*index] != 0)
        return *index;

    return -1;
}

/* Record the letter case of each code point of the UTF-8 string str.
 *
 * For every code point decoded by U8_NEXT, up to but not including the
 * first one equal to 0, one entry is stored in vcase: LCASE when
 * u_islower() reports the code point as lowercase, UCASE otherwise.
 * Returns the number of entries written.  vcase is not bounds-checked
 * here, so the caller must provide enough room. */
int get_case(enum Case *vcase, const char *str)
{
    UChar32 cp;
    int pos = 0;
    int count = 0;
    int len;

    len = strlen(str);

    for (;;)
    {
        U8_NEXT(str, pos, len, cp);
        if (cp == 0)
            break;
        vcase[count] = u_islower(cp) ? LCASE : UCASE;
        ++count;
    }

    return count;
}

/* Write a lowercased copy of the UTF-8 string str into out.
 *
 * Each code point decoded by U8_NEXT is mapped through u_tolower() and
 * appended to out with U8_APPEND, using MAXWORD as the capacity.  The
 * terminating 0 code point is appended as well, which ends the loop.
 * The loop also stops as soon as U8_APPEND reports an error; this function
 * writes nothing further to out in that case. */
void get_lowcase_str(char *out, const char *str)
{
    UChar32 cp;
    int rd = 0;
    int wr = 0;
    int len;
    char failed = 0;

    len = strlen(str);

    for (;;)
    {
        U8_NEXT(str, rd, len, cp);
        cp = u_tolower(cp);
        U8_APPEND(out, wr, MAXWORD, cp, failed);
        if (failed || cp == 0)
            break;
    }
}

/* Copy the UTF-8 string in to out, forcing the case of each code point to
 * the one recorded in vcase.
 *
 * A '`' or any code point with the UCHAR_DIACRITIC property is copied
 * through unchanged and does not consume a vcase entry, so such marks may
 * have been added to the word after vcase was recorded.  Every other code
 * point takes the next vcase entry: LCASE maps it through u_tolower(),
 * anything else through u_toupper().  Output is appended with U8_APPEND
 * using MAXWORD as the capacity; the loop ends after the 0 code point has
 * been appended, or when an append error is seen after a cased code point. */
void reapply_case(char *out, const char *in, const enum Case *vcase)
{
    const UChar32 inverted = '`';
    UChar32 cp;
    int rd = 0;
    int wr = 0;
    int casepos = 0;
    int len;
    char failed = 0;

    len = strlen(in);

    for (;;)
    {
        U8_NEXT(in, rd, len, cp);

        if (cp == inverted || u_hasBinaryProperty(cp, UCHAR_DIACRITIC))
        {
            /* Accent marks bypass the recasing: casepos stays put so the
             * following letter still gets the entry meant for it. */
            U8_APPEND(out, wr, MAXWORD, cp, failed);
            continue;
        }

        cp = (vcase[casepos] == LCASE) ? u_tolower(cp) : u_toupper(cp);
        ++casepos;

        U8_APPEND(out, wr, MAXWORD, cp, failed);

        if (failed || cp == 0)
            break;
    }
}

/* Replace, in place, every code point U+0451 in the UTF-8 string str with
 * U+0435.
 *
 * The string is decoded with U8_NEXT using MAXWORD as the length limit.
 * When U+0451 is found, U+0435 is written with U8_APPEND at the offset
 * where the decoded code point started.  Scanning ends at the first 0 code
 * point, or when U8_APPEND reports an error. */
void remove_jo(char *str)
{
    const UChar32 jo = 0x0451;
    const UChar32 je = 0x0435;
    UChar32 cp;
    int rd = 0;
    char failed = 0;

    for (;;)
    {
        /* Offset of the code point about to be decoded; the replacement,
         * if any, is written starting here. */
        int start = rd;

        U8_NEXT(str, rd, MAXWORD, cp);
        if (cp == jo)
        {
            cp = je;
            U8_APPEND(str, start, MAXWORD, cp, failed);
            if (failed)
                break;
        }
        if (cp == 0)
            break;
    }
}
author	viric <viriketo@gmail.com>
	Fri, 30 Mar 2012 18:55:30 +0200
branch	sql
changeset 32	6a1a709330bf
parent 16	b4e251400e36
permissions	-rw-r--r--