zrus.c
author viric <viriketo@gmail.com>
Fri, 30 Mar 2012 18:55:30 +0200
branchsql
changeset 32 6a1a709330bf
parent 16 b4e251400e36
permissions -rw-r--r--
Adding code to dump the words as sql.
viric@11
     1
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <unicode/uchar.h>
#include <unicode/ustring.h>
#include <unicode/utypes.h>

#include "dictre.h"
viric@11
     6
viric@11
     7
/*
 * Tell whether the two bytes at tmp are 0xCC 0x81, i.e. the UTF-8 encoding
 * of U+0301 (combining acute accent).
 *
 * The second byte is only inspected when the first one is 0xCC, so tmp may
 * point at the terminating NUL of a string.
 *
 * Returns 1 on a match, 0 otherwise.
 */
static int closed_accent(const unsigned char *tmp)
{
    return (tmp[0] == 0xcc && tmp[1] == 0x81) ? 1 : 0;
}
viric@11
    13
viric@11
    14
/*
 * Tell whether the byte at tmp is 0x60 (ASCII grave accent, '`').
 *
 * Returns 1 on a match, 0 otherwise.
 */
static int open_accent(const unsigned char *tmp)
{
    return (*tmp == 0x60) ? 1 : 0;
}
viric@11
    20
viric@11
    21
/*
 * Merge the accent marks found in two spellings, a and b, into one string.
 *
 * Both inputs are walked in parallel:
 *   - a closed accent (bytes 0xCC 0x81) or an open accent ('`') present at
 *     the current position of either input is copied to the output; when
 *     both inputs carry the same kind of accent there, it is emitted once;
 *   - any other byte is taken from a, and b is advanced by one byte to stay
 *     in step.
 * Once a is exhausted, the first non-accent byte of b ends the result; the
 * letters of b are never copied.
 *
 * Ownership: a must have been obtained from malloc(); it is always freed
 * here. The returned string is newly allocated and must be freed by the
 * caller. Returns NULL if the allocation fails.
 */
char * mix_accents(char *a, const char *b)
{
    /*
     * Every output byte is a byte consumed from a or from b, so
     * strlen(a) + strlen(b) is an upper bound on the result length.
     * Sizing the buffer from the inputs removes the overflow a fixed-size
     * stack buffer would have on long words.
     */
    const size_t cap = strlen(a) + strlen(b) + 1;
    char *out = malloc(cap);
    size_t ia = 0;
    size_t ib = 0;
    size_t o = 0;

    if (out == NULL) {
        free(a);
        return NULL;
    }

    while (a[ia] != 0 || b[ib] != 0) {
        /* The accent helpers compare raw byte values, hence unsigned. */
        const unsigned char *pa = (const unsigned char *)&a[ia];
        const unsigned char *pb = (const unsigned char *)&b[ib];

        if (closed_accent(pa)) {
            out[o++] = a[ia++];
            out[o++] = a[ia++];
            /* Same accent in b: skip it so it is not emitted twice. */
            if (closed_accent(pb))
                ib += 2;
        } else if (closed_accent(pb)) {
            out[o++] = b[ib++];
            out[o++] = b[ib++];
        } else if (open_accent(pa)) {
            out[o++] = a[ia++];
            if (open_accent(pb))
                ib += 1;
        } else if (open_accent(pb)) {
            out[o++] = b[ib++];
        } else {
            /* Letter: a is the reference spelling. When a has run out
             * the result ends here. */
            if (a[ia] == 0)
                break;
            out[o++] = a[ia++];
            if (b[ib] != 0)
                ++ib;
        }
    }
    out[o] = 0;

    free(a);
    return out;
}
viric@11
    80
viric@11
    81
/*
 * Copy the NUL-terminated byte string from into dest, dropping every
 * closed accent (the byte pair 0xCC 0x81) and every open accent (0x60, '`').
 *
 * dest is always NUL-terminated. No size is passed in, so dest must be able
 * to hold at least as many bytes as from, terminator included.
 */
void remove_accent(unsigned char *dest, const unsigned char *from)
{
    const unsigned char *src = from;
    unsigned char *dst = dest;

    while (*src != 0) {
        if (src[0] == 0xcc && src[1] == 0x81) {
            src += 2;           /* two-byte closed accent */
            continue;
        }
        if (*src == 0x60) {
            src += 1;           /* one-byte open accent */
            continue;
        }
        *dst++ = *src++;
    }
    *dst = 0;
}
viric@14
   102
viric@14
   103
/*
 * Advance *index through str until it rests on a '\n' or on the
 * terminating NUL.
 *
 * Returns the index of the '\n' when one is found, or -1 when the end of
 * the string is reached first. *index is updated in both cases.
 */
int skip_newline(const char *str, int *index)
{
    int pos = *index;

    while (str[pos] != 0 && str[pos] != '\n')
        ++pos;
    *index = pos;

    return (str[pos] == '\n') ? pos : -1;
}
viric@14
   115
viric@14
   116
/*
 * Advance *index through str until it rests on a word delimiter
 * (space, '\n', '\r' or ',') or on the terminating NUL.
 *
 * Returns the index of the delimiter, or -1 when the end of the string is
 * reached first. *index is updated in both cases.
 */
int until_noword(const char *str, int *index)
{
    for (;; ++*index) {
        const char ch = str[*index];

        if (ch == 0)
            return -1;
        if (ch == ' ' || ch == '\n' || ch == '\r' || ch == ',')
            return *index;
    }
}
viric@14
   132
viric@14
   133
/*
 * Return 1 when c is a 7-bit value (0..127), 0 when its high bit is set.
 */
int is_ASCII(unsigned char c)
{
    return (c & 0x80) == 0;
}
viric@14
   139
viric@14
   140
/*
 * Advance *index through str over 7-bit (ASCII) bytes until it rests on a
 * byte with the high bit set or on the terminating NUL.
 *
 * Returns the index of the first non-ASCII byte, or -1 when the end of the
 * string is reached first. *index is updated in both cases.
 *
 * A stray ';' after the final test used to make the function return *index
 * unconditionally, so callers could never detect the end of the string.
 */
int until_newword(const unsigned char *str, int *index)
{
    /* "< 0x80" is the same test is_ASCII() performs. */
    while (str[*index] != 0 && str[*index] < 0x80)
        ++*index;

    if (str[*index] != 0)
        return *index;

    return -1;
}
viric@15
   152
viric@15
   153
int get_case(enum Case *vcase, const char *str)
viric@15
   154
{
viric@15
   155
    UChar32 c;
viric@15
   156
    int i;
viric@15
   157
    int o;
viric@15
   158
    int len;
viric@15
   159
viric@15
   160
    len = strlen(str);
viric@15
   161
viric@15
   162
    i=0;
viric@15
   163
    o=0;
viric@15
   164
    do
viric@15
   165
    {
viric@15
   166
        U8_NEXT(str, i, len, c);
viric@15
   167
        /*printf("[%i] ", c);*/
viric@15
   168
        if (c == 0)
viric@15
   169
            break;
viric@15
   170
        if (u_islower(c))
viric@15
   171
            vcase[o] = LCASE;
viric@15
   172
        else
viric@15
   173
            vcase[o] = UCASE;
viric@15
   174
        ++o;
viric@15
   175
    } while(1);
viric@15
   176
viric@15
   177
    return o;
viric@15
   178
}
viric@15
   179
viric@15
   180
void get_lowcase_str(char *out, const char *str)
viric@15
   181
{
viric@15
   182
    UChar32 c;
viric@15
   183
    int i;
viric@15
   184
    int o;
viric@15
   185
    int len;
viric@15
   186
    char iserror = 0;
viric@15
   187
viric@15
   188
    len = strlen(str);
viric@15
   189
viric@15
   190
    i=0;
viric@15
   191
    o=0;
viric@15
   192
    do
viric@15
   193
    {
viric@15
   194
        U8_NEXT(str, i, len, c);
viric@15
   195
        /*printf("[%i] ", c);*/
viric@15
   196
        c = u_tolower(c);
viric@15
   197
        U8_APPEND(out, o, MAXWORD, c, iserror);
viric@15
   198
        if (iserror)
viric@15
   199
            break;
viric@15
   200
        if (c == 0)
viric@15
   201
            break;
viric@15
   202
    } while(1);
viric@15
   203
}
viric@15
   204
viric@15
   205
void reapply_case(char *out, const char *in, const enum Case *vcase)
viric@15
   206
{
viric@15
   207
    UChar32 c;
viric@15
   208
    int i;
viric@15
   209
    int o;
viric@15
   210
    int vcasepos;
viric@15
   211
    int len;
viric@15
   212
    char iserror = 0;
viric@15
   213
    const UChar32 inverted = '`';
viric@15
   214
viric@15
   215
    len = strlen(in);
viric@15
   216
viric@15
   217
    i=0;
viric@15
   218
    o=0;
viric@15
   219
    vcasepos = 0;
viric@15
   220
    do
viric@15
   221
    {
viric@15
   222
        U8_NEXT(in, i, len, c);
viric@15
   223
        /*printf("[%i] ", c);*/
viric@15
   224
        if (c == inverted || u_hasBinaryProperty(c, UCHAR_DIACRITIC))
viric@15
   225
        {
viric@15
   226
            U8_APPEND(out, o, MAXWORD, c, iserror);
viric@15
   227
            /* Here we don't increment vcasepos,
viric@15
   228
             * so the ` or diacritics gets copied without being taken
viric@15
   229
             * care in the recase process. It will
viric@15
   230
             * be the only sign that may be _added_ */
viric@15
   231
            continue;
viric@15
   232
        }
viric@15
   233
viric@15
   234
        if (vcase[vcasepos] == LCASE)
viric@15
   235
            c = u_tolower(c);
viric@15
   236
        else
viric@15
   237
            c = u_toupper(c);
viric@15
   238
        vcasepos += 1;
viric@15
   239
viric@15
   240
        U8_APPEND(out, o, MAXWORD, c, iserror);
viric@15
   241
viric@15
   242
        if (iserror)
viric@15
   243
            break;
viric@15
   244
        if (c == 0)
viric@15
   245
            break;
viric@15
   246
    } while(1);
viric@15
   247
}
viric@16
   248
viric@16
   249
void remove_jo(char *str)
viric@16
   250
{
viric@16
   251
    int i, o;
viric@16
   252
    UChar32 c;
viric@16
   253
    char iserror = 0;
viric@16
   254
viric@16
   255
    i=0;
viric@16
   256
    o=0;
viric@16
   257
    do
viric@16
   258
    {
viric@16
   259
        U8_NEXT(str, i, MAXWORD, c);
viric@16
   260
        if (c == 0x0451)
viric@16
   261
        {
viric@16
   262
            c = 0x0435;
viric@16
   263
            U8_APPEND(str, o, MAXWORD, c, iserror);
viric@16
   264
            if (iserror)
viric@16
   265
                break;
viric@16
   266
        }
viric@16
   267
        o = i;
viric@16
   268
        if (c == 0)
viric@16
   269
            break;
viric@16
   270
    } while(1);
viric@16
   271
}