Adding code to dump the words as SQL.
#include <stdio.h>
#include <unicode/uchar.h>
#include <unicode/ustring.h>
#include <unicode/utypes.h>
#include "dictre.h"
/*
 * Return 1 when the two bytes at tmp are 0xCC 0x81, otherwise 0.
 * The second byte is only inspected when the first one matches.
 */
static int closed_accent(const unsigned char *tmp)
{
    return (tmp[0] == 0xcc && tmp[1] == 0x81) ? 1 : 0;
}
/* Return 1 when the byte at tmp is 0x60 (the ASCII backquote), otherwise 0. */
static int open_accent(const unsigned char *tmp)
{
    return tmp[0] == 0x60 ? 1 : 0;
}
/*
 * Merge the accent marks found in two spellings of a word.
 *
 * `a` and `b` are walked in parallel. Two marks are recognised: the byte
 * pair 0xCC 0x81 (closed_accent) and the single byte 0x60 (open_accent).
 * A mark found at the current position of either input is copied to the
 * result; when both inputs carry the same kind of mark there, it is
 * emitted once and both inputs skip it. Any other byte is treated as a
 * letter: it is copied from `a`, and both inputs advance by one byte.
 *
 * Ownership: `a` is freed before returning and must not be used by the
 * caller afterwards. The returned string comes from strdup() and must be
 * freed by the caller; it is NULL if strdup() fails.
 *
 * The result is truncated rather than overflowing when it would not fit
 * in MAXWORD bytes (terminating NUL included).
 */
char *mix_accents(char *a, const char *b)
{
    char tmp[MAXWORD];
    char *out;
    size_t ia = 0;
    size_t ib = 0;
    size_t o = 0;

    while (a[ia] != 0 || b[ib] != 0) {
        /* The accent helpers compare raw byte values, so hand them
         * unsigned bytes regardless of the signedness of plain char. */
        const unsigned char *pa = (const unsigned char *)&a[ia];
        const unsigned char *pb = (const unsigned char *)&b[ib];

        if (closed_accent(pa)) {
            /* Two bytes plus the final NUL must still fit. */
            if (o + 2 >= sizeof tmp)
                break;
            tmp[o++] = a[ia++];
            tmp[o++] = a[ia++];
            if (closed_accent(pb))
                ib += 2;
        } else if (closed_accent(pb)) {
            if (o + 2 >= sizeof tmp)
                break;
            tmp[o++] = b[ib++];
            tmp[o++] = b[ib++];
        } else if (open_accent(pa)) {
            /* One byte plus the final NUL must still fit. */
            if (o + 1 >= sizeof tmp)
                break;
            tmp[o++] = a[ia++];
            if (open_accent(pb))
                ib++;
        } else if (open_accent(pb)) {
            if (o + 1 >= sizeof tmp)
                break;
            tmp[o++] = b[ib++];
        } else {
            /* Letter. Letters only ever come from `a`; once `a` is
             * exhausted the byte stored here would be a NUL, and anything
             * placed after it would be dropped by strdup(), so stop. */
            if (a[ia] == 0)
                break;
            if (o + 1 >= sizeof tmp)
                break;
            tmp[o++] = a[ia++];
            if (b[ib] != 0)
                ib++;
        }
    }
    tmp[o] = 0;

    out = strdup(tmp);
    free(a);
    return out;
}
/*
 * Copy `from` into `dest`, leaving out every 0xCC 0x81 byte pair and every
 * 0x60 byte. `dest` is always NUL-terminated; no bound is applied, so it
 * must be able to hold as many bytes as `from` (terminator included).
 */
void remove_accent(unsigned char *dest, const unsigned char *from)
{
    const unsigned char *src = from;
    unsigned char *dst = dest;

    while (*src != 0) {
        if (src[0] == 0xcc && src[1] == 0x81) {
            src += 2;           /* drop the two-byte mark */
            continue;
        }
        if (*src == 0x60) {
            src += 1;           /* drop the backquote */
            continue;
        }
        *dst++ = *src++;
    }
    *dst = 0;
}
/*
 * Advance *index to the next '\n' in str, or to the terminating NUL when
 * there is none. Returns the position of the '\n', or -1 if the end of
 * the string was reached first.
 */
int skip_newline(const char *str, int *index)
{
    int pos = *index;

    while (str[pos] != 0 && str[pos] != '\n')
        pos++;

    *index = pos;
    return (str[pos] == '\n') ? pos : -1;
}
/*
 * Advance *index to the first space, '\n', '\r' or ',' at or after the
 * current position. Returns that position, or -1 (with *index left on the
 * terminating NUL) when the string ends before any of them is found.
 */
int until_noword(const char *str, int *index)
{
    for (;;) {
        switch (str[*index]) {
        case 0:
            return -1;
        case ' ':
        case '\n':
        case '\r':
        case ',':
            return *index;
        default:
            ++*index;
            break;
        }
    }
}
/* Return 1 when c is below 128 (top bit clear), otherwise 0. */
int is_ASCII(unsigned char c)
{
    return (c & 0x80) == 0;
}
/*
 * Advance *index past every byte below 128, stopping on the first byte
 * with the top bit set or on the terminating NUL.
 *
 * Returns the position of that non-ASCII byte, or -1 when the end of the
 * string was reached first (in which case *index is left on the NUL).
 *
 * A stray ';' after the final `if` used to make the function return
 * *index unconditionally, so the -1 result was never produced.
 */
int until_newword(const unsigned char *str, int *index)
{
    /* Same test as is_ASCII(): a byte is ASCII when it is below 128. */
    while (str[*index] != 0 && str[*index] < 128) {
        ++*index;
    }
    if (str[*index] != 0)
        return *index;
    return -1;
}
/*
 * Record the case of every code point in str.
 *
 * str is decoded with U8_NEXT; for each code point before the terminating
 * NUL one entry is stored in vcase: LCASE when u_islower() is true for it,
 * UCASE otherwise. Returns the number of entries written. No bound is
 * applied to vcase, so it must have room for one entry per code point.
 */
int get_case(enum Case *vcase, const char *str)
{
    const int nbytes = strlen(str);
    int pos = 0;
    int count = 0;
    UChar32 cp;

    for (;;) {
        U8_NEXT(str, pos, nbytes, cp);
        if (cp == 0)
            break;
        vcase[count++] = u_islower(cp) ? LCASE : UCASE;
    }
    return count;
}
void get_lowcase_str(char *out, const char *str)
{
UChar32 c;
int i;
int o;
int len;
char iserror = 0;
len = strlen(str);
i=0;
o=0;
do
{
U8_NEXT(str, i, len, c);
/*printf("[%i] ", c);*/
c = u_tolower(c);
U8_APPEND(out, o, MAXWORD, c, iserror);
if (iserror)
break;
if (c == 0)
break;
} while(1);
}
void reapply_case(char *out, const char *in, const enum Case *vcase)
{
UChar32 c;
int i;
int o;
int vcasepos;
int len;
char iserror = 0;
const UChar32 inverted = '`';
len = strlen(in);
i=0;
o=0;
vcasepos = 0;
do
{
U8_NEXT(in, i, len, c);
/*printf("[%i] ", c);*/
if (c == inverted || u_hasBinaryProperty(c, UCHAR_DIACRITIC))
{
U8_APPEND(out, o, MAXWORD, c, iserror);
/* Here we don't increment vcasepos,
* so the ` or diacritics gets copied without being taken
* care in the recase process. It will
* be the only sign that may be _added_ */
continue;
}
if (vcase[vcasepos] == LCASE)
c = u_tolower(c);
else
c = u_toupper(c);
vcasepos += 1;
U8_APPEND(out, o, MAXWORD, c, iserror);
if (iserror)
break;
if (c == 0)
break;
} while(1);
}
/*
 * Replace every U+0451 in str with U+0435, in place.
 *
 * str is decoded with U8_NEXT using MAXWORD as the length limit, and the
 * scan ends at the first NUL code point. Both code points encode to two
 * bytes in UTF-8, so the replacement overwrites the original bytes without
 * shifting the rest of the string.
 */
void remove_jo(char *str)
{
    UChar32 cp;
    char failed = 0;
    int rd = 0;     /* read position, advanced by U8_NEXT */
    int wr = 0;     /* start of the code point just read */

    for (;;) {
        U8_NEXT(str, rd, MAXWORD, cp);
        if (cp == 0x0451) {
            cp = 0x0435;
            U8_APPEND(str, wr, MAXWORD, cp, failed);
            if (failed)
                return;
        }
        /* Keep the write position on the start of the next code point. */
        wr = rd;
        if (cp == 0)
            return;
    }
}