Init from working directory of svn repository.
/*--------------------------------------------------------------
Learn.cc -
readLearnFiles - sources the Tcl file that selects the learn files
learn(char * tifFile, char * asciiFile)
Performs character learning by reading a TIFF image and its ASCII
translation. Characters are partitioned into character groups as
described in system.cc. See the learn functions for more details.
writeLearnedGroups(char * filename) Writes the learned characters to a file
readLearnedGroups(char * filename) Reads saved learned characters
from a file.
---------------------------------------------------------------*/
#include "tcl_interface.h"
#include "system.h"
#include "learn.h"
#include "Page.h"
#include "list.h"
void readLearnFiles()
/*--------------------------------------------------------------
Primary Purpose: Sources learnfile.tcl where new learn files can be
specified without recompiling **/
{
docommand("source learnfile.tcl");
}
bool whitespace(char c)
// Returns TRUE when c is a newline, a tab or a space, and FALSE for
// every other character.
{
  switch (c)
  {
    case '\n':
    case '\t':
    case ' ':
      return TRUE;
    default:
      return FALSE;
  }
}
bool blank(char * string)
// Returns TRUE when string is NULL, empty, or consists only of
// characters accepted by whitespace(). Returns FALSE as soon as any
// other character is seen.
{
  if (string == NULL)
    return TRUE;

  for (char * cursor = string; *cursor != '\0'; cursor++)
  {
    if (!whitespace(*cursor))
      return FALSE;
  }
  return TRUE;
}
void printLearnedGroups()
{
// Just print these guys out to make sure they are ok.
for(unsigned int i = 0; i < NumCharGroups; i++)
for(ListElement * ptr = LearnedGroups[i].first;
ptr != NULL; ptr = ptr->next)
{ Component * item = (Component *) ptr->item;
printf("learned char %s, group %d\n", item->fasciiId,
item->charGroup);
}
}
int lengthNextWord(char * buffer,int offset, int buflength)
/*--------------------------------------------------------------
Primary Purpose: Counts the characters of the word that starts at
                 buffer[offset]. The word ends at the first
                 whitespace character or at buflength. Everything
                 from a '<' up to its closing '>' counts as a single
                 character, whitespace inside the brackets included.
                 A '<' with no closing '>' before buflength also
                 counts as one character and ends the word.
Arguments: buffer    - text line being scanned
           offset    - index of the first character of the word
           buflength - number of valid characters in buffer
Return Value: number of characters in the word; 0 if offset is at
              whitespace or not below buflength
---------------------------------------------------------------*/
{
  // Must start at zero: the counter used to be left uninitialized,
  // which made the result indeterminate.
  int count = 0;
  for(int c = offset; c < buflength && !(whitespace(buffer[c])); c++)
  {
    if (buffer[c] == '<')
    {
      // Move to the closing '>'. The bound is tested before the
      // character is read so the scan never looks past buflength.
      while (c < buflength && buffer[c] != '>')
        c++;
    }
    count++;
  }
  return count;
}
int learn(Component * comp, char * id, Confidence threshold)
/*--------------------------------------------------------------
Primary Purpose: Make a copy of this component, label the copy with
                 id and add it to LearnedGroups.
                 The component is only learned if its confidence is
                 below threshold or if its current ascii id differs
                 from id.
Arguments: comp - component to learn
           id - ascii identification to store in the learned copy
           threshold - confidence threshold for learning
Return Value: 1 if component was learned, 0 otherwise
Effects: On success appends a new Component (obtained from
         comp->copy()) to LearnedGroups[charGroup of the copy].
         comp itself is not modified.
Rev: 4/25/96
---------------------------------------------------------------*/
{
  int lowConfidence = comp->confid() < threshold;

  // strcmp() returns 0 when the strings are EQUAL, so "ids differ" is
  // a non-zero result. The test used to be negated, which learned
  // confident components only when their id already matched.
  // A component without any id string is treated as differing.
  // NOTE(review): confirm whether fasciiId can actually be NULL here.
  int idsDiffer = (comp->fasciiId == NULL) ||
                  (strcmp(comp->fasciiId, id) != 0);

  if (!lowConfidence && !idsDiffer)
    return 0;

  Component * newcomp = comp->copy();

  // The id string is allocated with new char[] in this file, so it is
  // released with delete [].
  // NOTE(review): assumes copy() allocates fasciiId the same way.
  delete [] newcomp->fasciiId;
  newcomp->fasciiId = new char[strlen(id)+1];
  strcpy(newcomp->fasciiId, id);

  LearnedGroups[newcomp->charGroup].Append(newcomp);
  return 1;
}
void learn(char * tifFile, char * asciiFile, bool synchwords)
/*--------------------------------------------------------------
Primary Purpose: Learns from TIFF and ascii file. Groups learned
characters by baseline into LearnedGroups and
sets properties.
Arguments: tiffFile name of a tiff file to learn from
asciiFile name of an ascii translation file
Effects: Assumes a one to one correspondence between each connected
component on a line of the tif file and each character on the corresponding
line of the ascii file.
Rev: 4/26/96
---------------------------------------------------------------------*/
{
Page * learnPage = new Page;
initCharBitsSet();
if(learnPage->readMap(tifFile) != VALID)
{
printf("Problem opening the learn image file (file doesn't exist?)\n");
return;
}
learnPage->setLines();
learnPage->extractComponents(MinHorizSeparation);
learnPage->extractWords();
learn(learnPage, asciiFile, synchwords);
// delete learnPage;
}
void learn(Page * learnPage, char * asciiFile, bool synchWords)
/*--------------------------------------------------------------
Primary Purpose: Learns from a Page and an ascii file. Used from
                 tcl user interface under File/Learn option.
                 Walks the words of the page alongside the non-blank
                 lines of the ascii file and learns each component
                 under the text that lines up with it.
Arguments: learnPage  page whose words and components are learned
           asciiFile  name of an ascii translation file
           synchWords when TRUE, a word is only learned if its
                      component count equals the length of the next
                      text word (see lengthNextWord); other words
                      are skipped together with their text
Effects: Allocates LearnedGroups if it is still NULL.
         Text between '<' and '>' (brackets included) is the
         translation of ONE component.
         Updates the globals MaxHWRatio, MinHWRatio and MinWidth from
         the bounding boxes of the components it visits.
         Assumes a one to one correspondence between each connected
         component on a line of the page and each character on the
         corresponding line of the ascii file.
Rev: 4/26/96
---------------------------------------------------------------*/
{
  FILE * transFile = fopen(asciiFile,"r");
  if (!transFile)
  {
    printf("Could not open the ascii learn file");
    return;
  }

  if (LearnedGroups == NULL)
    LearnedGroups = new Components[NumCharGroups];

  int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize;
  char buffer[maxCharsPerLine];

  int i = -1;            // index of the current image line
  int buflength = 0;     // length of the current text line; 0 = none yet
  int c = 0;             // read position inside buffer
  double width, height = 0.0;
  int h;

  Words * words = learnPage->words();

  for (ListElement * ptr = words->first;
       ptr != NULL && (i < learnPage->numLines()); ptr = ptr->next)
  {
    Word * word = (Word *) ptr->item;

    // A newline word (or the very first word) starts a new text line.
    if (word->characters[0] == '\n' || buflength == 0)
    {
      char * ok;
      do {
        ok = fgets(buffer, maxCharsPerLine, transFile);
      } while (ok && blank(buffer));   // skip blank lines.

      // The translation has run out. On end of file fgets() leaves
      // buffer untouched, so carrying on would measure and re-learn
      // stale (or never initialized) text.
      if (!ok)
        break;

      buflength = (int) strlen(buffer);
      // NOTE(review): the component list of the line is fetched but
      // not used in this function; the call is kept as it was.
      learnPage->line(++i);
      c = 0;
      if (word->characters[0] == '\n') continue;
    }

    // skip over white space
    while (c < buflength && whitespace(buffer[c])) c++;

    // Make sure we have an equal # of components and characters.
    // The comparison used to be "==", which skipped exactly the words
    // that DID line up and learned the mismatched ones.
    if (synchWords &&
        (word->charCount != lengthNextWord(buffer, c, buflength)))
    {
      // skip over this word
      while (c < buflength && !(whitespace(buffer[c])))
        c++;
      continue;   // move on to the next word
    }

    for (int ch = 0; ch < word->charCount; ch++)
    {
      while (c < buflength && whitespace(buffer[c])) c++;
      if (c >= buflength) break;

      Component * item = word->character[ch];

      // Link string translation to component. Characters between
      // brackets are for one component.
      if (buffer[c] == '<')
      {
        int startString = c;

        // Find the closing '>' without leaving the line.
        int endString = c + 1;
        while (endString < buflength && buffer[endString] != '>')
          endString++;

        // With a closing bracket the id runs from '<' to '>' inclusive;
        // without one it is whatever is left on the line.
        int stringSize = (endString < buflength)
                           ? endString - startString + 1
                           : buflength - startString;

        char newstring[stringSize+1];
        memcpy(newstring, &buffer[startString], stringSize);
        newstring[stringSize] = '\0';

        // learn if id's don't match or below threshold
        learn(item, newstring, ConfidenceThreshold);
        c = endString + 1;   // continue after the '>'
      }
      else
      {
        char newstring[2];
        newstring[0] = buffer[c++];
        newstring[1] = '\0';
        learn(item, newstring, ConfidenceThreshold);
      }

      // NOTE(review): the page's own component is appended as well,
      // regardless of what learn() decided and without the new id.
      // This looks like a leftover from before learn(Component *, ...)
      // existed - confirm before removing. Because of it the page must
      // not be deleted while LearnedGroups is in use.
      LearnedGroups[item->charGroup].Append(item);

      h = item->lr().y() - item->ul().y();
      if (h > height) height = h;
      width = item->lr().x() - item->ul().x();
      // NOTE(review): the max ratio uses the tallest height seen so
      // far while the min ratio uses this component's own height, and
      // width may be 0 for a one-column component - confirm intent.
      if (height/width > MaxHWRatio)
        MaxHWRatio = height/width;
      if (h/width < MinHWRatio)
        MinHWRatio = h/width;
      if (width < MinWidth)
        MinWidth = (int) width;
    }
  }

  if (fgets(buffer, maxCharsPerLine, transFile))
    printf("Uh, oh. There are more characters to learn!\n");

  // The translation file used to be left open.
  fclose(transFile);
}
int writeLearnedGroups(char * filename)
/*--------------------------------------------------------------
Primary Purpose: Write Learned groups out to file for reading
                 in by readLearnedGroups
Arguments: filename to write learned chars to
Return Value: 1 if successful 0 if not
Effects: Writes contents of LearnedGroups array out to filename.
  LearnedGroups is an array of lists of components that is declared
  in system.cc and initialized by the learn() function.
  File layout, all values in the machine's own binary format:
    NumCharGroups, MaxHWRatio, MinWidth, MinHWRatio
    then for each group: group number, number of components, and
    for each component: the raw Component object, the size of its
    id string (terminator included), the id string, and its
    numProperties property values.
  The raw Component bytes include the values of its pointer members;
  readLearnedGroups replaces those after reading.
  The file is closed on every path, including write failures.
Constraints: LearnedGroups must be initialized and filled with learned
  chars before this function is invoked.
Rev: 11/27 KM
---------------------------------------------------------------*/
{
  assert(LearnedGroups != NULL);

  // "wb": the contents are binary, not text.
  FILE * outfile = fopen(filename, "wb");
  if (outfile == NULL)
  {
    printf("error openning %s \n", filename);
    return 0;
  }

  // ok drops to 0 at the first failed write; every loop below stops
  // as soon as that happens so the file can still be closed.

  // Write global information about learned characters
  int ok =
       fwrite(&NumCharGroups, sizeof(NumCharGroups), 1, outfile) == 1
    && fwrite(&MaxHWRatio, sizeof(MaxHWRatio), 1, outfile) == 1
    && fwrite(&MinWidth, sizeof(MinWidth), 1, outfile) == 1
    && fwrite(&MinHWRatio, sizeof(MinHWRatio), 1, outfile) == 1;

  for (unsigned int i = 0; ok && i < NumCharGroups; i++)
  {
    unsigned int numChars = LearnedGroups[i].length;

    // Write group number and number of characters
    ok = fwrite(&i, sizeof(i), 1, outfile) == 1
      && fwrite(&numChars, sizeof(numChars), 1, outfile) == 1;

    for (ListElement * ptr = LearnedGroups[i].first;
         ok && ptr != NULL; ptr = ptr->next)
    {
      Component * comp = (Component *) ptr->item;
      int stringSize = (int) strlen(comp->fasciiId) + 1;

      ok = fwrite(comp, sizeof(Component), 1, outfile) == 1
        && fwrite(&stringSize, sizeof(stringSize), 1, outfile) == 1
        && fwrite(comp->fasciiId, stringSize, 1, outfile) == 1;

      for (int p = 0; ok && p < numProperties; p++)
      {
        if (fwrite(&(comp->fproperty[p]), sizeof(Property),
                   1, outfile) != 1)
        {
          printf("Error writing properties of comp %c",
                 comp->asciiId());
          ok = 0;
        }
      }
    }
  }

  // fclose() flushes buffered data, so a failure here means the file
  // is incomplete.
  if (fclose(outfile) != 0)
    ok = 0;

  return ok ? 1 : 0;
}
int readLearnedGroups(char * filename)
/*--------------------------------------------------------------
Primary Purpose: Read Learned groups from file that has been
                 created by writeLearnedGroups
Arguments: filename to read learned chars from
Return Value: 1 if successful 0 if not
Effects: Reads contents of filename into LearnedGroups array.
  LearnedGroups is an array of lists of components that is declared
  in system.cc and initialized here or in the learn() function.
  Also reads the globals MaxHWRatio, MinWidth and MinHWRatio.
  Returns 0 (instead of aborting or continuing with garbage) when
  the file is truncated, when its group count differs from
  NumCharGroups, when a group number is out of sequence, or when an
  id string size is not positive. Components read before the failure
  stay in LearnedGroups. The file is closed on every path.
Constraints: The file must have been written by writeLearnedGroups
  of a build with the same Component layout, because Component
  objects are stored as raw bytes.
Rev: 11/27 KM
---------------------------------------------------------------*/
{
  unsigned int numGroups = 0;   // # of groups stored in file.

  initCharBitsSet();
  if (LearnedGroups == NULL)
    LearnedGroups = new Components[NumCharGroups];

  // "rb": the contents are binary, not text.
  FILE * infile = fopen(filename, "rb");
  if (infile == NULL)
  {
    printf("error openning %s \n", filename);
    return 0;
  }

  // Read Globals. The group count is checked before anything else is
  // taken from the file.
  int ok = fread(&numGroups, sizeof(numGroups), 1, infile) == 1
        && numGroups == NumCharGroups
        && fread(&MaxHWRatio, sizeof(MaxHWRatio), 1, infile) == 1
        && fread(&MinWidth, sizeof(MinWidth), 1, infile) == 1
        && fread(&MinHWRatio, sizeof(MinHWRatio), 1, infile) == 1;
  if (!ok)
    printf("error reading header of %s \n", filename);

  for (unsigned int i = 0; ok && i < NumCharGroups; i++)
  {
    unsigned int groupnum = 0;
    unsigned int numChars = 0;

    ok = fread(&groupnum, sizeof(groupnum), 1, infile) == 1
      && groupnum == i
      && fread(&numChars, sizeof(numChars), 1, infile) == 1;
    if (!ok)
    {
      printf("error reading group %u of %s \n", i, filename);
      break;
    }
    printf("\nReading group %u - %u characters\n", i, numChars);

    for (unsigned int c = 0; ok && c < numChars; c++)
    {
      Component * comp = new Component;
      // The raw read below overwrites the pointer members with values
      // from the writing process; keep the freshly constructed
      // property array so it can be put back.
      short int * savepropptr = comp->fproperty;
      int stringSize = 0;

      ok = fread(comp, sizeof(Component), 1, infile) == 1;
      comp->fproperty = savepropptr;
      comp->fasciiId = NULL;   // the stored pointer value is meaningless

      ok = ok
        && fread(&stringSize, sizeof(stringSize), 1, infile) == 1
        && stringSize > 0;
      if (ok)
      {
        comp->fasciiId = new char[stringSize];
        ok = fread(comp->fasciiId, stringSize, 1, infile) == 1;
        // Never trust the file to have terminated the string.
        comp->fasciiId[stringSize - 1] = '\0';
      }
      if (!ok)
        printf("Error reading component");

      for (int p = 0; ok && p < numProperties; p++)
      {
        if (fread(&(comp->fproperty[p]), sizeof(Property),
                  1, infile) != 1)
        {
          printf("Error reading properties");
          ok = 0;
        }
      }

      if (ok)
        LearnedGroups[i].Append(comp);
      // On failure comp is left allocated on purpose: its members may
      // hold bytes from the file, so running its destructor is not
      // known to be safe.
    }
  }

  if (fclose(infile) != 0)
    ok = 0;

  return ok ? 1 : 0;
}
void testLearn()
// Test driver: runs learn() on the class training image and its
// ascii translation, both taken from the same directory.
{
  // The translation path used to read ".../fa95class/..." (missing
  // '/'), which does not match the directory of the image.
  learn("/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.tif",
        "/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.txt");
}
/*****************************************************************
FUNCTIONS BEYOND THIS POINT ARE FOR AVERAGING LEARNED CHARACTERS
AND ARE NOT CURRENTLY USED.
*******************************************************************/
void initLearnedChars()
/*--------------------------------------------------------------
Primary Purpose: Allocates the LearnedChars array with 256 entries
                 and stores each entry's own array offset, cast to
                 char, as its asciiId.
Rev: KM 11/6/95
---------------------------------------------------------------*/
{
  LearnedChars = new Component[256];

  int code = 0;
  while (code < 256)
  {
    LearnedChars[code].asciiId() = (char) code;
    code++;
  }
}
void oldlearn(char * tifFile, char * asciiFile)
/*--------------------------------------------------------------
Primary Purpose: builds property vectors for LearnedChars array
Arguments: tiffFile name of a tiff file to learn from
asciiFile name of an ascii translation file
Effects: Assumes a one to one correspondence between each connected
component on a line of the tif file and each character on the corresponding
line of the ascii file. For learned characters confidence is set
to the number of examples.
Rev: 11/6/95
---------------------------------------------------------------*/
{
FILE * transFile;
transFile = fopen(asciiFile,"r");
Page * learnPage = new Page;
initCharBitsSet();
learnPage->readMap(tifFile);
learnPage->setLines();
learnPage->extractComponents(MinHorizSeparation); /* why minlinesize? */
int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize;
char buffer[maxCharsPerLine];
int i = 0;
int buflength;
bool emptyLine;
Components * components;
Component * item;
int count[256]; // a count of how many of each char have been encountered
int prop[256][numProperties]; // Character property sums. Need ints so that
// property sum does
// not exceed char boundaries
char id;
initLearnedChars();
for (i = 0; i < 256; i++)
{
count[i] = 0;
for (int p = 0; p < numProperties; p++)
prop[i][p] = 0;
}
i=0;
int offset;
while (i < learnPage->numLines() &&
fgets(buffer, maxCharsPerLine, transFile))
{
buflength = strlen(buffer);
components = learnPage->line(i++);
int c = 0;
for (ListElement* ptr = components->first; ptr != NULL;
ptr = ptr->next)
{
item = (Component *)(ptr->item);
// skip over white space
while(whitespace(buffer[c]) && c < buflength)c++;
if (c >= buflength)break;
id = buffer[c++];
count[id]++; // increment character count
for (offset=0; offset < numProperties; offset++)
prop[id][offset] += (item->properties())[offset];
LearnedChars[i].numBits() += item->numBits();
}
}
// now divide by count and put in Learned character
for(int j = 0; j < 256; j++)
{
if(count[j] > 0)
{
for (int offset=0; offset < numProperties; offset++)
prop[j][offset] /= count[j];
LearnedChars[j].numBits() /= count[j];
LearnedChars[j].confid() = count[j];
for (offset=0; offset < numProperties; offset++)
(LearnedChars[j].properties())[offset] = prop[j][offset];
// printf("%d occurrences of %c\n", count[j], (char)j);
printVector(LearnedChars[j].properties(), numProperties);
}
}
}
void oldtestLearn()
// Test driver: learns from train.tif / train.txt in the current
// directory and then, when ENABLE_USER_INTERFACE is set, sends the
// work space a command to delete IMAGE_TAG.
{
  learn("train.tif", "train.txt");

  if (!ENABLE_USER_INTERFACE)
    return;

  docommand(".main_window.display.work_space delete IMAGE_TAG");
}