--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/reference/ocr-new/learn.cc Thu May 18 23:12:51 2006 +0200
@@ -0,0 +1,558 @@
+/*--------------------------------------------------------------
+ Learn.cc -
+ readlearnfiles - sources the tcl file to change learn files
+ learn(char * tifffile, char * asciifile)
+ Performs character learning by reading tiff and ascii translation
+ Characters are partitioned into character groups as described
+ in system.cc. See learn function for more details
+
+ writeLearnedGroups(char * filename) Writes learned characters to file
+ readLearnedGroups(char * filename) Reads saved learned characters
+ from file.
+---------------------------------------------------------------*/
+#include "tcl_interface.h"
+#include "system.h"
+#include "learn.h"
+#include "Page.h"
+#include "list.h"
+
+void readLearnFiles()
+/*--------------------------------------------------------------
+Primary Purpose: Passes "source learnfile.tcl" to docommand(), so new
+learn files can be specified in that script without recompiling **/
+{
+ docommand("source learnfile.tcl");
+}
+
+bool whitespace(char c)
+// TRUE for a newline, tab, or space character; FALSE for anything else.
+{
+    const bool isSpace = (c == ' ');
+    const bool isBreak = (c == '\n' || c == '\t');
+    return (isSpace || isBreak) ? TRUE : FALSE;
+}
+
+bool blank(char * string)
+// TRUE when string is NULL, empty, or made up only of characters that
+// whitespace() accepts; FALSE as soon as any other character is found.
+{
+    if (string == NULL) return TRUE;
+    for (const char * p = string; *p != '\0'; p++)
+    {
+        if (!whitespace(*p)) return FALSE;
+    }
+    return TRUE;
+}
+
+
+void printLearnedGroups()
+// Debug aid: prints fasciiId and charGroup of every learned component.
+{
+    for (unsigned int group = 0; group < NumCharGroups; group++) {
+        ListElement * node = LearnedGroups[group].first;
+        while (node != NULL) {
+            Component * learned = (Component *) node->item;
+            printf("learned char %s, group %d\n", learned->fasciiId, learned->charGroup);
+            node = node->next;
+        }
+    }
+}
+
+int lengthNextWord(char * buffer,int offset, int buflength)
+// Counts the characters of the word that starts at buffer[offset] and
+// runs to the next whitespace character or to buflength.  A bracketed
+// run such as "<ae>" counts as one character.
+{
+    int count = 0;   // must start at zero: it is only ever incremented
+    for (int c = offset; c < buflength && !(whitespace(buffer[c])); c++)
+    {
+        if (buffer[c] == '<')
+        {
+            // Stop on '>' (the for-increment steps past it); bound first.
+            while ((c < buflength) && (buffer[c] != '>'))
+                c++;
+        }
+        count++;
+    }
+    return count;
+}
+
+
+int learn(Component * comp, char * id, Confidence threshold)
+/*--------------------------------------------------------------
+Primary Purpose: Make a copy of this component and add it to
+ LearnedGroups. id is ascii identification.
+ Component is only learned if its confidence is
+ below threshold or if id differs from its fasciiId.
+Arguments: comp - component to learn
+ id - ascii identification
+ threshold - confidence threshold for learning
+Return Value: 1 if component was learned, 0 otherwise
+Rev: 4/25/96
+---------------------------------------------------------------*/
+{
+    // strcmp() returns 0 for equal strings, so "ids match" is == 0.  The
+    // comparison still runs only when the confidence test did not decide.
+    if (!(comp->confid() < threshold) && strcmp(comp->fasciiId, id) == 0)
+        return 0;
+
+    Component * newcomp = comp->copy();
+    // The id is array-allocated (new char[]), so release it with delete [].
+    delete [] newcomp->fasciiId;
+    newcomp->fasciiId = new char[strlen(id)+1];
+    strcpy(newcomp->fasciiId , id);
+
+    LearnedGroups[newcomp->charGroup].Append(newcomp);
+    return 1;
+}
+
+void learn(char * tifFile, char * asciiFile, bool synchwords)
+/*--------------------------------------------------------------
+Primary Purpose: Learns from TIFF and ascii file. Groups learned
+ characters by baseline into LearnedGroups and
+ sets properties.
+Arguments: tifFile name of a tiff file to learn from
+ asciiFile name of an ascii translation file
+ synchwords passed through to learn(Page *, ...)
+Effects: Assumes a one to one correspondence between each connected
+component on a line of the tif file and each character on the corresponding
+line of the ascii file.
+Rev: 4/26/96
+---------------------------------------------------------------------*/
+{
+    Page * learnPage = new Page;
+    initCharBitsSet();
+    if(learnPage->readMap(tifFile) != VALID)
+    {
+        printf("Problem opening the learn image file (file doesn't exist?)\n");
+        delete learnPage;   // nothing was learned from it, so release it
+        return;
+    }
+    learnPage->setLines();
+    learnPage->extractComponents(MinHorizSeparation);
+    learnPage->extractWords();
+    learn(learnPage, asciiFile, synchwords);
+
+    // learnPage is deliberately not deleted on success: learn(Page *, ...)
+    // appends the page's own Component objects to LearnedGroups.
+}
+
+
+void learn(Page * learnPage, char * asciiFile, bool synchWords)
+/*--------------------------------------------------------------
+Primary Purpose: Learns from a Page and an ascii file. Used from
+ tcl user interface under File/Learn option
+ Groups learned
+ characters by baseline into LearnedGroups and
+ sets properties.
+Arguments: learnPage page whose words and lines are learned from
+ asciiFile name of an ascii translation file
+ synchWords when TRUE, a word is skipped unless its
+ charCount equals the length of the next word of text
+Effects: Assumes a one to one correspondence between each connected
+component on a line of the page and each character on the corresponding
+line of the ascii file. Updates MaxHWRatio, MinHWRatio and MinWidth.
+
+Rev: 4/26/96
+---------------------------------------------------------------*/
+{
+    FILE * transFile = fopen(asciiFile,"r");
+    if(!transFile)
+    {
+        printf("Could not open the ascii learn file");
+        return;
+    }
+    if (LearnedGroups == NULL)
+        LearnedGroups = new Components[NumCharGroups];
+
+    // The text buffer is sized from the image width and MinLineSize.
+    int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize;
+    char buffer[maxCharsPerLine];
+    int i = -1;        // index of the current page line
+    int buflength=0;
+    Components * components = NULL;
+    Component * item;
+
+    double width, height = 0.0;
+    int h;
+
+    Words * words = learnPage->words();
+    int c = 0;         // read position within buffer
+    Word * word;
+
+    for (ListElement * ptr = words->first; ptr != NULL &&
+             (i < learnPage->numLines()) ; ptr = ptr->next)
+    {
+        word = (Word *) ptr->item;
+        // if new line get new text line
+        if (word->characters[0] == '\n' || buflength == 0)
+        {
+            char * ok;
+            do {
+                ok =fgets(buffer, maxCharsPerLine, transFile);
+            } while (ok && blank(buffer)); // skip blank lines.
+            // A failed fgets() does not provide a new line of text, so stop
+            // instead of learning from stale or uninitialized buffer data.
+            if (!ok) break;
+            buflength= strlen(buffer);
+            components = learnPage->line(++i);
+            c =0;
+            if (word->characters[0] == '\n') continue;
+        }
+
+        // skip over white space (bound is tested before buffer is read)
+        while(c < buflength && whitespace(buffer[c]))c++;
+
+        // With synchWords, learn only from words whose component count
+        // equals the number of characters in the next word of text.
+        if (synchWords &&
+            (word->charCount != lengthNextWord(buffer,c,buflength)))
+        {
+            // skip over this word
+            while(c < buflength && !(whitespace(buffer[c])))
+                c++;
+            continue; // move on to the next word
+        }
+
+        for (int ch = 0; ch < word->charCount; ch++)
+        {
+            while(c < buflength && whitespace(buffer[c]))c++;
+            item = word->character[ch];
+            if (c >= buflength) break;
+
+            // Link string translation to component. Characters between
+            // brackets are for one component.
+            if(buffer[c] == '<')
+            {
+                int startString = c;
+                // Find the closing '>' without scanning past the text read.
+                int endString = c+1;
+                while(endString < buflength && buffer[endString] != '>')
+                    endString++;
+                if (endString < buflength) endString++; // include the '>'
+
+                int stringSize = endString - startString;
+                char newstring[stringSize+1];
+                strncpy(newstring, &buffer[startString],stringSize);
+                newstring[stringSize] = '\0';
+                // learn if id's don't match or below threshold
+                learn(item, newstring, ConfidenceThreshold);
+                c = endString;
+            }
+            else
+            {
+                char newstring[2];
+                newstring[0] = buffer[c++];
+                newstring[1]= '\0';
+                learn(item, newstring, ConfidenceThreshold);
+            }
+
+            // NOTE(review): this also appends the page's own component, in
+            // addition to any copy added by learn() above - confirm intended.
+            LearnedGroups[item->charGroup].Append(item);
+            //ptr->item = NULL; // Set to Null in page so it wont get
+            // clobbered on delete
+            // Track the height/width ratio extremes and the narrowest
+            // width seen among the components processed here.
+            h = item->lr().y() - item->ul().y();
+            if (h > height) height = h;
+            width = item->lr().x() - item->ul().x();
+            if (height/width > MaxHWRatio)
+                MaxHWRatio = height/width;
+
+            if (h/width < MinHWRatio)
+                MinHWRatio = h/width;
+
+            if (width < MinWidth)
+                MinWidth = (int) width;
+        }
+    }
+
+    if (fgets(buffer, maxCharsPerLine, transFile))
+        printf("Uh, oh. There are more characters to learn!\n");
+    /* printf("Maximum height/width ratio = %f\n", MaxHWRatio); */
+    /* printf("Minimum height/width ratio = %f\n", MinHWRatio); */
+
+    // Every path that opened the file ends here, so close it once.
+    fclose(transFile);
+
+    // printLearnedGroups();
+}
+
+
+int writeLearnedGroups(char * filename)
+/*--------------------------------------------------------------
+Primary Purpose: Write Learned groups out to file for reading
+ in by readLearnedGroups
+Arguments: filename to write learned chars to
+Return Value: 1 if successful 0 if not
+Effects: Writes contents of LearnedGroups array out to filename
+LearnedGroups is an array of lists of components that is declared
+in system.cc and initialized by the learn() function.
+For each group writes the group number and the number of Components
+the group contains followed by the group data.
+Other learned values such as MinWidth MinHWRatio etc are written to
+the file as well.
+Writing stops at the first failed write; the file is always closed.
+Constraints: LearnedGroups must be initialized and filled with learned
+chars before this function is invoked.
+Rev: 11/27 KM
+---------------------------------------------------------------*/
+{
+    assert(LearnedGroups != NULL);
+
+    // Raw binary records are written, so open the file in binary mode.
+    FILE * outfile = fopen(filename, "wb");
+    if (outfile == NULL)
+    {
+        printf("error openning %s \n", filename);
+        return 0;
+    }
+
+    // Write global information about learned characters
+    bool ok = fwrite(&NumCharGroups, sizeof(NumCharGroups),1, outfile) == 1
+        && fwrite(&MaxHWRatio, sizeof(MaxHWRatio),1, outfile) == 1
+        && fwrite(&MinWidth, sizeof(MinWidth),1,outfile) == 1
+        && fwrite(&MinHWRatio, sizeof(MinHWRatio),1,outfile) == 1;
+
+    for(unsigned int i = 0; ok && i < NumCharGroups; i++)
+    {
+        unsigned int numChars = LearnedGroups[i].length;
+        // Write group number and number of characters
+        ok = fwrite(&i, sizeof(i), 1, outfile) == 1
+            && fwrite(&numChars, sizeof(numChars),1,outfile) == 1;
+
+        for(ListElement * ptr = LearnedGroups[i].first;
+            ok && ptr != NULL; ptr = ptr->next)
+        {
+            Component * comp = (Component *) ptr->item;
+            // Record layout: raw Component, id length (NUL included), id
+            // bytes, then numProperties Property values.
+            int stringSize = strlen(comp->fasciiId) +1;
+            ok = fwrite(comp, sizeof(Component),1,outfile) == 1
+                && fwrite(&stringSize, sizeof(stringSize),1,outfile) == 1
+                && fwrite(comp->fasciiId, stringSize,1,outfile) == 1;
+
+            for(int p = 0; ok && p < numProperties; p++)
+            {
+                if (fwrite(&(comp->fproperty[p]),
+                           sizeof(Property),
+                           1, outfile) != 1)
+                {
+                    printf("Error writing properties of comp %c",
+                           comp->asciiId());
+                    ok = FALSE;
+                }
+            }
+        }
+    }
+
+    // Close even after a failed write so the handle is not leaked; a
+    // failed close also counts as failure.
+    if (fclose(outfile) != 0) ok = FALSE;
+    return ok ? 1 : 0;
+}
+
+int readLearnedGroups(char * filename)
+/*--------------------------------------------------------------
+Primary Purpose: Read Learned groups from file that has been
+ created by writeLearnedGroups
+Arguments: filename to read learned chars from
+Return Value: 1 if successful 0 if not (unreadable, truncated, or
+ group count/numbering differs from NumCharGroups)
+Effects: Reads contents of filename into LearnedGroups array
+LearnedGroups is an array of lists of components that is declared
+in system.cc and initialized here or in the learn() function.
+Components read before a failure stay in LearnedGroups.
+Constraints: LearnedGroups must not yet be initialized
+Rev: 11/27 KM
+---------------------------------------------------------------*/
+{
+    unsigned int numGroups = 0; // # of groups stored in file.
+
+    initCharBitsSet();
+    if(LearnedGroups == NULL)
+        LearnedGroups = new Components[NumCharGroups];
+
+    // The file holds raw binary records, so open it in binary mode.
+    FILE * infile = fopen(filename, "rb");
+    if (infile == NULL)
+    {
+        printf("error openning %s \n", filename);
+        return 0;
+    }
+
+    // Read Globals.  A short read or a group-count mismatch is reported
+    // through the return value instead of being ignored or asserted on.
+    bool ok = fread(&numGroups, sizeof(numGroups),1, infile) == 1
+        && numGroups == NumCharGroups
+        && fread(&MaxHWRatio, sizeof(MaxHWRatio),1, infile) == 1
+        && fread(&MinWidth, sizeof(MinWidth),1,infile) == 1
+        && fread(&MinHWRatio, sizeof(MinHWRatio),1,infile) == 1;
+
+    for(unsigned int i = 0; ok && i < NumCharGroups; i++)
+    {
+        unsigned int groupnum = 0;
+        unsigned int numChars = 0;
+        ok = fread(&groupnum, sizeof(groupnum), 1, infile) == 1 && groupnum == i
+            && fread(&numChars, sizeof(numChars),1,infile) == 1;
+        if (!ok) break;
+        printf("\nReading group %d - %d characters\n",i,numChars);
+        for(unsigned int c = 0; ok && c< numChars; c++)
+        {
+            Component * comp = new Component;
+            short int * savepropptr = comp->fproperty;
+            int stringSize = 0;
+            // The raw read overwrites comp's pointer members with values saved
+            // by the writer; restore fproperty and clear fasciiId right away.
+            ok = fread(comp, sizeof(Component),1,infile) == 1;
+            comp->fproperty = savepropptr;
+            comp->fasciiId = NULL;
+            // stringSize counts the terminating NUL, so it must be >= 1.
+            ok = ok && fread(&stringSize, sizeof(stringSize),1,infile) == 1 && stringSize > 0;
+            if (ok)
+            {
+                comp->fasciiId = new char[stringSize];
+                ok = fread(comp->fasciiId, stringSize,1,infile) == 1;
+                comp->fasciiId[stringSize-1] = '\0'; // never leave it unterminated
+            }
+            for(int p = 0; ok && p < numProperties; p++)
+            {
+                if (fread(&(comp->fproperty[p]), sizeof(Property), 1, infile) != 1)
+                {
+                    printf("Error reading properties");
+                    ok = FALSE;
+                }
+            }
+            // A partly read comp is not added, and not deleted either: TODO confirm ~Component is safe on it.
+            if (ok) LearnedGroups[i].Append(comp);
+        }
+    }
+    if (fclose(infile) != 0) ok = FALSE;
+    return ok ? 1 : 0;
+}
+
+void testLearn()
+// Runs learn() on the cs169-ab train.tif image and its train.txt text.
+{
+    learn("/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.tif",
+          "/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.txt");
+}
+
+/*****************************************************************
+ FUNCTIONS BEYOND THIS POINT ARE FOR AVERAGING LEARNED CHARACTERS
+ AND ARE NOT CURRENTLY USED.
+*******************************************************************/
+
+void initLearnedChars()
+/*--------------------------------------------------------------
+Primary Purpose: Allocates the 256-entry LearnedChars array and
+stores each entry's own index, cast to char, as its asciiId.
+Rev: KM 11/6/95
+---------------------------------------------------------------*/
+{
+    const int tableSize = 256;
+    LearnedChars = new Component[tableSize];
+
+    // Walk the table with a pointer while counting the character code.
+    Component * entry = LearnedChars;
+    for (int code = 0; code < tableSize; code++, entry++)
+        entry->asciiId() = (char)code;
+}
+
+void oldlearn(char * tifFile, char * asciiFile)
+/*--------------------------------------------------------------
+Primary Purpose: builds property vectors for LearnedChars array
+Arguments: tifFile name of a tiff file to learn from
+ asciiFile name of an ascii translation file
+Effects: Assumes a one to one correspondence between each connected
+component on a line of the tif file and each character on the corresponding
+line of the ascii file. For learned characters confidence is set
+to the number of examples. Returns early if a file cannot be opened.
+Rev: 11/6/95
+---------------------------------------------------------------*/
+{
+    FILE * transFile = fopen(asciiFile,"r");
+    if (!transFile)
+    {
+        printf("Could not open the ascii learn file");
+        return;
+    }
+    Page * learnPage = new Page;
+    initCharBitsSet();
+    if (learnPage->readMap(tifFile) != VALID)
+    {
+        printf("Problem opening the learn image file (file doesn't exist?)\n");
+        fclose(transFile);
+        delete learnPage;
+        return;
+    }
+    learnPage->setLines();
+    learnPage->extractComponents(MinHorizSeparation); /* why minlinesize? */
+    int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize;
+    char buffer[maxCharsPerLine];
+    int count[256]; // a count of how many of each char have been encountered
+    int prop[256][numProperties]; // Character property sums. Need ints so
+                                  // that a sum does not exceed char boundaries
+    initLearnedChars();
+    for (int ch = 0; ch < 256; ch++)
+    {
+        count[ch] = 0;
+        for (int p = 0; p < numProperties; p++)
+            prop[ch][p] = 0;
+    }
+    int i = 0;
+    while (i < learnPage->numLines() &&
+           fgets(buffer, maxCharsPerLine, transFile))
+    {
+        int buflength = strlen(buffer);
+        Components * components = learnPage->line(i++);
+        int c = 0;
+        for (ListElement* ptr = components->first; ptr != NULL; ptr = ptr->next)
+        {
+            Component * item = (Component *)(ptr->item);
+            // skip over white space (bound is tested before buffer is read)
+            while(c < buflength && whitespace(buffer[c]))c++;
+            if (c >= buflength)break;
+            // Index through unsigned char: a negative char would underrun.
+            int id = (unsigned char) buffer[c++];
+            count[id]++; // increment character count
+            for (int offset=0; offset < numProperties; offset++)
+                prop[id][offset] += (item->properties())[offset];
+            // Accumulate per character (id), not per line number (i).
+            LearnedChars[id].numBits() += item->numBits();
+        }
+    }
+    fclose(transFile);
+    delete learnPage; // only values were copied out of the page above
+    // now divide by count and put in Learned character
+    for(int j = 0; j < 256; j++)
+    {
+        if(count[j] > 0)
+        {
+            for (int offset=0; offset < numProperties; offset++)
+                prop[j][offset] /= count[j];
+            LearnedChars[j].numBits() /= count[j];
+            LearnedChars[j].confid() = count[j];
+            for (int offset=0; offset < numProperties; offset++)
+                (LearnedChars[j].properties())[offset] = prop[j][offset];
+// printf("%d occurrences of %c\n", count[j], (char)j);
+            printVector(LearnedChars[j].properties(), numProperties);
+        }
+    }
+}
+
+void oldtestLearn()
+// Learns from train.tif / train.txt, then, when ENABLE_USER_INTERFACE is
+// set, sends the work_space "delete IMAGE_TAG" command via docommand().
+{
+    learn("train.tif", "train.txt");
+    if (!ENABLE_USER_INTERFACE) return;
+    docommand(".main_window.display.work_space delete IMAGE_TAG");
+}
+
+
+
+
+
+