--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/reference/ocr-simple/learn.cc Thu May 18 23:12:51 2006 +0200
@@ -0,0 +1,395 @@
+/*--------------------------------------------------------------
+ learn.cc -
+ readLearnFiles() - sources the tcl file to change learn files
+ learn(char * tifFile, char * asciiFile)
+ Performs character learning by reading a tiff image and its ascii
+ translation. Characters are partitioned into character groups as
+ described in system.cc. See the learn function for more details.
+
+ writeLearnedGroups(char * filename) Writes learned characters to file
+ readLearnedGroups(char * filename) Reads saved learned characters
+ from file.
+---------------------------------------------------------------*/
+#include "tcl_interface.h"
+#include "system.h"
+#include "learn.h"
+#include "Page.h"
+#include "list.h"
+
+void readLearnFiles()
+/* Passes the Tcl command "source learnfile.tcl" to docommand(), so the
+   learn files named in that script can be changed without recompiling.
+*/
+{
+  docommand("source learnfile.tcl");
+}
+
+bool whitespace(char c)
+// TRUE when c is a newline, tab or space; FALSE for every other character.
+// Used by the learn routines to skip blanks in a transcript line.
+{
+  const bool isBlank = (c == ' ' || c == '\t' || c == '\n');
+  return isBlank ? TRUE : FALSE;
+}
+
+
+void learn(char * tifFile, char * asciiFile)
+/*--------------------------------------------------------------
+Primary Purpose: Learns from TIFF and ascii file. Files each learned
+                 component into LearnedGroups[charGroup] and updates the
+                 learned limits MaxHWRatio, MinHWRatio and MinWidth.
+Arguments: tifFile   name of a tiff file to learn from
+           asciiFile name of an ascii translation file
+Effects: Assumes a one to one correspondence between each connected
+component on a line of the tif file and each non-whitespace character on
+the corresponding line of the ascii file. Learned components are detached
+from the page (their list slot is set to NULL) before the page is deleted.
+Prints a message and returns early if either file cannot be read; the
+transcript file and the page are released on every path after they exist.
+Rev: 11/20/95
+---------------------------------------------------------------*/
+{
+  FILE * transFile = fopen(asciiFile, "r");
+  if (!transFile)
+  {
+    printf("Could not open the ascii learn file");
+    return;
+  }
+  if (LearnedGroups == NULL)
+    LearnedGroups = new Components[NumCharGroups];
+
+  Page * learnPage = new Page;
+  initCharBitsSet();
+  if (learnPage->readMap(tifFile) != VALID)
+  {
+    printf("Problem opening the learn image file (file doesn't exist?)\n");
+    delete learnPage;     // do not leak the page or the open
+    fclose(transFile);    // transcript when the image is unreadable
+    return;
+  }
+  learnPage->setLines();
+  learnPage->extractComponents();
+
+  // NOTE(review): the buffer size is derived from the image width; a
+  // quotient below 1 would give a zero-length array - confirm callers
+  // never pass an image that narrow.
+  int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize;
+  char buffer[maxCharsPerLine];
+  int i = 0;              // index of the page line being processed
+  double height = 0.0;    // tallest component seen so far (never reset)
+
+  while (i < learnPage->numLines() &&
+         fgets(buffer, maxCharsPerLine, transFile))
+  {
+    int buflength = strlen(buffer);
+    Components * components = learnPage->line(i++);
+    int c = 0;            // read position in the transcript line
+    for (ListElement* ptr = components->first; ptr != NULL;
+         ptr = ptr->next)
+    {
+      Component * item = (Component *)(ptr->item);
+
+      // skip over white space; test the bound before reading buffer[c]
+      while (c < buflength && whitespace(buffer[c])) c++;
+
+      // transcript line ran out before the components did
+      if (c >= buflength)
+        break;
+
+      item->asciiId() = buffer[c++];
+
+      LearnedGroups[item->charGroup].Append((void*) item);
+      ptr->item = NULL;   // Set to Null in page so it wont get
+                          // clobbered on delete
+      int h = item->lr().y() - item->ul().y();
+      if (h > height) height = h;
+      double width = item->lr().x() - item->ul().x();
+
+      if (width < MinWidth)
+        MinWidth = (int) width;
+
+      // Dividing by a zero width would turn the ratios into inf/NaN and
+      // corrupt the learned limits, so such a box is left out of them.
+      if (width <= 0)
+        continue;
+
+      // NOTE(review): the max ratio uses the running tallest height while
+      // the min ratio uses this component's own height - confirm intended.
+      if (height/width > MaxHWRatio)
+        MaxHWRatio = height/width;
+      if (h/width < MinHWRatio)
+        MinHWRatio = h/width;
+    }
+  }
+
+  if (fgets(buffer, maxCharsPerLine, transFile))
+    printf("Uh, oh. There are more characters to learn!\n");
+  fclose(transFile);      // close the transcript opened above
+  delete learnPage;
+}
+
+
+int writeLearnedGroups(char * filename)
+/*--------------------------------------------------------------
+Primary Purpose: Write Learned groups out to file for reading
+                 in by readLearnedGroups
+Arguments: filename to write learned chars to
+Return Value: 1 if successful 0 if not
+Effects: Writes contents of LearnedGroups array out to filename.
+LearnedGroups is an array of lists of components that is declared
+in system.cc and initialized by the learn() function.
+The file starts with NumCharGroups, MaxHWRatio, MinWidth and MinHWRatio.
+Then for each group it holds the group number, the number of Components
+in the group, and for every Component its raw bytes followed by its
+numProperties property values. The data is raw fwrite output, so the
+file is opened in binary mode. Every write is checked; on any failure
+the file is still closed and 0 is returned.
+Constraints: LearnedGroups must be initialized and filled with learned
+chars before this function is invoked.
+Rev: 11/27 KM
+---------------------------------------------------------------*/
+{
+  assert(LearnedGroups != NULL);
+
+  FILE * outfile = fopen(filename, "wb");
+  if (outfile == NULL)
+  {
+    printf("error openning %s \n", filename);
+    return 0;
+  }
+
+  // Write global information about learned characters. ok drops to 0
+  // at the first short write and then stops every loop below.
+  int ok = fwrite(&NumCharGroups, sizeof(NumCharGroups), 1, outfile) == 1
+        && fwrite(&MaxHWRatio, sizeof(MaxHWRatio), 1, outfile) == 1
+        && fwrite(&MinWidth, sizeof(MinWidth), 1, outfile) == 1
+        && fwrite(&MinHWRatio, sizeof(MinHWRatio), 1, outfile) == 1;
+
+  for (unsigned int i = 0; ok && i < NumCharGroups; i++)
+  {
+    unsigned int numChars = LearnedGroups[i].length;
+    // Write group number and number of characters
+    ok = fwrite(&i, sizeof(i), 1, outfile) == 1
+      && fwrite(&numChars, sizeof(numChars), 1, outfile) == 1;
+
+    for (ListElement * ptr = LearnedGroups[i].first;
+         ok && ptr != NULL; ptr = ptr->next)
+    {
+      Component * comp = (Component *) ptr->item;
+
+      // The raw Component bytes include the fproperty pointer value,
+      // which means nothing in another process; the property values
+      // themselves are written right after the object.
+      ok = fwrite(comp, sizeof(Component), 1, outfile) == 1;
+
+      for (int p = 0; ok && p < numProperties; p++)
+      {
+        ok = fwrite(&(comp->fproperty[p]), sizeof(Property),
+                    1, outfile) == 1;
+        if (!ok)
+          printf("Error writing properties of comp %c",
+                 comp->asciiId());
+      }
+    }
+  }
+
+  // fclose flushes buffered output, so a failure there means the data
+  // did not all reach the file and counts as a failed write.
+  if (fclose(outfile) != 0)
+    ok = 0;
+  return ok ? 1 : 0;
+}
+
+int readLearnedGroups(char * filename)
+/*--------------------------------------------------------------
+Primary Purpose: Read Learned groups from file that has been
+                 created by writeLearnedGroups
+Arguments: filename to read learned chars from
+Return Value: 1 if successful 0 if not
+Effects: Reads contents of filename into LearnedGroups array
+LearnedGroups is an array of lists of components that is declared
+in system.cc and initialized here or in the learn() function.
+A short read or a mismatched group count/number prints a message and
+returns 0; the file is closed on every path.
+Constraints: LearnedGroups must not yet be initialized
+Rev: 11/27 KM
+---------------------------------------------------------------*/
+{
+  unsigned int numGroups = 0; // # of groups stored in file.
+
+  initCharBitsSet();
+  if (LearnedGroups == NULL)
+    LearnedGroups = new Components[NumCharGroups];
+
+  FILE * infile = fopen(filename, "rb");  // raw fwrite data: binary mode
+  if (infile == NULL)
+  {
+    printf("error openning %s \n", filename);
+    return 0;
+  }
+
+  // Read Globals; the limits are only read once the group count matches
+  int ok = fread(&numGroups, sizeof(numGroups), 1, infile) == 1
+        && numGroups == NumCharGroups
+        && fread(&MaxHWRatio, sizeof(MaxHWRatio), 1, infile) == 1
+        && fread(&MinWidth, sizeof(MinWidth), 1, infile) == 1
+        && fread(&MinHWRatio, sizeof(MinHWRatio), 1, infile) == 1;
+  if (!ok)
+    printf("Error reading header of %s\n", filename);
+
+  for (unsigned int i = 0; ok && i < NumCharGroups; i++)
+  {
+    unsigned int groupnum = 0;
+    unsigned int numChars = 0;
+    ok = fread(&groupnum, sizeof(groupnum), 1, infile) == 1 && groupnum == i
+      && fread(&numChars, sizeof(numChars), 1, infile) == 1;
+    if (!ok) { printf("Error reading header of group %u\n", i); break; }
+
+    printf("\nReading group %u - %u characters\n", i, numChars);
+    for (unsigned int c = 0; ok && c < numChars; c++)
+    {
+      Component * comp = new Component;
+      // fread replaces every byte of the object, including the fproperty
+      // pointer of the fresh Component, so save it and put it back.
+      short int * savepropptr = comp->fproperty;
+      ok = fread(comp, sizeof(Component), 1, infile) == 1;
+      comp->fproperty = savepropptr;
+      for (int p = 0; ok && p < numProperties; p++)
+        ok = fread(&(comp->fproperty[p]), sizeof(Property), 1, infile) == 1;
+      if (!ok)
+      {
+        // NOTE(review): comp is not deleted here - its bytes may be partly
+        // file data and the destructor's behaviour on them is unknown.
+        printf("Error reading properties");
+        break;
+      }
+      LearnedGroups[i].Append(comp);
+    }
+  }
+
+  if (fclose(infile) != 0)
+    ok = 0;
+  return ok ? 1 : 0;
+}
+
+void testLearn()  // trial run of learn() on the hard-coded class training data
+{
+  char * tif = "/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.tif";
+  char * txt = "/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.txt";
+  learn(tif, txt);
+}
+
+/*****************************************************************
+ FUNCTIONS BEYOND THIS POINT ARE FOR AVERAGING LEARNED CHARACTERS
+ AND ARE NOT CURRENTLY USED.
+*******************************************************************/
+
+void initLearnedChars()
+/*--------------------------------------------------------------
+Primary Purpose: Allocates a fresh 256-entry LearnedChars array and
+stores each entry's own index (cast to char) in its asciiId.
+Any array LearnedChars pointed to before the call is not released here.
+Rev: KM 11/6/95
+---------------------------------------------------------------*/
+{
+  const int tableSize = 256;
+  LearnedChars = new Component[tableSize];
+
+  Component * entry = LearnedChars;
+  for (int code = 0; code != tableSize; ++code, ++entry)
+    entry->asciiId() = (char)code;
+}
+
+void oldlearn(char * tifFile, char * asciiFile)
+/*--------------------------------------------------------------
+Primary Purpose: builds property vectors for LearnedChars array
+Arguments: tifFile   name of a tiff file to learn from
+           asciiFile name of an ascii translation file
+Effects: Assumes a one to one correspondence between each connected
+component on a line of the tif file and each character on the corresponding
+line of the ascii file. For learned characters confidence is set
+to the number of examples, and properties and numBits are divided by that
+count. Prints a message and returns if either input cannot be read.
+Rev: 11/6/95
+---------------------------------------------------------------*/
+{
+  FILE * transFile = fopen(asciiFile,"r");
+  if (!transFile)
+  {
+    printf("Could not open the ascii learn file");
+    return;
+  }
+  Page * learnPage = new Page;
+  initCharBitsSet();
+  if (learnPage->readMap(tifFile) != VALID)
+  {
+    printf("Problem opening the learn image file (file doesn't exist?)\n");
+    delete learnPage;
+    fclose(transFile);
+    return;
+  }
+  learnPage->setLines();
+  learnPage->extractComponents(); /* why minlinesize? */
+  int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize;
+  char buffer[maxCharsPerLine];
+  int count[256]; // a count of how many of each char have been encountered
+  int prop[256][numProperties]; // Character property sums. Need ints so
+                                // the sums do not exceed char boundaries
+  initLearnedChars();
+  for (int k = 0; k < 256; k++)
+  {
+    count[k] = 0;
+    for (int p = 0; p < numProperties; p++)
+      prop[k][p] = 0;
+  }
+  int i = 0;   // index of the page line being processed
+  while (i < learnPage->numLines() &&
+         fgets(buffer, maxCharsPerLine, transFile))
+  {
+    int buflength = strlen(buffer);
+    Components * components = learnPage->line(i++);
+    int c = 0;
+    for (ListElement* ptr = components->first; ptr != NULL;
+         ptr = ptr->next)
+    {
+      Component * item = (Component *)(ptr->item);
+      while (c < buflength && whitespace(buffer[c])) c++; // skip white space
+      if (c >= buflength) break;
+      // Index the 256-entry tables with an unsigned value: plain char may
+      // be signed, and a negative id would land before the arrays.
+      unsigned char id = (unsigned char) buffer[c++];
+      count[id]++; // increment character count
+      for (int offset = 0; offset < numProperties; offset++)
+        prop[id][offset] += (item->properties())[offset];
+      // Accumulate per character (id), not per page line (i), so the
+      // division by count[j] below yields a per-character average.
+      LearnedChars[id].numBits() += item->numBits();
+    }
+  }
+  // now divide by count and put in Learned character
+  for (int j = 0; j < 256; j++)
+  {
+    if (count[j] > 0)
+    {
+      LearnedChars[j].numBits() /= count[j];
+      LearnedChars[j].confid() = count[j];
+      for (int offset = 0; offset < numProperties; offset++)
+        (LearnedChars[j].properties())[offset] = prop[j][offset] / count[j];
+      printVector(LearnedChars[j].properties(), numProperties);
+    }
+  }
+  fclose(transFile);
+  delete learnPage;
+}
+
+void oldtestLearn()
+// Runs learn() on the relative paths train.tif / train.txt and then, when
+// ENABLE_USER_INTERFACE is set, passes the work space "delete IMAGE_TAG"
+// command to docommand().
+{
+  learn("train.tif", "train.txt");
+  if (!(ENABLE_USER_INTERFACE))
+    return;
+  docommand(".main_window.display.work_space delete IMAGE_TAG"); //6/16/00
+}
+