reference/ocr-new/learn.cc
changeset 0 6b8091ca909a
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/reference/ocr-new/learn.cc	Thu May 18 23:12:51 2006 +0200
@@ -0,0 +1,558 @@
+/*--------------------------------------------------------------
+ learn.cc - character learning routines
+ readLearnFiles - sources the tcl file that names the learn files
+ learn(char * tifFile, char * asciiFile, bool synchwords)
+ Performs character learning by reading a tiff image and its ascii
+ translation.  Characters are partitioned into character groups as
+ described in system.cc.  See the learn function for more details.
+
+ writeLearnedGroups(char * filename) Writes learned characters to file
+ readLearnedGroups(char * filename) Reads saved learned characters 
+                                    from file. 
+---------------------------------------------------------------*/
+#include "tcl_interface.h"
+#include "system.h"
+#include "learn.h"
+#include "Page.h"
+#include "list.h"
+
+// Runs the command "source learnfile.tcl" through docommand().
+// learnfile.tcl is where new learn files can be specified without
+// recompiling.
+void readLearnFiles()
+{
+  docommand("source learnfile.tcl");
+}
+
+// Returns TRUE if c is a newline, a tab or a space, FALSE otherwise.
+// Used by the text-parsing loops in this file.
+bool whitespace(char c)
+{
+  switch (c) { case '\n': case '\t': case ' ': return TRUE; }
+  return FALSE;
+}
+
+// Returns TRUE when string is NULL, empty, or made up solely of
+// characters that whitespace() accepts; FALSE otherwise.
+bool blank(char * string)
+{
+  if (string == NULL) return TRUE;
+  const char * cursor = string;
+  // Advance past whitespace; a blank string is exhausted by this.
+  while (*cursor != '\0' && whitespace(*cursor))
+    cursor++;
+  return (*cursor == '\0') ? TRUE : FALSE;
+}
+
+
+// Debug aid: print the ascii id and group of each learned component.
+void printLearnedGroups()
+{
+  for (unsigned int group = 0; group < NumCharGroups; group++)
+    {
+      ListElement * node = LearnedGroups[group].first;
+      for ( ; node != NULL; node = node->next) {
+        Component * learned = (Component *) node->item;
+        printf("learned char %s, group %d\n", learned->fasciiId, learned->charGroup);
+      }
+    }
+}
+
+// Number of characters in the word starting at buffer[offset], up to
+// the next whitespace or buflength; a '<...>' run counts as one.
+// count is zero-initialized and c is bounds-checked before each read.
+int lengthNextWord(char * buffer,int offset, int buflength)
+{
+  int count = 0;
+  for(int c=offset; c < buflength && !(whitespace(buffer[c])); c++)
+    {
+      if(buffer[c] == '<')
+        {
+          // consume through the closing '>' or the end of the buffer
+          while((c < buflength) && (buffer[c] != '>'))
+            c++;
+        }
+      count++;
+    }
+  return count;
+}
+
+
+int learn(Component * comp, char * id, Confidence threshold)
+/*--------------------------------------------------------------
+Primary Purpose: Make a copy of this component, label the copy with
+                 id and append it to LearnedGroups[copy's charGroup].
+                 The component is learned only if its confidence
+                 is below threshold or its ascii id differs from id.
+Arguments: comp - component to learn
+           id - ascii identification (NUL-terminated string)
+           threshold - confidence threshold for learning
+Return Value: 1 if component was learned, 0 otherwise
+Notes: the id test used to be !strcmp(), which is true when the ids
+       MATCH, the opposite of the rule above.  A component without
+       an ascii id counts as a mismatch.  delete [] is used because
+       fasciiId is assigned from new char[] wherever this file sets it.
+Rev: 4/25/96
+---------------------------------------------------------------*/
+{
+  bool idDiffers = comp->fasciiId == NULL || strcmp(comp->fasciiId, id) != 0;
+  if (!(comp->confid() < threshold) && !idDiffers)
+    return 0;
+  Component * newcomp = comp->copy();
+  delete [] newcomp->fasciiId;
+  newcomp->fasciiId = new char[strlen(id)+1];
+  strcpy(newcomp->fasciiId , id);
+  LearnedGroups[newcomp->charGroup].Append(newcomp);
+  return 1;
+}
+
+void learn(char * tifFile, char * asciiFile, bool synchwords)
+/*--------------------------------------------------------------
+Primary Purpose:  Learns from a TIFF image and its ascii translation.
+                  Reads the image into a new Page, sets its lines,
+                  components and words, then calls the Page overload.
+Arguments: tifFile    name of a tiff file to learn from
+           asciiFile  name of an ascii translation file
+           synchwords passed through to the Page overload
+Effects:  Assumes a one to one correspondence between each connected
+component on a line of the tif file and each character on the
+corresponding line of the ascii file.
+Notes:  On success the Page is deliberately kept alive: the Page
+overload appends the page's Component pointers to LearnedGroups.
+If the image cannot be read the Page is deleted, not leaked.
+Rev:  4/26/96
+---------------------------------------------------------------------*/
+{
+  Page * learnPage = new Page;
+  initCharBitsSet();
+  if(learnPage->readMap(tifFile) != VALID)
+    {
+      printf("Problem opening the learn image file (file doesn't exist?)\n");
+      delete learnPage;   // nothing was learned from it
+      return;
+    }
+  learnPage->setLines();
+  learnPage->extractComponents(MinHorizSeparation);
+  learnPage->extractWords();
+  learn(learnPage, asciiFile, synchwords);
+}
+
+
+void learn(Page * learnPage, char * asciiFile, bool synchWords)
+/*--------------------------------------------------------------
+Primary Purpose:  Learns from a Page and an ascii file.  Used from
+                  tcl user interface under File/Learn option.
+                  Walks the page's words and the text lines of the
+                  ascii file in step, pairs each component with one
+                  character (or one '<...>' string), learns it and
+                  updates MaxHWRatio, MinHWRatio and MinWidth.
+Arguments: learnPage  page whose words are learned
+           asciiFile  name of an ascii translation file
+           synchWords enables the word-length test (see NOTE below)
+Effects:  Assumes a one to one correspondence between each connected
+component on a line of the page and each character on the corresponding
+line of the ascii file.  Blank text lines are skipped.
+Fixes:  the ascii file is now closed before returning, and the loop
+stops when the text runs out instead of taking strlen() of a buffer
+that fgets() did not fill.
+
+Rev:  4/26/96
+---------------------------------------------------------------*/
+{
+  FILE * transFile = fopen(asciiFile,"r");
+  if(!transFile)
+    {
+      printf("Could not open the ascii learn file");
+      return;
+    }
+  if (LearnedGroups == NULL)
+    LearnedGroups = new Components[NumCharGroups];
+
+  int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize;
+  char buffer[maxCharsPerLine];
+  int i = -1;                  // index of the current page line
+  int buflength=0;             // length of the current text line
+  int c = 0;                   // read position inside buffer
+  bool instring= FALSE;
+  Components * components = NULL;
+  Component * item;
+  Word * word;
+  double width, height = 0.0;  // height: tallest component so far
+  int h;
+
+  Words * words = learnPage->words();
+  for (ListElement * ptr = words->first; ptr != NULL &&
+       (i < learnPage->numLines()) ; ptr = ptr->next)
+    {
+      word = (Word *) ptr->item;
+      // if new line get new  text line
+      if (word->characters[0] == '\n' || buflength == 0)
+        {
+          char * ok;
+          do {
+            ok =fgets(buffer, maxCharsPerLine, transFile);
+          } while (ok && blank(buffer)); // skip blank lines.
+          if (!ok) break;  // out of text: buffer has nothing new
+          buflength= strlen(buffer);
+          components = learnPage->line(++i);
+          c =0;
+          if (word->characters[0] == '\n') continue;
+        }
+
+      // skip over white space
+      while(c < buflength && whitespace(buffer[c])) c++;
+
+      // Make sure we have an equal # of components characters
+      // NOTE(review): the word is skipped when the counts ARE equal,
+      // which looks inverted relative to the comment above; left
+      // unchanged until the intent is confirmed.
+      if (synchWords &&
+          (word->charCount == lengthNextWord(buffer,c,buflength)))
+        {
+          // skip over this word
+          while(c < buflength && !(whitespace(buffer[c])))
+            c++;
+          continue; // move on to the next word
+        }
+
+      // pair each component of the word with its piece of the text
+      for (int ch = 0; ch < word->charCount; ch++)
+        {
+          while(c < buflength && whitespace(buffer[c])) c++;
+          item = word->character[ch];
+          if (c >= buflength) break;
+
+          // Link string translation to component.  Characters between
+          // brackets are for one component.
+          if(buffer[c] == '<' && !instring)
+            {
+              instring = TRUE;
+              int startString = c;
+              // find the closing '>'; the id keeps both brackets
+              while(c++ < buflength && buffer[c] != '>');
+              int endString = c+1;
+
+              int stringSize = endString - startString;
+              char newstring[stringSize+1];
+              strncpy(newstring, &buffer[startString],stringSize);
+              newstring[stringSize] = '\0';
+              // learn if id's don't match or below threshold
+              learn(item, newstring, ConfidenceThreshold);
+              c++;
+              instring = FALSE;
+            }
+          else
+            {
+              char newstring[2];
+              newstring[0] = buffer[c++];
+              newstring[1]= '\0';
+              learn(item, newstring, ConfidenceThreshold);
+            }
+
+          // NOTE(review): learn() above appends a copy whenever it
+          // learns; the page's own component is appended here too.
+          LearnedGroups[item->charGroup].Append(item);
+          //ptr->item = NULL; // Set to Null in page so it wont get
+          // clobbered on delete
+          h = item->lr().y() - item->ul().y();
+          if (h > height) height = h;
+          width = item->lr().x() - item->ul().x();
+          if (height/width > MaxHWRatio)
+            MaxHWRatio = height/width;
+
+          if (h/width < MinHWRatio)
+            MinHWRatio = h/width;
+
+          if (width < MinWidth)
+            MinWidth = (int) width;
+        }
+    }
+
+  // text left over after the last word is reported, not learned
+  if (fgets(buffer, maxCharsPerLine, transFile))
+      printf("Uh, oh. There are more characters to learn!\n");
+  /*  printf("Maximum height/width ratio = %f\n", MaxHWRatio); */
+  /*  printf("Minimum height/width ratio = %f\n", MinHWRatio); */
+
+  fclose(transFile);
+
+  // printLearnedGroups();
+}
+
+
+int writeLearnedGroups(char * filename)
+/*--------------------------------------------------------------
+Primary Purpose:  Write Learned groups out to file for reading
+                  in by readLearnedGroups
+Arguments: filename to write learned chars to
+Return Value: 1 if successful 0 if not
+Effects:  Writes contents of LearnedGroups array out to filename.
+The file starts with NumCharGroups, MaxHWRatio, MinWidth and
+MinHWRatio.  Then, for each group: the group number, the number of
+Components in the group, and for each Component its raw bytes, the
+size of its ascii id (terminating NUL included), the id itself and
+its numProperties property values.
+Constraints: LearnedGroups must be initialized and filled with learned
+chars before this function is invoked.
+Notes: every fwrite is now checked, and the file is closed on the
+failure paths as well (it used to stay open on an early return).
+The raw Component bytes include pointer members such as fasciiId,
+so a reader must replace those pointers after loading.
+Rev: 11/27 KM
+---------------------------------------------------------------*/
+{
+  assert(LearnedGroups != NULL);
+
+  FILE * outfile = fopen(filename, "w");
+  if (outfile == NULL)
+    {
+      printf("error openning %s \n", filename);
+      return 0;
+    }
+
+  // Write global information about learned characters
+  int ok = fwrite(&NumCharGroups, sizeof(NumCharGroups),1, outfile) == 1
+    && fwrite(&MaxHWRatio, sizeof(MaxHWRatio),1, outfile) == 1
+    && fwrite(&MinWidth, sizeof(MinWidth),1,outfile) == 1
+    && fwrite(&MinHWRatio, sizeof(MinHWRatio),1,outfile) == 1;
+
+  // every loop below stops at the first failed write
+  for(unsigned int i = 0; ok && i < NumCharGroups; i++)
+    {
+      unsigned int numChars = LearnedGroups[i].length;
+      // Write group number and number of characters
+      ok = fwrite(&i, sizeof(i), 1, outfile) == 1
+        && fwrite(&numChars, sizeof(numChars),1,outfile) == 1;
+
+      for(ListElement * ptr = LearnedGroups[i].first;
+          ok && ptr != NULL; ptr = ptr->next)
+        {
+          Component * comp = (Component *) ptr->item;
+          // the id is stored together with its terminating NUL
+          int stringSize = strlen(comp->fasciiId) +1;
+
+          ok = fwrite(comp, sizeof(Component),1,outfile) == 1
+            && fwrite(&stringSize, sizeof(stringSize),1,outfile) == 1
+            && fwrite(comp->fasciiId, stringSize,1,outfile) == 1;
+
+          for(int p = 0; ok && p < numProperties; p++)
+            {
+              ok = fwrite(&(comp->fproperty[p]), sizeof(Property),
+                          1, outfile) == 1;
+              if (!ok)
+                printf("Error writing properties of comp %c",
+                       comp->asciiId());
+            }
+        }
+    }
+
+  // Close on every path; a failed close also means failure, since
+  // buffered data may not have reached the file.
+  if (fclose(outfile) == EOF)
+    ok = 0;
+  return ok ? 1 : 0;
+}
+
+int readLearnedGroups(char * filename)
+/*--------------------------------------------------------------
+Primary Purpose:  Read Learned groups from file that has been
+                  created by writeLearnedGroups
+Arguments: filename to read learned chars from
+Return Value: 1 if successful 0 if not
+Effects:  Reads contents of filename into LearnedGroups array,
+allocating the array first if it is still NULL.  MaxHWRatio,
+MinWidth and MinHWRatio are overwritten with the values in the file.
+The group count and every group number in the file are assert()ed
+against NumCharGroups and the loop index.
+Notes: every fread is now checked, the file is closed on the failure
+paths too, and the stored id size must be positive before it is used
+as an allocation size.
+Rev: 11/27 KM
+---------------------------------------------------------------*/
+{
+  unsigned int numGroups = 0;       // # of groups stored in file.
+
+  initCharBitsSet();
+  if(LearnedGroups == NULL)
+    LearnedGroups = new Components[NumCharGroups];
+
+  FILE * infile = fopen(filename, "r");
+  if (infile == NULL)
+    {
+      printf("error openning %s \n", filename);
+      return 0;
+    }
+
+  // Read Globals
+  int ok = fread(&numGroups, sizeof(numGroups),1, infile) == 1;
+  if (ok) assert(numGroups == NumCharGroups);
+  ok = ok && fread(&MaxHWRatio, sizeof(MaxHWRatio),1, infile) == 1
+    && fread(&MinWidth, sizeof(MinWidth),1,infile) == 1
+    && fread(&MinHWRatio, sizeof(MinHWRatio),1,infile) == 1;
+
+  for(unsigned int i = 0; ok && i < NumCharGroups; i++)
+    {
+      unsigned int groupnum;
+      unsigned int numChars;
+      ok = fread(&groupnum, sizeof(groupnum), 1, infile) == 1
+        && fread(&numChars, sizeof(numChars),1,infile) == 1;
+      if (!ok) break;
+      assert(groupnum == i);
+
+      printf("\nReading group %d - %d characters\n",i,numChars);
+      for(unsigned int c = 0; ok && c< numChars; c++)
+        {
+          Component * comp = new Component;
+          // fread overwrites the whole object, pointers included, so
+          // remember the fproperty pointer the new Component came with
+          short int * savepropptr = comp->fproperty;
+          int stringSize = 0;
+
+          ok = fread(comp, sizeof(Component),1,infile) == 1
+            && fread(&stringSize, sizeof(stringSize),1,infile) == 1
+            && stringSize > 0;
+          // comp is not deleted on failure: its pointer members may
+          // now hold bytes from the file instead of real addresses
+          if (!ok) break;
+          comp->fproperty = savepropptr;
+          comp->fasciiId = new  char[stringSize];
+          ok = fread(comp->fasciiId, stringSize,1,infile) == 1;
+          comp->fasciiId[stringSize-1] = '\0';  // always terminated
+
+          for(int p = 0; ok && p < numProperties; p++)
+            {
+              ok = fread(&(comp->fproperty[p]), sizeof(Property),
+                         1, infile) == 1;
+              if (!ok) printf("Error reading properties");
+            }
+          if (ok) LearnedGroups[i].Append(comp);
+        }
+    }
+  if (fclose(infile) == EOF) ok = 0;
+  return ok ? 1 : 0;
+}
+
+// Manual test: calls learn() on hard-coded train.tif / train.txt paths.
+void testLearn()
+{ // NOTE(review): 2nd path says "fa95class", 1st "fa95/class" - typo?
+  learn("/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.tif",
+	   "/amd/nfs/cochise/home/ee/cs169/fa95class/cs169-ab/train.txt"); 
+}
+
+/*****************************************************************
+  FUNCTIONS BEYOND THIS POINT ARE FOR AVERAGING LEARNED CHARACTERS
+  AND ARE NOT CURRENTLY USED.
+*******************************************************************/
+
+void initLearnedChars()
+/*--------------------------------------------------------------
+Primary Purpose: Allocates a fresh 256-entry LearnedChars array and
+sets the asciiId of each entry to the entry's own index, cast to
+char.  Any array LearnedChars pointed to before is not freed here.
+Rev: KM 11/6/95
+---------------------------------------------------------------*/
+{
+  const int tableSize = 256;
+  LearnedChars = new Component[tableSize];
+
+  // entry n stands for the character whose code is n
+  for (int code = 0; code != tableSize; ++code)
+    LearnedChars[code].asciiId() = (char)code;
+}
+
+void oldlearn(char * tifFile, char * asciiFile)
+/*--------------------------------------------------------------
+Primary Purpose:  builds property vectors for LearnedChars array
+Arguments: tifFile name of a tiff file to learn from
+           asciiFile name of an ascii translation file
+Effects:  Assumes a one to one correspondence between each connected
+component on a line of the tif file and each character on the corresponding
+line of the ascii file.  Each character seen gets the average property
+vector and average numBits of its examples; its confidence is set
+to the number of examples.
+Fixes:  characters index the tables through unsigned char (a plain
+char can be negative and index before them); numBits is summed in the
+character's own entry, not in LearnedChars[line number]; both input
+files are checked and the ascii file is closed.
+Rev:  11/6/95
+---------------------------------------------------------------*/
+{
+  FILE * transFile = fopen(asciiFile,"r");
+  if (!transFile)
+    {
+      printf("Could not open the ascii learn file");
+      return;
+    }
+  Page * learnPage = new Page;
+  initCharBitsSet();
+  if (learnPage->readMap(tifFile) != VALID)
+    {
+      printf("Problem opening the learn image file (file doesn't exist?)\n");
+      fclose(transFile);
+      delete learnPage;
+      return;
+    }
+  learnPage->setLines();
+  learnPage->extractComponents(MinHorizSeparation);         /* why minlinesize? */
+  int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize;
+  char buffer[maxCharsPerLine];
+  int count[256]; // a count of how many of each char have been encountered
+  int prop[256][numProperties]; // Character property sums. Need ints so
+                                // that a sum does not exceed char boundaries
+  initLearnedChars();
+  for (int ch = 0; ch < 256; ch++)
+    {
+      count[ch] = 0;
+      for (int p  = 0; p < numProperties; p++)
+        prop[ch][p] = 0;
+    }
+  int i = 0;                          // index of the next page line
+  while (i < learnPage->numLines() &&
+         fgets(buffer, maxCharsPerLine, transFile))
+    {
+      int buflength = strlen(buffer);
+      Components * components = learnPage->line(i++);
+      int c = 0;
+      for (ListElement* ptr = components->first; ptr != NULL;
+           ptr = ptr->next)
+        {
+          Component * item = (Component *)(ptr->item);
+          // skip over white space
+          while(c < buflength && whitespace(buffer[c])) c++;
+          if (c >= buflength) break;
+          unsigned char id = (unsigned char) buffer[c++];
+          count[id]++;  // increment character count
+          for (int offset=0; offset < numProperties; offset++)
+            prop[id][offset] += (item->properties())[offset];
+          LearnedChars[id].numBits() += item->numBits();
+        }
+    }
+  fclose(transFile);
+  // now divide by count and put in Learned character
+  for(int j = 0; j < 256; j++)
+    {
+      if (count[j] <= 0) continue;   // character never seen
+      for (int offset=0; offset < numProperties; offset++)
+        prop[j][offset] /= count[j];
+      LearnedChars[j].numBits() /= count[j];
+      LearnedChars[j].confid() = count[j];
+      for (int offset=0; offset < numProperties; offset++)
+        (LearnedChars[j].properties())[offset] = prop[j][offset];
+      printVector(LearnedChars[j].properties(), numProperties);
+    }
+}
+
+// Manual test: learn from train.tif/train.txt, then (if the user
+// interface is enabled) run the Tcl command deleting IMAGE_TAG.
+void oldtestLearn()
+{
+  learn("train.tif", "train.txt");
+  if (ENABLE_USER_INTERFACE)
+    { docommand(".main_window.display.work_space delete IMAGE_TAG"); }
+}
+
+
+
+
+
+