reference/ocr-new/learn.cc
author viric@llimona
Thu, 18 May 2006 23:12:51 +0200
changeset 0 6b8091ca909a
permissions -rw-r--r--
Init from working directory of svn repository.

/*--------------------------------------------------------------
 learn.cc - character learning
 readLearnFiles() - sources the tcl file used to change learn files
 learn(char * tifFile, char * asciiFile, bool synchwords)
 Performs character learning by reading a tiff image and its ascii
 translation.  Characters are partitioned into character groups as
 described in system.cc.  See the learn functions for more details.

 writeLearnedGroups(char * filename) Writes learned characters to file
 readLearnedGroups(char * filename) Reads saved learned characters 
                                     from file. 
---------------------------------------------------------------*/
#include "tcl_interface.h"
#include "system.h"
#include "learn.h"
#include "Page.h"
#include "list.h"

void readLearnFiles()
/*--------------------------------------------------------------
Primary Purpose: Passes the command "source learnfile.tcl" to
docommand(), so that new learn files can be specified in
learnfile.tcl without recompiling.
NOTE(review): docommand() is defined elsewhere; presumably it hands
the string to the Tcl interpreter - confirm in tcl_interface.
---------------------------------------------------------------*/
{
  docommand("source learnfile.tcl");
}

bool whitespace(char c)
// Returns TRUE when c is a newline, a tab or a space; FALSE for any
// other character.
{
  switch (c)
    {
    case '\n':
    case '\t':
    case ' ':
      return TRUE;
    default:
      return FALSE;
    }
}

bool blank(char * string)
// Returns TRUE when string is NULL, empty, or made up solely of
// whitespace characters (as defined by whitespace()); FALSE as soon
// as any other character is found.
{
  if (string == NULL) return TRUE;

  for (char * cursor = string; *cursor != '\0'; cursor++)
    {
      if (!whitespace(*cursor))
        return FALSE;
    }
  return TRUE;
}


void printLearnedGroups()
{
  // Just print these guys out to make sure they are ok.
    for(unsigned int i = 0; i < NumCharGroups; i++)   
	for(ListElement * ptr = LearnedGroups[i].first; 
	    ptr != NULL; ptr = ptr->next)
	  { Component * item = (Component *) ptr->item;
	  printf("learned char %s, group %d\n", item->fasciiId, 
			  item->charGroup);
	  }

}

int lengthNextWord(char * buffer,int offset, int buflength)
/*--------------------------------------------------------------
Primary Purpose: Counts the characters of the word that starts at
                 buffer[offset].  The word ends at the first
                 whitespace character or at buflength.
                 Everything from a '<' up to and including the
                 matching '>' counts as a single character.
Arguments: buffer - text line to examine
           offset - index of the first character of the word
           buflength - number of valid characters in buffer
Return Value: number of characters in the word (0 if offset is
              already at whitespace or at/after buflength)
---------------------------------------------------------------*/
{
  // Must start at zero: the total is built up purely by increments.
  int count = 0;

  for(int c=offset; c < buflength && !(whitespace(buffer[c])); c++)
    {
      if(buffer[c] == '<')
        {
          // Advance to the closing '>' (or to the end of the buffer if
          // it is missing).  The bound is tested before the element is
          // read so that buffer[buflength] is never examined.
          while((c < buflength) && (buffer[c] != '>'))
            c++;
        }
      count++;
    }
  return count;
}


int learn(Component * comp, char * id, Confidence threshold)
/*--------------------------------------------------------------
Primary Purpose: Make a copy of this component and add it to 
                 LearnedGroups. id is ascii identification.
                 The component is learned only if its confidence
                 is below threshold or if its current ascii id
                 does not match id.
Arguments: comp - component to learn
           id - ascii identification
           threshold - confidence threshold for learning
Return Value: 1 if component was learned, 0 otherwise
Effects: When learned, a copy of comp whose fasciiId is a fresh copy
         of id is appended to LearnedGroups[charGroup].  comp itself
         is not modified.
Rev: 4/25/96
---------------------------------------------------------------*/
{
  // strcmp() returns 0 for EQUAL strings, so "ids don't match" is
  // strcmp(...) != 0.  A component with no id yet is treated as a
  // mismatch rather than being passed to strcmp().
  if (comp->confid() < threshold
      || comp->fasciiId == NULL
      || strcmp(comp->fasciiId, id) != 0)
    {
      Component * newcomp = comp->copy();

      // fasciiId is allocated with new char[] in this file, so it has
      // to be released with the array form of delete.
      delete [] newcomp->fasciiId;
      newcomp->fasciiId = new char[strlen(id)+1];
      strcpy(newcomp->fasciiId , id);

      LearnedGroups[newcomp->charGroup].Append(newcomp);
      return 1;
    }
  return 0;
}

void learn(char * tifFile, char * asciiFile, bool synchwords)
/*--------------------------------------------------------------
Primary Purpose:  Learns from TIFF and ascii file.  Groups learned
                  characters by baseline into LearnedGroups and
                   sets properties.
Arguments: tiffFile name of a tiff file to learn from
           asciiFile name of an ascii translation file
Effects:  Assumes a one to one correspondence between each connected
component on a line of the tif file and each character on the corresponding
line of the ascii file. 

Rev:  4/26/96
---------------------------------------------------------------------*/
{

  Page * learnPage = new Page;
  initCharBitsSet();
  if(learnPage->readMap(tifFile) != VALID)
    {
      printf("Problem opening the learn image file (file doesn't exist?)\n");
      return;
    }
  learnPage->setLines();
  learnPage->extractComponents(MinHorizSeparation);
  learnPage->extractWords();
  learn(learnPage, asciiFile, synchwords);
  
  //  delete learnPage; 

}


void learn(Page * learnPage, char * asciiFile, bool synchWords)
/*--------------------------------------------------------------
Primary Purpose:  Learns from a Page and an ascii file.  Used from
                  tcl user interface under File/Learn option.
                  Groups learned characters into LearnedGroups and
                  updates MaxHWRatio, MinHWRatio and MinWidth.
Arguments: learnPage  page whose words have already been extracted
           asciiFile  name of an ascii translation file
           synchWords when TRUE, a page word is skipped (together
                      with the corresponding text word) if the two
                      do not hold the same number of characters
Effects:  Assumes a one to one correspondence between each connected
component on a line of the page and each character on the corresponding
line of the ascii file.  Blank lines in the ascii file are skipped.
Text between '<' and '>' (brackets included) is the id of a single
component.  Learning stops when the ascii file runs out of text.
LearnedGroups is allocated here if it is still NULL.

NOTE(review): each component is handed to learn(Component *, ...),
which may append a COPY to LearnedGroups, and is then appended itself
as well.  That double registration is kept as found; confirm it is
intended.

Rev:  4/26/96
---------------------------------------------------------------*/
{
  FILE * transFile = fopen(asciiFile,"r");
  if(!transFile)
    {
      printf("Could not open the ascii learn file");
      return;
    }
  if (LearnedGroups == NULL)
    LearnedGroups = new Components[NumCharGroups]; 

  int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize; 
  // fgets() needs room for at least one character plus the terminator,
  // and an array size must be positive.
  if (maxCharsPerLine < 2) maxCharsPerLine = 2;
  char buffer[maxCharsPerLine];
  buffer[0] = '\0';

  int i = -1;           // index of the current page line
  int buflength = 0;    // number of characters in buffer
  int c = 0;            // read position inside buffer
  double width, height = 0.0;
  int h;

  Words * words = learnPage->words();

  for (ListElement * ptr = words->first; ptr != NULL &&
       (i < learnPage->numLines()) ; ptr = ptr->next)
    {
      Word * word = (Word *) ptr->item;

      // if new line get new text line
      if (word->characters[0] == '\n' || buflength == 0)
        {
          char * ok;
          do {
            ok = fgets(buffer, maxCharsPerLine, transFile);
          } while (ok && blank(buffer)); // skip blank lines.

          // On end of file fgets() leaves the buffer untouched, so
          // carrying on would re-learn the previous text line.
          if (!ok) break;

          buflength = strlen(buffer);
          ++i;
          c = 0;
          if (word->characters[0] == '\n') continue;
        }

      // skip over white space (bound tested before the element is read)
      while(c < buflength && whitespace(buffer[c])) c++;

      // Make sure we have an equal # of components and characters;
      // if not, this word cannot be matched up reliably.
      if (synchWords && 
          (word->charCount != lengthNextWord(buffer,c,buflength)))
        {
          // skip over this word
          while(c < buflength && !(whitespace(buffer[c])))
            c++;
          continue; // move on to the next word
        }

      for (int ch = 0; ch < word->charCount; ch++) 
        { 
          while(c < buflength && whitespace(buffer[c])) c++;
          if (c >= buflength) break;
          Component * item = word->character[ch]; 

          // Link string translation to component.  Characters between
          // brackets are for one component.
          if(buffer[c] == '<')
            {
              int startString = c;
              while(c < buflength && buffer[c] != '>') c++;
              // One past the '>' when it was found; otherwise the rest
              // of the line is taken as the id.
              int endString = (c < buflength) ? c + 1 : buflength;

              int stringSize = endString - startString;
              char newstring[stringSize+1];
              strncpy(newstring, &buffer[startString],stringSize);
              newstring[stringSize] = '\0';
              // learn if id's don't match or below threshold
              learn(item, newstring, ConfidenceThreshold);
              c = endString;
            }
          else
            {
              char newstring[2];
              newstring[0] = buffer[c++];
              newstring[1]= '\0';
              learn(item, newstring, ConfidenceThreshold);
            }

          LearnedGroups[item->charGroup].Append(item);
          //ptr->item = NULL; // Set to Null in page so it wont get
                              // clobbered on delete

          // Track the extreme height/width figures seen so far.  Note
          // that MaxHWRatio uses the tallest height seen so far while
          // MinHWRatio uses this component's own height.
          h = item->lr().y() - item->ul().y();
          if (h > height) height = h;
          width = item->lr().x() - item->ul().x();
          if (height/width > MaxHWRatio)
            MaxHWRatio = height/width;

          if (h/width < MinHWRatio)
            MinHWRatio = h/width;

          if (width < MinWidth)
            MinWidth = (int) width;
        }
    }

  if (fgets(buffer, maxCharsPerLine, transFile))
      printf("Uh, oh. There are more characters to learn!\n");
  /*  printf("Maximum height/width ratio = %f\n", MaxHWRatio); */
  /*  printf("Minimum height/width ratio = %f\n", MinHWRatio); */

  fclose(transFile);

  // printLearnedGroups();
}


int writeLearnedGroups(char * filename)
/*--------------------------------------------------------------
Primary Purpose:  Write Learned groups out to file for reading
                  in by readLearnedGroups
Arguments: filename to write learned chars to 
Return Value: 1 if successful 0 if not
Effects:  Writes contents of LearnedGroups array out to filename
LearnedGroups is an array of lists of components that is decleared
in system.cc and initialized by the learn() function.
For each group writes the number of Components the group contains
followed by the group data.
Other learned values such as MinWidth MinHWRatio etc are written to
the file as well.
Constraints: LearnedGroups must be initialized and filled with learned
chars before this function is invoked.
Rev: 11/27 KM
---------------------------------------------------------------*/
{
  int status;
  FILE * outfile;
  assert(LearnedGroups != NULL);
  
  outfile = fopen(filename, "w");
  if (outfile == NULL)
      {
	printf("error openning %s \n", filename);
	return 0;
      }

  // Write global information about learned characters

  fwrite(&NumCharGroups, sizeof(NumCharGroups),1, outfile);
  fwrite(&MaxHWRatio, sizeof(MaxHWRatio),1, outfile);
  fwrite(&MinWidth, sizeof(MinWidth),1,outfile);
  fwrite(&MinHWRatio, sizeof(MinHWRatio),1,outfile);
  for(unsigned int i = 0; i < NumCharGroups; i++)   
      {
	unsigned int numChars = LearnedGroups[i].length;
	// Write group number and number of characters
	fwrite(&i, sizeof(i), 1, outfile);
	status = fwrite(&numChars, sizeof(numChars),1,outfile);
	if (status == 0) return 0;
	for(ListElement * ptr = LearnedGroups[i].first; 
	    ptr != NULL; ptr = ptr->next)
	    {
	      
	      Component * comp = (Component *) ptr->item;

	      status = fwrite(comp, sizeof(Component),1,outfile);	     
//	      printf("\tChar:%c status:%d \n", comp->asciiId(), status);
	      int stringSize = strlen(comp->fasciiId) +1;
	      status = fwrite(&stringSize, sizeof(stringSize),1,outfile);
              status = fwrite(comp->fasciiId, stringSize,1,outfile);
	      for(int p = 0; p < numProperties; p++)
		  {
		    status = fwrite(&(comp->fproperty[p]), 
				  sizeof(Property),
				  1, outfile);
		    if (status == 0) 
			{
			  printf("Error writing properties of comp %c",
				 comp->asciiId());
			  return 0;
			}
		  }
	    }
      }
  status = fclose(outfile);
  if (status == -1) return 0;
  else return 1;

}

int readLearnedGroups(char * filename)
/*--------------------------------------------------------------
Primary Purpose:  Read Learned groups from file that has been
                  created by writeLearnedGroups
Arguments: filename to read learned chars from 
Return Value: 1 if successful 0 if not
Effects:  Reads contents of filename into LearnedGroups array
LearnedGroups is an array of lists of components that is decleared
in system.cc and initialized here or in the learn() function.
Constraints: LearnedGroups must not yet be initialized
Rev: 11/27 KM
---------------------------------------------------------------*/
{
  int status;
  FILE * infile;
  unsigned int numGroups;           // # of groups stored in file.

  initCharBitsSet();
  if(LearnedGroups == NULL)
    LearnedGroups = new Components[NumCharGroups];


  infile = fopen(filename, "r");
  if (infile == NULL)
      {
	printf("error openning %s \n", filename);
	return 0;
      }

  // Read Globals
  fread(&numGroups, sizeof(numGroups),1, infile);
  assert(numGroups == NumCharGroups);
  fread(&MaxHWRatio, sizeof(MaxHWRatio),1, infile);
  fread(&MinWidth, sizeof(MinWidth),1,infile);
  fread(&MinHWRatio, sizeof(MinHWRatio),1,infile);
  for(unsigned int i = 0; i < NumCharGroups; i++)   
      {
	unsigned int groupnum;
	unsigned int numChars;
	fread(&groupnum, sizeof(groupnum), 1, infile);
	assert(groupnum == i);
	fread(&numChars, sizeof(numChars),1,infile);

	printf("\nReading group %d - %d characters\n",i,numChars);
	for(unsigned int c = 0; c< numChars; c++)
	    {
	      Component * comp = new Component;
	      short int * savepropptr = comp->fproperty;

	      status = fread(comp, sizeof(Component),1,infile);	     
	      int stringSize;
	      status = fread(&stringSize, sizeof(stringSize),1,infile);
              comp->fasciiId = new  char[stringSize];
              status = fread(comp->fasciiId, stringSize,1,infile);

	      comp->fproperty = savepropptr;

	      for(int p = 0; p < numProperties; p++)
		  {
		    status = fread(&(comp->fproperty[p]), sizeof(Property),
			      1, infile);
		    if (status == 0) 
		      {
			printf("Error reading properties");
			return 0;
		      }
		  }
//	      printf("\tChar:%c status:%d ", comp->asciiId(), status);
//	      printVector(comp->properties(), numProperties);
	      LearnedGroups[i].Append(comp);

	    }

      }
  status = fclose(infile);
  if (status == -1) return 0;
  else return 1;
}

void testLearn()
// Test driver: learns from a fixed training image and its translation
// text.  Both files live in the same directory; the text path
// previously read "fa95class" where the image path has "fa95/class".
// NOTE(review): learn() is called with two arguments, so this relies
// on a default for synchwords declared elsewhere (presumably learn.h).
{

  learn("/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.tif",
        "/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.txt"); 
}

/*****************************************************************
  FUNCTIONS BEYOND THIS POINT ARE FOR AVERAGING LEARNED CHARACTERS
  AND ARE NOT CURRENTLY USED.
*******************************************************************/

void initLearnedChars()
/*--------------------------------------------------------------
Primary Purpose: Allocates the 256-entry LearnedChars array and
gives every entry an asciiId equal to its own index.
Rev: KM 11/6/95
---------------------------------------------------------------*/
{
  LearnedChars = new Component[256];

  int code = 0;
  while (code < 256)
    {
      LearnedChars[code].asciiId() = (char)code;
      code++;
    }
}

void oldlearn(char * tifFile, char * asciiFile)
/*--------------------------------------------------------------
Primary Purpose:  builds property vectors for LearnedChars array
Arguments: tiffFile name of a tiff file to learn from
           asciiFile name of an ascii translation file
Effects:  Assumes a one to one correspondence between each connected
component on a line of the tif file and each character on the corresponding
line of the ascii file.  For learned characters confidence is set
to the number of examples.

Rev:  11/6/95
---------------------------------------------------------------*/
{
  FILE * transFile;
  transFile = fopen(asciiFile,"r");
  Page * learnPage = new Page;
  initCharBitsSet();
  learnPage->readMap(tifFile);
  learnPage->setLines();
  learnPage->extractComponents(MinHorizSeparation);         /* why minlinesize? */
  int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize; 
  char buffer[maxCharsPerLine];
  int i = 0;
  int buflength;
  bool emptyLine;
  Components * components;
  Component * item;
  int count[256]; // a count of how many of each char have been encountered
  int prop[256][numProperties]; // Character property sums. Need ints so that 
                                 // property sum does
                                // not exceed char boundaries
  char id;

  initLearnedChars();
  for (i = 0; i < 256; i++)
    {
      count[i] = 0;
      for (int p  = 0; p < numProperties; p++)
	prop[i][p] = 0;
    }
  i=0; 

  int offset;
  while (i < learnPage->numLines() &&  
	 fgets(buffer, maxCharsPerLine, transFile))
      {
	buflength = strlen(buffer);
	components = learnPage->line(i++);
	int c = 0;
	for (ListElement* ptr = components->first; ptr != NULL; 
	     ptr = ptr->next) 
	  {
	  item = (Component *)(ptr->item);
	  // skip over white space
	  while(whitespace(buffer[c]) && c < buflength)c++;
	  if (c >= buflength)break;
	  id =  buffer[c++];
	  count[id]++;  // increment character count
	  for (offset=0; offset < numProperties; offset++)
	    prop[id][offset] += (item->properties())[offset];
	  LearnedChars[i].numBits() += item->numBits();
	}
      }
  // now divide by count and put in Learned character
  for(int j = 0; j < 256; j++)
      {
	if(count[j] > 0)
	    {
	      for (int offset=0; offset < numProperties; offset++)
		prop[j][offset] /= count[j];
	      LearnedChars[j].numBits() /= count[j]; 
	      LearnedChars[j].confid() = count[j];
	      for (offset=0; offset < numProperties; offset++)
		(LearnedChars[j].properties())[offset] = prop[j][offset];
//	      printf("%d occurrences of %c\n", count[j], (char)j);
	      printVector(LearnedChars[j].properties(), numProperties);
			   
	    }

      }
}

void oldtestLearn()
// Test driver: learns from train.tif / train.txt in the current
// directory, then, when ENABLE_USER_INTERFACE is set, issues the
// command that deletes IMAGE_TAG from the work space.
{
  learn("train.tif", "train.txt");

  if (ENABLE_USER_INTERFACE)
    {
      docommand(".main_window.display.work_space delete IMAGE_TAG");
    }
}