reference/ocr-new/learn.cc
changeset 0 6b8091ca909a
equal deleted inserted replaced
-1:000000000000 0:6b8091ca909a
       
     1 /*--------------------------------------------------------------
       
     2  Learn.cc - 
       
     3  readlearnfiles - sources the tcl file to change learn files
       
     4  learn(char * tifffile, char * asciifile)
       
     5  Performs character learning by reading tiff and  ascii translation
       
     6  Characters are partitioned into character groups as described 
       
     7  in system.cc.  See learn function for more details
       
     8 
       
     9  writeLearnedGroups(char * filename) Writes learned character to file
       
    10  readLearnedGroups(char * filename) Reads saved learned characters 
       
    11                                     from file. 
       
    12 ---------------------------------------------------------------*/
       
    13 #include "tcl_interface.h"
       
    14 #include "system.h"
       
    15 #include "learn.h"
       
    16 #include "Page.h"
       
    17 #include "list.h"
       
    18 
       
    19 void readLearnFiles()
       
    20 /*--------------------------------------------------------------
       
    21 Primary Purpose: Sources learnfile.tcl where new learn files can be 
       
    22 specified without recompiling   **/
       
    23 {
       
    24   docommand("source learnfile.tcl");
       
    25 }
       
    26 
       
    27 bool whitespace(char c)
       
    28 // Returns TRUE if c is a whitespace charater (called by learn.cc)
       
    29 {
       
    30   if ( c == '\n' || c == '\t' || c == ' ') return TRUE;
       
    31   return FALSE;
       
    32 
       
    33 }
       
    34 
       
    35 bool blank(char * string)
       
    36 {
       
    37   if (string == NULL) return TRUE;
       
    38   int len = strlen(string);
       
    39   for(int c=0; c< len; c++)
       
    40     {
       
    41     if (!(whitespace(string[c])))
       
    42 	return FALSE;
       
    43     }
       
    44   return TRUE;
       
    45 }
       
    46 
       
    47 
       
    48 void printLearnedGroups()
       
    49 {
       
    50   // Just print these guys out to make sure they are ok.
       
    51     for(unsigned int i = 0; i < NumCharGroups; i++)   
       
    52 	for(ListElement * ptr = LearnedGroups[i].first; 
       
    53 	    ptr != NULL; ptr = ptr->next)
       
    54 	  { Component * item = (Component *) ptr->item;
       
    55 	  printf("learned char %s, group %d\n", item->fasciiId, 
       
    56 			  item->charGroup);
       
    57 	  }
       
    58 
       
    59 }
       
    60 
       
    61 int lengthNextWord(char * buffer,int offset, int buflength)
       
    62 {
       
    63   // counts things in '< >' as one character
       
    64   int count;
       
    65 
       
    66   for(int c=offset; c < buflength && !(whitespace(buffer[c])); c++)
       
    67     {
       
    68 	 if(buffer[c] == '<')
       
    69 	   {
       
    70 	     while((buffer[c] != '>') && (c < buflength))
       
    71 	       c++;
       
    72 	     count++;
       
    73 	   }
       
    74 	 else
       
    75 	   count++;
       
    76     }
       
    77   return count;
       
    78 }
       
    79 
       
    80 
       
    81 int learn(Component * comp, char * id, Confidence threshold)
       
    82 /*--------------------------------------------------------------
       
    83 Primary Purpose: Make a copy of this component and add it to 
       
    84                  LearnedGroups. id is ascii identification.
       
    85 		 Component will only be learned if confidence
       
    86 		 is below threshold or if id and asciiid dont match
       
    87 Arguments: comp - component to learn
       
    88                   id - ascii identification
       
    89 		  threshold - confidence threshold for learning
       
    90 Return Value: 1 if component was learned, 0 otherwise
       
    91 Rev: 4/25/96
       
    92 ---------------------------------------------------------------*/
       
    93 {
       
    94   Component * newcomp;
       
    95 
       
    96  if (comp->confid() < threshold || !(strcmp(comp->fasciiId, id)))
       
    97     {
       
    98       newcomp = comp->copy();
       
    99       delete newcomp->fasciiId;
       
   100       newcomp->fasciiId = new char[strlen(id)+1];
       
   101       strcpy(newcomp->fasciiId , id);
       
   102       
       
   103       LearnedGroups[newcomp->charGroup].Append(newcomp);
       
   104       return 1;
       
   105     }
       
   106   return 0;
       
   107 }
       
   108 
       
   109 void learn(char * tifFile, char * asciiFile, bool synchwords)
       
   110 /*--------------------------------------------------------------
       
   111 Primary Purpose:  Learns from TIFF and ascii file.  Groups learned
       
   112                   characters by baseline into LearnedGroups and
       
   113                    sets properties.
       
   114 Arguments: tiffFile name of a tiff file to learn from
       
   115            asciiFile name of an ascii translation file
       
   116 Effects:  Assumes a one to one correspondence between each connected
       
   117 component on a line of the tif file and each character on the corresponding
       
   118 line of the ascii file. 
       
   119 
       
   120 Rev:  4/26/96
       
   121 ---------------------------------------------------------------------*/
       
   122 {
       
   123 
       
   124   Page * learnPage = new Page;
       
   125   initCharBitsSet();
       
   126   if(learnPage->readMap(tifFile) != VALID)
       
   127     {
       
   128       printf("Problem opening the learn image file (file doesn't exist?)\n");
       
   129       return;
       
   130     }
       
   131   learnPage->setLines();
       
   132   learnPage->extractComponents(MinHorizSeparation);
       
   133   learnPage->extractWords();
       
   134   learn(learnPage, asciiFile, synchwords);
       
   135   
       
   136   //  delete learnPage; 
       
   137 
       
   138 }
       
   139 
       
   140 
       
   141 void learn(Page * learnPage, char * asciiFile, bool synchWords)
       
   142 /*--------------------------------------------------------------
       
   143 Primary Purpose:  Learns from a Page and an ascii file.  Used from
       
   144                   tcl user interface under File/Learn opation
       
   145 		  Groups learned
       
   146                   characters by baseline into LearnedGroups and
       
   147                    sets properties.
       
   148 Arguments: tiffFile name of a tiff file to learn from
       
   149            asciiFile name of an ascii translation file
       
   150 Effects:  Assumes a one to one correspondence between each connected
       
   151 component on a line of the tif file and each character on the corresponding
       
   152 line of the ascii file. 
       
   153 
       
   154 Rev:  4/26/96
       
   155 ---------------------------------------------------------------*/
       
   156 {
       
   157   FILE * transFile;
       
   158 
       
   159   transFile = fopen(asciiFile,"r");
       
   160   if(!transFile)
       
   161     {
       
   162       printf("Could not open the ascii learn file");
       
   163       return;
       
   164     }
       
   165   if (LearnedGroups == NULL)
       
   166     LearnedGroups = new Components[NumCharGroups]; 
       
   167 
       
   168   int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize; 
       
   169   char buffer[maxCharsPerLine];
       
   170   int i = -1;
       
   171   int buflength=0;
       
   172   bool instring= FALSE;
       
   173   bool emptyLine;
       
   174   Components * components = NULL;
       
   175   Words * words;
       
   176   Component * item;
       
   177   
       
   178   double width, height = 0.0;
       
   179   int h;
       
   180 
       
   181 
       
   182       words = learnPage->words();
       
   183       int c = 0;
       
   184       Word * word;
       
   185 
       
   186       for (ListElement * ptr = words->first; ptr != NULL &&
       
   187 	  (i < learnPage->numLines()) ; ptr = ptr->next)
       
   188 	{	
       
   189 
       
   190 	  word = (Word *) ptr->item;
       
   191 	  // if new line get new  text line
       
   192 	  if (word->characters[0] == '\n' || buflength == 0)
       
   193 	    {
       
   194 	      char * ok;
       
   195 	      do {
       
   196 		ok =fgets(buffer, maxCharsPerLine, transFile);
       
   197 	      } while (ok && blank(buffer)); // skip blank lines.
       
   198 	      buflength= strlen(buffer);
       
   199               components = learnPage->line(++i);
       
   200 	      c =0;
       
   201 	      if (word->characters[0] == '\n') continue;
       
   202 	    }
       
   203 	  
       
   204 
       
   205 	  // skip over white space
       
   206 	  while(whitespace(buffer[c]) && c < buflength)c++;
       
   207 
       
   208 	  // Make sure we have an equal # of components characters
       
   209 	  if (synchWords && 
       
   210 	      (word->charCount == lengthNextWord(buffer,c,buflength)))
       
   211 	    {
       
   212 	      // skip over this word
       
   213 	      while(!(whitespace(buffer[c])) && c < buflength)
       
   214 		c++;
       
   215 	      continue; // move on to the next word
       
   216 	    }
       
   217 
       
   218 	  for (int ch = 0; ch < word->charCount; ch++) 
       
   219 	    { 
       
   220 	      while(whitespace(buffer[c]) && c < buflength)c++;
       
   221 	      item = word->character[ch]; 
       
   222 	      if (c >= buflength) break;
       
   223 
       
   224 		 // Link string translation to component.  Characters between
       
   225 		 // brackets are for one component.
       
   226 		   if(buffer[c] == '<' && !instring)
       
   227 		     {
       
   228 		       instring = TRUE;
       
   229 		       int startString = c;
       
   230 		       while(c++ < buflength && buffer[c] != '>');
       
   231 		       int endString = c+1;
       
   232 		       
       
   233 		       int stringSize = endString - startString;
       
   234 		       char newstring[stringSize+1];
       
   235 		       strncpy(newstring, &buffer[startString],stringSize);
       
   236 		       newstring[stringSize] = '\0';
       
   237 		       // learn if id's don't match or below threshold
       
   238 		       learn(item, newstring, ConfidenceThreshold);
       
   239 		       c++;
       
   240 		       instring = FALSE;
       
   241 		     }
       
   242 		   else
       
   243 		     {
       
   244                        char newstring[2];
       
   245       		       newstring[0] = buffer[c++];
       
   246 		       newstring[1]= '\0';
       
   247 		       learn(item, newstring, ConfidenceThreshold);
       
   248 		     }
       
   249 
       
   250 		   LearnedGroups[item->charGroup].Append(item);
       
   251 		   //ptr->item = NULL; // Set to Null in page so it wont get
       
   252 	                    // clobbered on delete
       
   253 		   h = item->lr().y() - item->ul().y();
       
   254 		   if (h > height) height = h;
       
   255 		   width = item->lr().x() - item->ul().x();
       
   256 		   if (height/width > MaxHWRatio)
       
   257 		     MaxHWRatio = height/width;
       
   258 
       
   259 		   if (h/width < MinHWRatio)
       
   260 		     MinHWRatio = h/width;
       
   261 
       
   262 		   if (width < MinWidth)
       
   263 		     MinWidth = (int) width;
       
   264 	  
       
   265 
       
   266 		 }
       
   267 	}
       
   268 
       
   269 
       
   270 
       
   271 
       
   272   if (fgets(buffer, maxCharsPerLine, transFile))
       
   273       printf("Uh, oh. There are more characters to learn!\n");
       
   274   /*  printf("Maximum height/width ratio = %f\n", MaxHWRatio); */
       
   275   /*  printf("Minimum height/width ratio = %f\n", MinHWRatio); */
       
   276 
       
   277 
       
   278   // printLearnedGroups();
       
   279 
       
   280 }
       
   281 
       
   282 
       
   283 int writeLearnedGroups(char * filename)
       
   284 /*--------------------------------------------------------------
       
   285 Primary Purpose:  Write Learned groups out to file for reading
       
   286                   in by readLearnedGroups
       
   287 Arguments: filename to write learned chars to 
       
   288 Return Value: 1 if successful 0 if not
       
   289 Effects:  Writes contents of LearnedGroups array out to filename
       
   290 LearnedGroups is an array of lists of components that is decleared
       
   291 in system.cc and initialized by the learn() function.
       
   292 For each group writes the number of Components the group contains
       
   293 followed by the group data.
       
   294 Other learned values such as MinWidth MinHWRatio etc are written to
       
   295 the file as well.
       
   296 Constraints: LearnedGroups must be initialized and filled with learned
       
   297 chars before this function is invoked.
       
   298 Rev: 11/27 KM
       
   299 ---------------------------------------------------------------*/
       
   300 {
       
   301   int status;
       
   302   FILE * outfile;
       
   303   assert(LearnedGroups != NULL);
       
   304   
       
   305   outfile = fopen(filename, "w");
       
   306   if (outfile == NULL)
       
   307       {
       
   308 	printf("error openning %s \n", filename);
       
   309 	return 0;
       
   310       }
       
   311 
       
   312   // Write global information about learned characters
       
   313 
       
   314   fwrite(&NumCharGroups, sizeof(NumCharGroups),1, outfile);
       
   315   fwrite(&MaxHWRatio, sizeof(MaxHWRatio),1, outfile);
       
   316   fwrite(&MinWidth, sizeof(MinWidth),1,outfile);
       
   317   fwrite(&MinHWRatio, sizeof(MinHWRatio),1,outfile);
       
   318   for(unsigned int i = 0; i < NumCharGroups; i++)   
       
   319       {
       
   320 	unsigned int numChars = LearnedGroups[i].length;
       
   321 	// Write group number and number of characters
       
   322 	fwrite(&i, sizeof(i), 1, outfile);
       
   323 	status = fwrite(&numChars, sizeof(numChars),1,outfile);
       
   324 	if (status == 0) return 0;
       
   325 	for(ListElement * ptr = LearnedGroups[i].first; 
       
   326 	    ptr != NULL; ptr = ptr->next)
       
   327 	    {
       
   328 	      
       
   329 	      Component * comp = (Component *) ptr->item;
       
   330 
       
   331 	      status = fwrite(comp, sizeof(Component),1,outfile);	     
       
   332 //	      printf("\tChar:%c status:%d \n", comp->asciiId(), status);
       
   333 	      int stringSize = strlen(comp->fasciiId) +1;
       
   334 	      status = fwrite(&stringSize, sizeof(stringSize),1,outfile);
       
   335               status = fwrite(comp->fasciiId, stringSize,1,outfile);
       
   336 	      for(int p = 0; p < numProperties; p++)
       
   337 		  {
       
   338 		    status = fwrite(&(comp->fproperty[p]), 
       
   339 				  sizeof(Property),
       
   340 				  1, outfile);
       
   341 		    if (status == 0) 
       
   342 			{
       
   343 			  printf("Error writing properties of comp %c",
       
   344 				 comp->asciiId());
       
   345 			  return 0;
       
   346 			}
       
   347 		  }
       
   348 	    }
       
   349       }
       
   350   status = fclose(outfile);
       
   351   if (status == -1) return 0;
       
   352   else return 1;
       
   353 
       
   354 }
       
   355 
       
   356 int readLearnedGroups(char * filename)
       
   357 /*--------------------------------------------------------------
       
   358 Primary Purpose:  Read Learned groups from file that has been
       
   359                   created by writeLearnedGroups
       
   360 Arguments: filename to read learned chars from 
       
   361 Return Value: 1 if successful 0 if not
       
   362 Effects:  Reads contents of filename into LearnedGroups array
       
   363 LearnedGroups is an array of lists of components that is decleared
       
   364 in system.cc and initialized here or in the learn() function.
       
   365 Constraints: LearnedGroups must not yet be initialized
       
   366 Rev: 11/27 KM
       
   367 ---------------------------------------------------------------*/
       
   368 {
       
   369   int status;
       
   370   FILE * infile;
       
   371   unsigned int numGroups;           // # of groups stored in file.
       
   372 
       
   373   initCharBitsSet();
       
   374   if(LearnedGroups == NULL)
       
   375     LearnedGroups = new Components[NumCharGroups];
       
   376 
       
   377 
       
   378   infile = fopen(filename, "r");
       
   379   if (infile == NULL)
       
   380       {
       
   381 	printf("error openning %s \n", filename);
       
   382 	return 0;
       
   383       }
       
   384 
       
   385   // Read Globals
       
   386   fread(&numGroups, sizeof(numGroups),1, infile);
       
   387   assert(numGroups == NumCharGroups);
       
   388   fread(&MaxHWRatio, sizeof(MaxHWRatio),1, infile);
       
   389   fread(&MinWidth, sizeof(MinWidth),1,infile);
       
   390   fread(&MinHWRatio, sizeof(MinHWRatio),1,infile);
       
   391   for(unsigned int i = 0; i < NumCharGroups; i++)   
       
   392       {
       
   393 	unsigned int groupnum;
       
   394 	unsigned int numChars;
       
   395 	fread(&groupnum, sizeof(groupnum), 1, infile);
       
   396 	assert(groupnum == i);
       
   397 	fread(&numChars, sizeof(numChars),1,infile);
       
   398 
       
   399 	printf("\nReading group %d - %d characters\n",i,numChars);
       
   400 	for(unsigned int c = 0; c< numChars; c++)
       
   401 	    {
       
   402 	      Component * comp = new Component;
       
   403 	      short int * savepropptr = comp->fproperty;
       
   404 
       
   405 	      status = fread(comp, sizeof(Component),1,infile);	     
       
   406 	      int stringSize;
       
   407 	      status = fread(&stringSize, sizeof(stringSize),1,infile);
       
   408               comp->fasciiId = new  char[stringSize];
       
   409               status = fread(comp->fasciiId, stringSize,1,infile);
       
   410 
       
   411 	      comp->fproperty = savepropptr;
       
   412 
       
   413 	      for(int p = 0; p < numProperties; p++)
       
   414 		  {
       
   415 		    status = fread(&(comp->fproperty[p]), sizeof(Property),
       
   416 			      1, infile);
       
   417 		    if (status == 0) 
       
   418 		      {
       
   419 			printf("Error reading properties");
       
   420 			return 0;
       
   421 		      }
       
   422 		  }
       
   423 //	      printf("\tChar:%c status:%d ", comp->asciiId(), status);
       
   424 //	      printVector(comp->properties(), numProperties);
       
   425 	      LearnedGroups[i].Append(comp);
       
   426 
       
   427 	    }
       
   428 
       
   429       }
       
   430   status = fclose(infile);
       
   431   if (status == -1) return 0;
       
   432   else return 1;
       
   433 }
       
   434 
       
   435 void testLearn()
       
   436 {
       
   437 
       
   438   learn("/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.tif",
       
   439 	   "/amd/nfs/cochise/home/ee/cs169/fa95class/cs169-ab/train.txt"); 
       
   440 }
       
   441 
       
   442 /*****************************************************************
       
   443   FUNCTIONS BEYOND THIS POINT ARE FOR AVERAGING LEARNED CHARACTERS
       
   444   AND ARE NOT CURRENTLY USED.
       
   445 *******************************************************************/
       
   446 
       
   447 void initLearnedChars()
       
   448 /*--------------------------------------------------------------
       
   449 Primary Purpose: Initializes learned character array. Sets asciiId
       
   450 to array offset.
       
   451 Rev: KM 11/6/95
       
   452 ---------------------------------------------------------------*/
       
   453 {
       
   454   LearnedChars = new Component[256];
       
   455  
       
   456   for (int i=0; i < 256; i++)
       
   457     {
       
   458       LearnedChars[i].asciiId() = (char)i;
       
   459     }
       
   460 
       
   461 }
       
   462 
       
   463 void oldlearn(char * tifFile, char * asciiFile)
       
   464 /*--------------------------------------------------------------
       
   465 Primary Purpose:  builds property vectors for LearnedChars array
       
   466 Arguments: tiffFile name of a tiff file to learn from
       
   467            asciiFile name of an ascii translation file
       
   468 Effects:  Assumes a one to one correspondence between each connected
       
   469 component on a line of the tif file and each character on the corresponding
       
   470 line of the ascii file.  For learned characters confidence is set
       
   471 to the number of examples.
       
   472 
       
   473 Rev:  11/6/95
       
   474 ---------------------------------------------------------------*/
       
   475 {
       
   476   FILE * transFile;
       
   477   transFile = fopen(asciiFile,"r");
       
   478   Page * learnPage = new Page;
       
   479   initCharBitsSet();
       
   480   learnPage->readMap(tifFile);
       
   481   learnPage->setLines();
       
   482   learnPage->extractComponents(MinHorizSeparation);         /* why minlinesize? */
       
   483   int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize; 
       
   484   char buffer[maxCharsPerLine];
       
   485   int i = 0;
       
   486   int buflength;
       
   487   bool emptyLine;
       
   488   Components * components;
       
   489   Component * item;
       
   490   int count[256]; // a count of how many of each char have been encountered
       
   491   int prop[256][numProperties]; // Character property sums. Need ints so that 
       
   492                                  // property sum does
       
   493                                 // not exceed char boundaries
       
   494   char id;
       
   495 
       
   496   initLearnedChars();
       
   497   for (i = 0; i < 256; i++)
       
   498     {
       
   499       count[i] = 0;
       
   500       for (int p  = 0; p < numProperties; p++)
       
   501 	prop[i][p] = 0;
       
   502     }
       
   503   i=0; 
       
   504 
       
   505   int offset;
       
   506   while (i < learnPage->numLines() &&  
       
   507 	 fgets(buffer, maxCharsPerLine, transFile))
       
   508       {
       
   509 	buflength = strlen(buffer);
       
   510 	components = learnPage->line(i++);
       
   511 	int c = 0;
       
   512 	for (ListElement* ptr = components->first; ptr != NULL; 
       
   513 	     ptr = ptr->next) 
       
   514 	  {
       
   515 	  item = (Component *)(ptr->item);
       
   516 	  // skip over white space
       
   517 	  while(whitespace(buffer[c]) && c < buflength)c++;
       
   518 	  if (c >= buflength)break;
       
   519 	  id =  buffer[c++];
       
   520 	  count[id]++;  // increment character count
       
   521 	  for (offset=0; offset < numProperties; offset++)
       
   522 	    prop[id][offset] += (item->properties())[offset];
       
   523 	  LearnedChars[i].numBits() += item->numBits();
       
   524 	}
       
   525       }
       
   526   // now divide by count and put in Learned character
       
   527   for(int j = 0; j < 256; j++)
       
   528       {
       
   529 	if(count[j] > 0)
       
   530 	    {
       
   531 	      for (int offset=0; offset < numProperties; offset++)
       
   532 		prop[j][offset] /= count[j];
       
   533 	      LearnedChars[j].numBits() /= count[j]; 
       
   534 	      LearnedChars[j].confid() = count[j];
       
   535 	      for (offset=0; offset < numProperties; offset++)
       
   536 		(LearnedChars[j].properties())[offset] = prop[j][offset];
       
   537 //	      printf("%d occurrences of %c\n", count[j], (char)j);
       
   538 	      printVector(LearnedChars[j].properties(), numProperties);
       
   539 			   
       
   540 	    }
       
   541 
       
   542       }
       
   543 }
       
   544 
       
   545 void oldtestLearn()
       
   546 {
       
   547 
       
   548 
       
   549   learn("train.tif", "train.txt");
       
   550   if (ENABLE_USER_INTERFACE)
       
   551   docommand(".main_window.display.work_space delete IMAGE_TAG");
       
   552 }
       
   553 
       
   554 
       
   555 
       
   556 
       
   557 
       
   558