reference/ocr-simple/Page.cc
changeset 0 6b8091ca909a
equal deleted inserted replaced
-1:000000000000 0:6b8091ca909a
       
     1 /** Page.cc contains the member functions for the primary OCR class Page */
       
     2 #include "system.h"
       
     3 #include "Page.h"
       
     4 #include "convertMap.h"
       
     5 #include "get_skew.h"
       
     6 #include "Component.h"
       
     7 #include "status_message.h"
       
     8 
       
     9 /*** Member functions of class Page.     ***/
       
    10 
       
    11 int Page::get_height()
       
    12 {
       
    13   return fRLEMap->imageLength();
       
    14 }
       
    15 
       
    16 int Page::get_width()
       
    17 {
       
    18   return fRLEMap->imageWidth();
       
    19 }
       
    20 
       
    21 int Page::send_words_to_tcl()
       
    22 /*--------------------------------------------------------------
       
    23 Primary Purpose:  Display words in tcl
       
    24 Rev - AR
       
    25 ---------------------------------------------------------------*/
       
    26 {
       
    27   int word_count = 0;
       
    28   int unknown_char_count = 0;
       
    29   int low_precision_count = 0;
       
    30   int mispelled_count = 0;
       
    31   char* send_chars;
       
    32   Word* temp_word;
       
    33   if(ENABLE_USER_INTERFACE) set_status("Displaying text");
       
    34   for(ListElement* ptr = (words())->first; ptr != NULL; ptr = ptr->next)
       
    35     {
       
    36       word_count++;
       
    37       set_text_display_status(word_count, fWordList->num_words);
       
    38       temp_word = (Word*)ptr->item;
       
    39       send_chars = backslashify(temp_word->characters);
       
    40       /*	printf("Added word %s Confidence = %d\n", send_chars, 
       
    41 	       temp_word->confid); */
       
    42       if(temp_word->confid < VERY_LOW_CONFIDENCE)
       
    43 	  {
       
    44 	    docommand("addword \"%s\" %d %d UNKNOWN_CHAR", send_chars, temp_word->ul.x(),  temp_word->ul.y());
       
    45 	    unknown_char_count++;
       
    46 	  }
       
    47       else if(temp_word->confid < LOW_CONFIDENCE)
       
    48 	  {
       
    49 	    docommand("addword \"%s\" %d %d LOW_PRECISION", send_chars, temp_word->ul.x(),  temp_word->ul.y());
       
    50 	    low_precision_count++;
       
    51 	  }
       
    52       else if((temp_word->mispelled) && SPELLCHECK)
       
    53 	  {
       
    54 	    docommand("addword \"%s\" %d %d MISPELLED", send_chars, temp_word->ul.x(),  temp_word->ul.y());
       
    55 	    mispelled_count++;
       
    56 	  }
       
    57       else
       
    58 	  {
       
    59 	    docommand("addword \"%s\" %d %d OK", send_chars, temp_word->ul.x(),  temp_word->ul.y());
       
    60 	  }
       
    61       update();
       
    62     }
       
    63   if(ENABLE_USER_INTERFACE)
       
    64       {
       
    65     set_status("Done displaying text");
       
    66     set_status("Apparent word accuracy: %.3lf%%", (100 - (100 * ((double)(mispelled_count + unknown_char_count + low_precision_count) / (double)word_count))));
       
    67   }
       
    68 }
       
    69 
       
    70 
       
    71 int Page::deskew(int deskew_method)
       
    72 /*--------------------------------------------------------------
       
    73 Primary Purpose: Deskew the page
       
    74 Arguments: 1 - RLE Rotation
       
    75            0 - BitMap Rotation
       
    76 Return Value: 1 if successful, 0 if unsuccessful
       
    77 Effects: updates the bitmap and rlemap of the page
       
    78 Constraints: RLEMap Rotation is not currently reliable and probably
       
    79 should not be used
       
    80 Rev: AR
       
    81 ---------------------------------------------------------------*/
       
    82 {
       
    83   /* a little ugly.... if the page is rotated
       
    84      in here, return 1, else 0 */
       
    85 
       
    86   if(deskew_method == RLE_DESKEW)
       
    87       {
       
    88 	if(fRLEMap->deskew())
       
    89 	{
       
    90 	  convertMap(fRLEMap, fBitMap);
       
    91 	  return 1;
       
    92 	}
       
    93 	return 0;
       
    94       }
       
    95   else
       
    96       {
       
    97       double skew = get_skew(fRLEMap);
       
    98       if((skew >= MINIMUM_SKEW_ANGLE)||(skew <= - MINIMUM_SKEW_ANGLE))
       
    99 	  {
       
   100 	    fBitMap->rotateMap(skew);
       
   101 	    convertMap(fBitMap, fRLEMap);
       
   102 	    return 1;
       
   103 	  }
       
   104       return 0;
       
   105     }
       
   106 }
       
   107 
       
   108 Page::Page()
       
   109 /**Page::Page - constructor allocates bitmap and rlemap*/
       
   110 {
       
   111   fBitMap = new BitMap;
       
   112   fRLEMap = new RLEMap;
       
   113   fLineComponents = NULL;
       
   114   fWordList = NULL;
       
   115 }
       
   116 
       
   117 Page::~Page()
       
   118 /*--------------------------------------------------------------
       
   119 Primary Purpose:  Destructor deallocates private fields that
       
   120 have been created.
       
   121 Rev:
       
   122 ---------------------------------------------------------------*/
       
   123 {
       
   124  
       
   125   if (flineinfo) delete flineinfo;
       
   126   for (int  i = 0; i <fnumLines; i++)
       
   127         if(fLineComponents[i]) delete fLineComponents[i];
       
   128   if(fLineComponents) delete fLineComponents;
       
   129   if (fBitMap) delete fBitMap;
       
   130   if (fRLEMap) delete fRLEMap;
       
   131   if (fWordList) delete fWordList;
       
   132 }
       
   133 
       
   134 Angle Page::skewAngle()
       
   135 /*--------------------------------------------------------------
       
   136 Primary Purpose: Determine the angle of rotation of the RLEMap r
       
   137 Arguments: pointer to an RLEMap
       
   138 Return Value: detected angle of rotation
       
   139 Code is in get_skew.cc
       
   140 Rev: AR
       
   141 ---------------------------------------------------------------*/
       
   142 {
       
   143   return get_skew(fRLEMap);
       
   144 }
       
   145 
       
   146 
       
   147 MapStatus Page::readMap(char * filename)
       
   148  // Calls BitMap::readMap and then converts
       
   149 {
       
   150   MapStatus status;
       
   151   status = fBitMap->readMap(filename);
       
   152   convertMap(fBitMap, fRLEMap);
       
   153   return status;
       
   154 }
       
   155 
       
   156 
       
   157 
       
   158 MapStatus Page::setLines()
       
   159 /*--------------------------------------------------------------
       
   160 Primary Purpose:  Set flineinfo array in Page class with the 
       
   161       starting and ending rows of each line of text.
       
   162       Also sets fnumLines to the number of lines
       
   163 Arguments: none
       
   164 Return Value: A Mapstatus either VALID, EMPTY if there is no
       
   165    data in the RLEMAP, or OTHERERROR if there is an unexpected error
       
   166 Effects:  Allocates flineinfo and fills with starting and ending row
       
   167    of each line.  The following global variables are used as parameters
       
   168    in this function.  These are defined in system.cc
       
   169    NoiseTolerance - Rows whose number of pixels is less than  this value
       
   170                 will be considered empty (current val 6). 
       
   171    MinVertSeparation - The minimum number of rows separating lines of text.
       
   172                  Lines will be merged if actual Separation is less than this
       
   173 		 value. (current val 3)
       
   174    MinLineSize - The minimum number of rows in a line of text.  
       
   175                  Any smaller lines are discarded (currentval 5)
       
   176 
       
   177 Constraints: Page::readMap() must be run first to fill fRLEMap 
       
   178 Rev: 10/26 KM
       
   179 ---------------------------------------------------------------*/
       
   180 {
       
   181 
       
   182    int maxrow = fRLEMap->imageLength() - 1;      // maximum row number 
       
   183    int actualSeparation = MinVertSeparation + 1; // must be bigger than min
       
   184                                                  // for line 0
       
   185 
       
   186    int linenum=0;                                // current line number
       
   187    int prvlinenum = 0;
       
   188    int lineSize;                                 // # rows in current line 
       
   189 
       
   190    int maxLines = maxrow/MinLineSize;           // max # of lines of text 
       
   191 
       
   192    if(maxrow == 0) return EMPTY;
       
   193 
       
   194    flineinfo = new LineMarker[maxLines]; 
       
   195 
       
   196    for (int i = 0; i < maxrow;)
       
   197 	{
       
   198 	  LineMarker & thisLine = flineinfo[linenum];
       
   199 	  LineMarker & prevLine = flineinfo[prvlinenum];
       
   200 
       
   201 	  while (i < maxrow && fRLEMap->row(i)->numPixels < NoiseTolerance)
       
   202 	    i++;
       
   203 	  thisLine.fstartrow = i++;
       
   204 	  while (i < maxrow &&fRLEMap->row(i)->numPixels > NoiseTolerance)
       
   205 	    i++;
       
   206 	  
       
   207 
       
   208 	  lineSize = i - thisLine.fstartrow +1;
       
   209 
       
   210 	  // If this line is less than MinVertSeparation away
       
   211 	  //  from the last line.  Join the two together.
       
   212 	  if (linenum > 0)
       
   213 	    {
       
   214 	      actualSeparation = thisLine.fstartrow - prevLine.fendrow;
       
   215 	    }
       
   216 	  if (actualSeparation < MinVertSeparation)
       
   217 	    {
       
   218 	     // If too small of a separation, add into prev row
       
   219 	     prevLine.fendrow = i;
       
   220 	   }
       
   221 	  else if (lineSize >= MinLineSize)
       
   222 	    {
       
   223 	    thisLine.fendrow = i;
       
   224 /*	    printf (" Line %d  Start: %d  End: %d  lineHeight %d\n", 
       
   225 	        linenum,thisLine.fstartrow,
       
   226 	        thisLine.fendrow, 
       
   227 	        thisLine.fendrow  - thisLine.fstartrow +1);
       
   228 */
       
   229 	    prvlinenum = linenum;
       
   230 	    linenum++;
       
   231 
       
   232 	  }
       
   233 	  if (linenum >= maxLines) return OTHERERROR;
       
   234 	}
       
   235 
       
   236    fnumLines = linenum;   // Set number of lines in page class
       
   237 
       
   238    fLineComponents = new Components*[fnumLines];
       
   239    if((ENABLE_USER_INTERFACE) && DISPLAY_LINE_BOUNDARIES)
       
   240      {
       
   241        display_line_boundaries();
       
   242      }
       
   243    /*   printf("Setlines found a total of %d lines.\n", fnumLines); */
       
   244    if(ENABLE_USER_INTERFACE) 
       
   245      update(); 
       
   246    return VALID;
       
   247  }
       
   248 
       
   249 void Page::display_line_boundaries()
       
   250 /*--------------------------------------------------------------
       
   251 Primary Purpose: Display line boundaries in TCL/TK.  Called from
       
   252 setLines if ENABLE_USER_INTERFACE and DISPLAY_LINE_BOUNDARIES are
       
   253 set to TRUE
       
   254 Effects:  Draws a blue line between each line of text
       
   255 Rev:  AR
       
   256 ---------------------------------------------------------------*/
       
   257 {
       
   258   int centerline, width;
       
   259   for(int j=0; j < fnumLines; j++)
       
   260     {
       
   261       centerline = (flineinfo[j].fendrow + flineinfo[j + 1].fstartrow) / 2;
       
   262       width = flineinfo[j + 1].fstartrow - flineinfo[j].fendrow;
       
   263 
       
   264       scale(centerline);
       
   265       scale(width);
       
   266       /* having this pathname here is probably not such a good idea...*/
       
   267       
       
   268       docommand(".main_window.display.work_space create line %d %d %d %d -width %d -fill blue -tags {project_ray IMAGE_TAG} -stipple @/usr/sww/share/tclX-7.3a/tkX/3.6a/demos/bitmaps/grey.25", 0, centerline, bmap()->imageWidth(), centerline, width);
       
   269     }
       
   270 }
       
   271 
       
   272 
       
   273 int test_rlemap_lines(RLEMap* rmap)
       
   274 {
       
   275   int length = rmap->imageLength();
       
   276   for(int i = 0; i < length; i++)
       
   277     printf("On line %d, numpixels = %d\n", i, rmap->fMapData[i]->numPixels);
       
   278 }
       
   279 
       
   280 
       
   281 MapStatus Page::extractComponents()
       
   282 /*--------------------------------------------------------------
       
   283                      Component extraction routines.
       
   284 *
       
   285 * Given the top and bottom line of a row we want to generate a list of
       
   286 * components. The general method is to find the closest dot, trace its 
       
   287 * connected dots, then project upwards and downwards and add anything we 
       
   288 * find there to the component. We will erase the component from the RLEMap
       
   289 * as it is added to the component list. By projecting up and down 
       
   290 * from the piece we first find we should be able
       
   291 * to completely encompass characters like :;i?|! The only problems are 
       
   292 * italic or ligatured characters where we may pick up two or more 
       
   293 * characters at a time (which would be bad) or characters fragmented 
       
   294 * with a vertical gap.
       
   295 
       
   296 Primary Purpose: Main extraction routine.
       
   297 Effects: Makes new components and puts them in a list. Deletes components 
       
   298          from RLE map. Fills in component boundaries and calls 
       
   299 	 Component::setProperties to set the property vector
       
   300          Lastly convertMap is run to rebuild the RLEMap
       
   301 Constraints: Page::setLines() must be run first 
       
   302 Rev: 11/2 JMH
       
   303      11/8 KM add set properties and
       
   304      avgSpacing;
       
   305 ---------------------------------------------------------------*/
       
   306 {
       
   307   int currentCol, startRow, endRow, rowHeight;
       
   308   ListElement* intrvl;
       
   309   ListElement* tempintrvl;
       
   310   /*  printf("fnumLines = %d\n", fnumLines); */
       
   311   Component* comp;
       
   312   int  totalSpacing = 0;  // total blank horizontal pixels between components
       
   313   int  baselines[MaxVertSize];     // array for finding the baseline
       
   314   last_status = 0.0;
       
   315   int compCounter = 0;
       
   316   int i;
       
   317   int j;
       
   318     printf("Extracting Components\n");
       
   319   for (i = 0; i < fnumLines; i++) {
       
   320     if(ENABLE_USER_INTERFACE)
       
   321       set_component_status(i, fnumLines);
       
   322     currentCol = 0;
       
   323     startRow = flineinfo[i].fstartrow;
       
   324     endRow = flineinfo[i].fendrow;
       
   325     rowHeight = endRow - startRow;
       
   326     assert(rowHeight > 0);
       
   327 
       
   328     for (j=0; j < MaxVertSize; j++)
       
   329       baselines[j] = 0;
       
   330     fLineComponents[i] = new Components();
       
   331 
       
   332 
       
   333     while (currentCol<=fRLEMap->imageWidth()) {  //until we reach the end of the page
       
   334 
       
   335 	//Build component starting with closest black dot
       
   336 	intrvl = fRLEMap->FindNearHorizDot(currentCol, startRow, endRow);
       
   337 	if (intrvl == NULL) {
       
   338 	//  printf("Reached end of line\n");
       
   339 	  break;
       
   340 	}
       
   341 	comp = new Component(); //Make a new component named comp
       
   342 	assert(comp->AddToComponent(intrvl, fRLEMap));
       
   343 
       
   344 	//Now we want to extend upwards
       
   345 	//First check if there is a blank space to the right
       
   346 	tempintrvl = fRLEMap->FindNearHorizDot(comp->lr().x(), 
       
   347 					       startRow, endRow);
       
   348 	if (tempintrvl != NULL && ((RLEPair*) tempintrvl->item)->start > 
       
   349 	    comp->lr().x()+MinHorizSeparation+1)
       
   350 	  while (comp->ul().y() < endRow) {
       
   351 	    intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), 
       
   352 					      comp->lr().x(), comp->lr().y(),
       
   353 					      startRow);
       
   354 	    if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap)))
       
   355 	      break;
       
   356 	    if (intrvl == NULL) break;
       
   357 	  }
       
   358 	else
       
   359 	  while (comp->ul().y() < endRow) {
       
   360 	    intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), 
       
   361 					      comp->lr().x(), comp->ul().y(),
       
   362 					      startRow);
       
   363 	    if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap)));
       
   364 	    break;
       
   365 	    if (intrvl == NULL) break;
       
   366 	  }
       
   367 
       
   368 	//Now we want to extend downwards
       
   369 	while (comp->lr().y() > startRow) {
       
   370 	  intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), comp->lr().x(), 
       
   371 						    comp->lr().y(), endRow);
       
   372 	  if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap)))
       
   373 	    break;
       
   374 	  if (intrvl == NULL) break;
       
   375 	}
       
   376 
       
   377 	// Now we toss out the noise
       
   378 	int size;
       
   379 	if (comp != NULL) {
       
   380 	  if (comp->ul() < Point(0,0))
       
   381 	    printf("Here's a problem. %d, %d\n", comp->ul().x(), comp->ul().y());
       
   382 	  else
       
   383 	    size = fBitMap->pixelsInRegion(comp->ul(), comp->lr());
       
   384 	}
       
   385 	  else
       
   386 	    size = 0;
       
   387 	if (size < MinComponentSize) {
       
   388 //	  printf("Deleting some noise of size %d\n", size);
       
   389 	  // printComponent(comp);
       
   390 	  delete comp;
       
   391 	  comp = NULL;
       
   392 	}
       
   393 	else
       
   394 	    {
       
   395 	      compCounter++;
       
   396 	      // display a rectangle around the component
       
   397 	      if(ENABLE_USER_INTERFACE)
       
   398 		  {
       
   399 		    if(DISPLAY_BOUNDING_BOXES)
       
   400 		      comp->display_bounding_box();
       
   401 		  }
       
   402 	  
       
   403 	  // JMH - make an array of frequency of the y coord of bottom of comp
       
   404 	      int vertOffset = endRow - comp->lr().y();
       
   405 	      if(vertOffset < MaxVertSize && vertOffset >= 0)
       
   406 		baselines[vertOffset]++;
       
   407 
       
   408 	  
       
   409 	      comp->setProperties(fBitMap);
       
   410 	      if(fLineComponents[i]->last != NULL)
       
   411 		totalSpacing += 
       
   412 		  comp->ul().x() - 
       
   413 		    ((Component *) (fLineComponents[i]->last->item))->lr().x();
       
   414 
       
   415 	      fLineComponents[i]->Append(comp);       // add this component to list
       
   416 	      currentCol = (comp->lr()).x() + 1;   // update position on page
       
   417 	    }
       
   418       }
       
   419     
       
   420     // find most popular bottom of comp and call it the baseline
       
   421     int counter = 0;
       
   422     int baseline;
       
   423     for (j=0; j < MaxVertSize; j++) {
       
   424       if (counter < baselines[j]) {
       
   425 	counter = baselines[j];
       
   426 	baseline = endRow - j;
       
   427       }
       
   428     }
       
   429     //    printf("For row %d to %d baseline = %d\n", startRow, endRow, baseline);
       
   430     // Now assign each character a group based on it's location
       
   431     for (ListElement* ptr = fLineComponents[i]->first; ptr != NULL; 
       
   432 	 ptr = ptr->next) {
       
   433       comp = (Component*) ptr->item;
       
   434       comp->charGroup = 0;
       
   435       
       
   436       // if top of char is higher than top - tolerance 
       
   437       if (comp->ul().y() < startRow + (rowHeight/TopLineTolerance)) {
       
   438 	comp->charGroup += 2; //tall like a T
       
   439       }
       
   440       
       
   441       // if bottom of char is lower than base - tolerance
       
   442       if (comp->lr().y() > baseline + (rowHeight/BaseLineTolerance)) {
       
   443 	comp->charGroup += 1; //has a tail like a y
       
   444       } else 
       
   445 	if (comp->lr().y() < (baseline - (2*rowHeight/BaseLineTolerance))) {
       
   446 	  comp->charGroup = 4; //floating like a '
       
   447 	  /*	  printf("bottom at %d < %d\n", comp->lr().y(),
       
   448 		  baseline - (2*rowHeight/BaseLineTolerance)); */
       
   449 	}
       
   450       //      printf("added character in group %d\n", comp->charGroup);
       
   451     }
       
   452   }
       
   453   /*  printf("Found %d components on this page.\n", compCounter); */
       
   454   //  printComponents();
       
   455   last_status = 0.0;
       
   456   if(ENABLE_USER_INTERFACE)
       
   457     set_status("Done extracting characters");
       
   458   if((compCounter - fnumLines) > 0) /* don't want divide by zero */
       
   459     {
       
   460       favgSpacing = totalSpacing / (compCounter - fnumLines);
       
   461     }
       
   462   else
       
   463     {
       
   464       favgSpacing = 1;  
       
   465     }
       
   466   delete fRLEMap;
       
   467   fRLEMap = new RLEMap;
       
   468   convertMap(fBitMap, fRLEMap);
       
   469 }
       
   470 
       
   471 void Page::printComponents()
       
   472 /*--------------------------------------------------------------
       
   473 Primary Purpose: Debugging routine that prints little bitmaps
       
   474 of low confidence characters
       
   475 ---------------------------------------------------------------*/
       
   476 {
       
   477   int compcounter = 0;
       
   478   for (int i = 0; i < fnumLines; i++) {
       
   479     Component* comp;
       
   480     for (ListElement* ptr = fLineComponents[i]->first; ptr != NULL; 
       
   481 	 ptr = ptr->next) {
       
   482       compcounter++;
       
   483       comp = (Component *) ptr->item;
       
   484       if (comp->confid() < (ConfidenceThreshold-20) && comp->asciiId() == 'n')
       
   485       {
       
   486 	printf("Here's a poorly recognized component ul=%d,%d, lr=%d,%d.\n\n", 
       
   487 	   (comp->ul()).x(), (comp->ul()).y(),
       
   488 	   (comp->lr()).x(), (comp->lr()).y());
       
   489 	printComponent(comp);
       
   490 	printf("properties: "); 
       
   491 	printVector(comp->properties(), numProperties);
       
   492 	printf("I think it's a -> %c <-   confidence: %d  line: %d  group: %d Comp#%d\n",
       
   493 	       comp->asciiId(),
       
   494 	       comp->confid(), i+1, comp->charGroup, compcounter);
       
   495 	printf("\n*******************************************************\n");
       
   496       }
       
   497     }
       
   498   }
       
   499 }
       
   500 
       
   501 void Page::printComponent(Component* comp)
       
   502 // Print a single component.
       
   503 {
       
   504   int right = comp->ul().x()+78;
       
   505   if (comp->lr().x() < right) 
       
   506     right = comp->lr().x();
       
   507 
       
   508   for (int r = comp->ul().y(); 
       
   509        r <= comp->lr().y(); r++){
       
   510     for (int c = comp->ul().x();
       
   511 	 c <= right; c++)
       
   512       bitprint(fBitMap->row(r)[c/8], c%8);
       
   513     printf( "\n");
       
   514   }
       
   515 }
       
   516 
       
   517 int spacing(ListElement * compa, ListElement * compb);
       
   518 // helper function for extractWords  (defined below)
       
   519 
       
   520 MapStatus Page::extractWords()
       
   521 /*--------------------------------------------------------------
       
   522 Primary Purpose: Extract words from each lines components
       
   523 Effects: sets the fWordsList to be a list of all of the words
       
   524 in the document.
       
   525 Constraints: extractComponents must be run first
       
   526 Rev: KM 11/7/95
       
   527 ---------------------------------------------------------------*/
       
   528 {
       
   529   bool inWord;
       
   530   ListElement * start;   // word Start
       
   531   int count;   // counts the characters in the word
       
   532   int word_count = 0;
       
   533   int spacingThreshold = (int) (1.25 * ((float) (favgSpacing)));
       
   534   fWordList = new Words;
       
   535   last_status = 0.0;
       
   536   for (int i = 0; i < fnumLines; i++)
       
   537       {
       
   538 	if(ENABLE_USER_INTERFACE)
       
   539 	  set_extract_status(i, fnumLines);
       
   540 	inWord = FALSE;
       
   541 	for(ListElement *ptr = line(i)->first; ptr != NULL; ptr = ptr->next) {
       
   542 	  if(!inWord)
       
   543 	      {
       
   544 		start = ptr;
       
   545 		count = 1;
       
   546 		inWord = TRUE;
       
   547 	      }
       
   548 	  if( spacing(ptr, ptr->next) > spacingThreshold)
       
   549 	      {
       
   550 		Word * newWord = new Word(start,count);
       
   551 		(words())->Append(newWord);
       
   552 		if(1)
       
   553 		  printf("%s ",newWord->characters);
       
   554 		inWord = FALSE;
       
   555 		word_count++;
       
   556 	      }
       
   557 	  else
       
   558 	    count++;
       
   559 	}
       
   560 	// Add in a separate word for new line
       
   561 	Word * newWord = new Word("\n",2);
       
   562         (words())->Append(newWord);
       
   563 	printf("%s", newWord->characters);
       
   564 	word_count++;
       
   565       }
       
   566   last_status = 0.0;
       
   567   fWordList->num_words = word_count;
       
   568   if(ENABLE_USER_INTERFACE)
       
   569     set_status("Done extracting words");
       
   570   return VALID;
       
   571 }
       
   572 
       
   573 void Page::spellcheck()
       
   574 /*--------------------------------------------------------------
       
   575 Primary Purpose: Run spell checker on word list.
       
   576 Constraints: extractWords must be run first
       
   577 Rev: AR
       
   578 ---------------------------------------------------------------*/
       
   579 {
       
   580   int word_count = 0;
       
   581   Word* temp_word;
       
   582   for(ListElement* ptr = (words())->first; ptr != NULL; ptr = ptr->next)
       
   583     {
       
   584       word_count++;
       
   585       if(ENABLE_USER_INTERFACE)
       
   586 	set_spellcheck_status(word_count, fWordList->num_words);
       
   587       temp_word = (Word*)ptr->item;
       
   588       if(0)
       
   589 	printf("Spellchecking word %s\n", temp_word->characters);
       
   590       if(mispelled(temp_word->characters))
       
   591 	{
       
   592 	  temp_word->mispelled = TRUE;
       
   593 	}
       
   594     }
       
   595 }
       
   596 
       
   597 int Page::spacing(ListElement * compa, ListElement * compb)
       
   598 // spacing from end of comp_a to begining of comp_b
       
   599 {
       
   600   int x;
       
   601   if (compb == NULL) return 1000;  // end of line
       
   602 
       
   603   Component * a = ((Component *) (compa)->item);
       
   604   Component * b = ((Component *) (compb)->item);
       
   605   int returnval =  (b->ul().x() - a->lr().x());
       
   606   if (returnval < 0) 
       
   607     {
       
   608       return 0;
       
   609     }
       
   610   assert (returnval >= 0);
       
   611   return returnval;
       
   612 
       
   613 }
       
   614 
       
   615 
       
   616 void Page::printWords()
       
   617 // Prits out each component of each word. This can take a very long time
       
   618 {
       
   619 
       
   620   Word * thisWord;
       
   621   for (ListElement * ptr = words()->first; ptr !=NULL; ptr= ptr->next)
       
   622       {
       
   623 	thisWord = (Word *) ptr->item;
       
   624 	printf("!!!!!! NEW WORD  %s  confid : %d !!!!!\n", thisWord->characters, thisWord->confid);
       
   625 	for(int i = 0; i < thisWord->charCount; i++)
       
   626 	    {
       
   627 	      Component * comp = thisWord->character[i];
       
   628 	      if (comp == NULL) continue;
       
   629 	      printf("Printing a component ul=%d,%d, lr=%d,%d.\n\n", 
       
   630 		     (comp->ul()).x(), (comp->ul()).y(),
       
   631 		     (comp->lr()).x(), (comp->lr()).y());
       
   632 	      for (int r = comp->ul().y(); 
       
   633 		   r <= comp->lr().y(); r++){
       
   634 		for (int c = comp->ul().x();
       
   635 		     c <= comp->lr().x(); c++)
       
   636 		  bitprint(fBitMap->row(r)[c/8], c%8);
       
   637 		printf( "\n");
       
   638 	      }
       
   639 	      printf("properties: "); 
       
   640 	      printVector(comp->properties(), numProperties);
       
   641 	      printf("Identification:  %c distance: %d confidence %d\n",
       
   642 		     comp->asciiId(),
       
   643 		     comp->distance(&LearnedChars[comp->asciiId()]),
       
   644 	             comp->confid());
       
   645 	      printf("\n***********************************************\n");
       
   646 	    }
       
   647       }
       
   648 }
       
   649 
       
   650 MapStatus Page::recognize()
       
   651 /*--------------------------------------------------------------
       
   652 Primary Purpose: Recognize entire page.  Sets font and ascii id of
       
   653 each component
       
   654 Return Value: VALID if no error occurred OTHERERROR otherwise
       
   655 Constraints: extractComponents must be run first.
       
   656 See recognize(line) below for more detailed info
       
   657 Rev: KM
       
   658 ---------------------------------------------------------------*/
       
   659 {
       
   660   printf("Recognizing document\n");
       
   661   last_status = 0.0;
       
   662   for (int i = 0; i< fnumLines; i++)
       
   663       { 
       
   664 	if(ENABLE_USER_INTERFACE)
       
   665 	  set_recognize_status(i, fnumLines);
       
   666 	recognize(i);
       
   667       }
       
   668 
       
   669   last_status = 0.0;
       
   670   return VALID;
       
   671 
       
   672 }
       
   673 
       
   674 
       
   675 MapStatus Page::recognize(int linenum)
       
   676 /*--------------------------------------------------------------
       
   677 Primary Purpose: Recognize a line of connected components
       
   678 Arguments:  linenum is line number to recognize
       
   679 Effects: sets ascii identification fontid and confidence in each component
       
   680 If confidence is low and character is big enough for two characters.
       
   681 divideAndRecognize is called to split up the component.
       
   682 Constraints: extractComponents must be run first
       
   683 Rev: KM 11/9/95
       
   684 ---------------------------------------------------------------*/
       
   685 {
       
   686   Component * comp;
       
   687   Distance d;
       
   688 
       
   689   for(ListElement *ptr = line(linenum)->first; ptr != NULL; ptr = ptr->next) 
       
   690       {
       
   691 	comp = (Component *) ptr->item;
       
   692 
       
   693 	d = comp->recognize(LearnedGroups);
       
   694 	if (comp->confid() < ConfidenceThreshold && 
       
   695 	    comp->width() > 2*MinWidth) // really wide
       
   696 	  divideAndRecognize(line(linenum), ptr, d);
       
   697       }
       
   698 
       
   699   return VALID;
       
   700 }
       
   701 
       
   702 
       
   703 
       
   704 void Page::divideAndRecognize (Components *list, ListElement * ptr, Distance d)
       
   705 /*--------------------------------------------------------------
       
   706 Primary Purpose: Identify and separate merged characters
       
   707 Arguments:ptr is a pointer to a list element containing a component
       
   708           d is the current recognition distance on the component
       
   709 Effects: Subdivides component into two parts, Division is made at
       
   710          the minimum vertical height of the component.  If the 
       
   711 	 minHeight > JoinTolerance no divison will be made.
       
   712 	 (JoinTolerance is a global var that determines
       
   713 	 the maximum number of merged pixels that are allowed in a
       
   714 	 column for a division to be made)
       
   715 	 When a division is made.  The component's boundaries are 
       
   716 	 adjusted accordingly and a new component is inserted into
       
   717 	 the list.
       
   718 
       
   719 	 Returns if distance is acceptable or width of component
       
   720 	 is <= MinWidth*2
       
   721 Rev: KM 11/24/95
       
   722 ---------------------------------------------------------------*/
       
   723 {
       
   724   Component * comp = (Component *) ptr->item;
       
   725   Component * newComp;
       
   726   bool allGroups = TRUE;
       
   727 
       
   728   // Save the original component boundaries just in case we cant improve
       
   729   Point oldlr = comp->lr();
       
   730   Point oldul = comp->ul();
       
   731   int oldwidth = (int) comp->width();
       
   732 
       
   733   // Some easy access x,y coordinates
       
   734   int ulx = comp->ul().x();
       
   735   int uly = comp->ul().y();
       
   736   int lrx = comp->lr().x();
       
   737   int lry = comp->lr().y();
       
   738 
       
   739   Distance newdist, bestdist;
       
   740   int bestlrx;
       
   741 
       
   742   if (comp->confid() > ConfidenceThreshold)
       
   743     return;
       
   744 
       
   745   if (oldwidth < MinWidth*2)  // cant be split in two
       
   746       {
       
   747 	return;
       
   748       }
       
   749 
       
   750   // Determine where to split.  Split at the thinnest point
       
   751   // within JoinTolerance (maximum number of pixels that might be fused)
       
   752 
       
   753   int minHeight = (int)comp->height();
       
   754   bestlrx = comp->lr().x();
       
   755   for(int i = MinWidth; i < oldwidth - MinWidth; i++)
       
   756       {
       
   757 	int newHeight = 
       
   758 	  fBitMap->pixelsInRegion(Point(ulx+i,uly), Point(ulx+i,lry));
       
   759 	if (newHeight < minHeight)
       
   760 	    {
       
   761 	      minHeight = newHeight;
       
   762 	      bestlrx = ulx+i;
       
   763 	    }
       
   764       }
       
   765 //  printf("bestlrx = %d, minHeight = %d\n", bestlrx, minHeight);
       
   766 
       
   767 
       
   768   if (bestlrx < lrx  && minHeight < JoinTolerance)
       
   769       {
       
   770 	comp->lr().x() = bestlrx;
       
   771 	int shrunk = comp->vertShrink(fBitMap);
       
   772 	comp->setProperties(fBitMap);
       
   773 	if (shrunk)  // ignore group if we had to shrink down
       
   774 	  newdist = comp->recognize(LearnedGroups, allGroups);
       
   775 	else
       
   776 	  newdist = comp->recognize(LearnedGroups);
       
   777 
       
   778 //	printf("Distance = %u  asciiid = %c \n", newdist, comp->asciiId());
       
   779 
       
   780 	Component * newcomp = new Component(Point(bestlrx+1, oldul.y())
       
   781 					    , oldlr);
       
   782 	newcomp->vertShrink(fBitMap);
       
   783 	newcomp->setProperties(fBitMap);
       
   784 	int newcompdist = newcomp->recognize(LearnedGroups,allGroups);
       
   785 
       
   786 	if (newdist < d)
       
   787       	  list->insertAfter(ptr, newcomp);
       
   788 	else
       
   789 	    {
       
   790 	      	comp->ul() = oldul;
       
   791 		comp->lr() = oldlr;
       
   792 		comp->setProperties(fBitMap);
       
   793 		comp->recognize(LearnedGroups);
       
   794 		delete newcomp;
       
   795 	    }
       
   796 	return;
       
   797       }
       
   798 
       
   799 
       
   800   return;
       
   801 
       
   802 }
       
   803 
       
   804 
       
   805 void Page::uniteAndRecognize (Components *list, ListElement * ptr, Distance d)
       
   806 /*--------------------------------------------------------------
       
   807 Primary Purpose: Identify and merge a separated character
       
   808 Arguments:ptr is a pointer to a list element containing a component
       
   809           d is the current recognition distance on the component
       
   810 Effects: Unite two components into one.
       
   811     
       
   812 Rev: JMH 12/10/95
       
   813 ---------------------------------------------------------------*/
       
   814 {
       
   815   Component * part1 = (Component *) ptr->previous->item;
       
   816   Component * part2 = (Component *) ptr->item;
       
   817   Point ul, lr;
       
   818   ul = part1->ul();
       
   819   lr = part2->lr();
       
   820   if (ul.y() > lr.y() || ul.x() > lr.x())
       
   821     return;
       
   822   Component * newcomp = new Component(part1->ul(), part2->lr());
       
   823 
       
   824   newcomp->setProperties(fBitMap);
       
   825   if (part1->charGroup <= 3 && part2->charGroup <= 3)
       
   826     newcomp->charGroup = (part1->charGroup | part2->charGroup);
       
   827   else if (part1->charGroup == 4)
       
   828     newcomp->charGroup = (part2->charGroup | 2);
       
   829   else
       
   830     newcomp->charGroup = (part1->charGroup | 2);
       
   831   if (newcomp->charGroup > 4) newcomp->charGroup = 4;
       
   832 
       
   833   int newdist = newcomp->recognize(LearnedGroups);
       
   834 
       
   835   if (newdist < d) {
       
   836     list->removeAt(ptr->previous);
       
   837     list->insertAfter(ptr, newcomp);
       
   838     list->removeAt(ptr); 
       
   839   } else
       
   840     delete newcomp;
       
   841   return;
       
   842 
       
   843 }
       
   844 
       
   845 
       
   846 int Page::writeWordPos(char * filename)
       
   847 /*--------------------------------------------------------------
       
   848 Primary Purpose: Writes word position, confidence, length and string to file
       
   849 Arguments: output file name
       
   850 Return Value: 1 if successful. 0 if an error occured
       
   851 Effects: Calls fWordList->printWordPos
       
   852 	  // Output format for each word
       
   853 	      "%6d %6d %6d %6d %s\n", word->ul.x(), word->ul.y(),
       
   854 		          word->confid, word->charCount, word->characters 
       
   855 Rev: 11/25/95
       
   856 ---------------------------------------------------------------*/
       
   857 { return fWordList->writeWordPos(filename);};
       
   858 
       
   859 int Page::writeAscii(char * filename)
       
   860 /*--------------------------------------------------------------
       
   861 Primary Purpose: Write word list to asii file
       
   862 Arguments: filename to write to
       
   863 Return Value:  1 if successful 0 if unsuccessful
       
   864 Effects:  Calss fWordList->writeAscii(filename)
       
   865 Writes words to fill in text format using MinLineSize
       
   866 to differentiate lines.
       
   867 Rev: 11/25 KM
       
   868 ---------------------------------------------------------------*/
       
   869 
       
   870 {return fWordList->writeAscii(filename);};
       
   871 
       
   872 
       
   873