reference/ocr-new/Page.cc
changeset 0 6b8091ca909a
equal deleted inserted replaced
-1:000000000000 0:6b8091ca909a
       
     1 /** Page.cc contains the member functions for the primary OCR class Page */
       
     2 #include "system.h"
       
     3 #include "Page.h"
       
     4 #include "convertMap.h"
       
     5 #include "get_skew.h"
       
     6 #include "Component.h"
       
     7 #include "status_message.h"
       
     8 
       
     9 /*** Member functions of class Page.     ***/
       
    10 
       
    11 int Page::get_height()
       
    12 {
       
    13   return fRLEMap->imageLength();
       
    14 }
       
    15 
       
    16 int Page::get_width()
       
    17 {
       
    18   return fRLEMap->imageWidth();
       
    19 }
       
    20 
       
    21 int Page::get_linenum(int col, int row)
       
    22  /*--------------------------------------------------------------
       
    23 Primary Purpose: Returns line number of x,y coordinates (just uses y for now)
       
    24                  called from proc equation_mark in new_ui.tcl
       
    25 Return value: line number or -1 if no line is here.
       
    26 Requires: setLines be run first
       
    27 Rev: 4/21/96
       
    28 ---------------------------------------------------------------*/
       
    29 {
       
    30   assert (flineinfo != NULL);
       
    31   int linenum= -1;
       
    32 
       
    33   for (int i = 0; i < fnumLines; i++)
       
    34     if (flineinfo[i].fstartrow <= row && flineinfo[i].fendrow >= row)
       
    35       {
       
    36 	linenum = i;
       
    37 	if (ENABLE_USER_INTERFACE)
       
    38 	  {
       
    39 	    // save last mark before it is overwritten
       
    40 
       
    41 	    docommand("set curline %d",linenum);
       
    42 	    docommand("set curline_startrow %d",flineinfo[i].fstartrow);
       
    43 	    docommand("set curline_endrow %d",flineinfo[i].fendrow);
       
    44 
       
    45 	    
       
    46 	    // this will change with zoning
       
    47 	    docommand("set curline_startcol %d",0);
       
    48 	    docommand("set curline_endcol %d",get_width());
       
    49 	      
       
    50 	  }
       
    51 
       
    52 	break;
       
    53 
       
    54       }
       
    55 return linenum;
       
    56 
       
    57 }
       
    58 
       
    59 int Page::send_words_to_tcl()
       
    60 /*--------------------------------------------------------------
       
    61 Primary Purpose:  Display words in tcl
       
    62 Rev - AR
       
    63 ---------------------------------------------------------------*/
       
    64 {
       
    65   int word_count = 0;
       
    66   int unknown_char_count = 0;
       
    67   int low_precision_count = 0;
       
    68   int mispelled_count = 0;
       
    69   char* send_chars;
       
    70   Word* temp_word;
       
    71   if(ENABLE_USER_INTERFACE) set_status("Displaying text");
       
    72   for(ListElement* ptr = (words())->first; ptr != NULL; ptr = ptr->next)
       
    73     {
       
    74       word_count++;
       
    75       set_text_display_status(word_count, fWordList->num_words);
       
    76       temp_word = (Word*)ptr->item;
       
    77       send_chars = backslashify(temp_word->characters);
       
    78       /*	printf("Added word %s Confidence = %d\n", send_chars, 
       
    79 	       temp_word->confid); */
       
    80       if(temp_word->confid < VERY_LOW_CONFIDENCE)
       
    81 	  {
       
    82 	    docommand("addword \"%s\" %d %d UNKNOWN_CHAR", send_chars, temp_word->ul.x(),  temp_word->ul.y());
       
    83 	    unknown_char_count++;
       
    84 	  }
       
    85       else if(temp_word->confid < LOW_CONFIDENCE)
       
    86 	  {
       
    87 	    docommand("addword \"%s\" %d %d LOW_PRECISION", send_chars, temp_word->ul.x(),  temp_word->ul.y());
       
    88 	    low_precision_count++;
       
    89 	  }
       
    90       else if((temp_word->mispelled) && SPELLCHECK)
       
    91 	  {
       
    92 	    docommand("addword \"%s\" %d %d MISPELLED", send_chars, temp_word->ul.x(),  temp_word->ul.y());
       
    93 	    mispelled_count++;
       
    94 	  }
       
    95       else
       
    96 	  {
       
    97 	    docommand("addword \"%s\" %d %d OK", send_chars, temp_word->ul.x(),  temp_word->ul.y());
       
    98 	  }
       
    99       update();
       
   100     }
       
   101   if(ENABLE_USER_INTERFACE)
       
   102       {
       
   103     set_status("Done displaying text");
       
   104     set_status("Apparent word accuracy: %.3lf%%", (100 - (100 * ((double)(mispelled_count + unknown_char_count + low_precision_count) / (double)word_count))));
       
   105   }
       
   106 }
       
   107 
       
   108 
       
   109 int Page::deskew(int deskew_method)
       
   110 /*--------------------------------------------------------------
       
   111 Primary Purpose: Deskew the page
       
   112 Arguments: 1 - RLE Rotation
       
   113            0 - BitMap Rotation
       
   114 Return Value: 1 if successful, 0 if unsuccessful
       
   115 Effects: updates the bitmap and rlemap of the page
       
   116 Constraints: RLEMap Rotation is not currently reliable and probably
       
   117 should not be used
       
   118 Rev: AR
       
   119 ---------------------------------------------------------------*/
       
   120 {
       
   121   /* a little ugly.... if the page is rotated
       
   122      in here, return 1, else 0 */
       
   123 
       
   124   if(deskew_method == RLE_DESKEW)
       
   125       {
       
   126 	if(fRLEMap->deskew())
       
   127 	{
       
   128 	  convertMap(fRLEMap, fBitMap);
       
   129 	  return 1;
       
   130 	}
       
   131 	return 0;
       
   132       }
       
   133   else
       
   134       {
       
   135       double skew = get_skew(fRLEMap);
       
   136       if((skew >= MINIMUM_SKEW_ANGLE)||(skew <= - MINIMUM_SKEW_ANGLE))
       
   137 	  {
       
   138 	    fBitMap->rotateMap(skew);
       
   139 	    convertMap(fBitMap, fRLEMap);
       
   140 	    return 1;
       
   141 	  }
       
   142       return 0;
       
   143     }
       
   144 }
       
   145 
       
   146 Page::Page()
       
   147 /**Page::Page - constructor allocates bitmap and rlemap*/
       
   148 {
       
   149   fBitMap = new BitMap;
       
   150   fRLEMap = new RLEMap;
       
   151   fEqnList = new EqnMarkers;
       
   152   fLineComponents = NULL;
       
   153   fWordList = NULL;
       
   154 }
       
   155 
       
   156 Page::~Page()
       
   157 /*--------------------------------------------------------------
       
   158 Primary Purpose:  Destructor deallocates private fields that
       
   159 have been created.
       
   160 Rev:
       
   161 ---------------------------------------------------------------*/
       
   162 {
       
   163  
       
   164   if (flineinfo) delete flineinfo;
       
   165   for (int  i = 0; i <fnumLines; i++)
       
   166         if(fLineComponents[i] != NULL) delete fLineComponents[i];
       
   167   if(fLineComponents) delete fLineComponents;
       
   168   if (fBitMap) delete fBitMap;
       
   169   if (fRLEMap) delete fRLEMap;
       
   170   if (fWordList) delete fWordList;
       
   171   if (fEqnList) delete fEqnList;
       
   172 }
       
   173 
       
   174 Angle Page::skewAngle()
       
   175 /*--------------------------------------------------------------
       
   176 Primary Purpose: Determine the angle of rotation of the RLEMap r
       
   177 Arguments: pointer to an RLEMap
       
   178 Return Value: detected angle of rotation
       
   179 Code is in get_skew.cc
       
   180 Rev: AR
       
   181 ---------------------------------------------------------------*/
       
   182 {
       
   183   return get_skew(fRLEMap);
       
   184 }
       
   185 
       
   186 
       
   187 MapStatus Page::readMap(char * filename)
       
   188  // Calls BitMap::readMap and then converts
       
   189 {
       
   190   MapStatus status;
       
   191   status = fBitMap->readMap(filename);
       
   192   convertMap(fBitMap, fRLEMap);
       
   193   return status;
       
   194 }
       
   195 
       
   196 
       
   197 
       
   198 MapStatus Page::setLines()
       
   199 /*--------------------------------------------------------------
       
   200 Primary Purpose:  Set flineinfo array in Page class with the 
       
   201       starting and ending rows of each line of text.
       
   202       Also sets fnumLines to the number of lines
       
   203 Arguments: none
       
   204 Return Value: A Mapstatus either VALID, EMPTY if there is no
       
   205    data in the RLEMAP, or OTHERERROR if there is an unexpected error
       
   206 Effects:  Allocates flineinfo and fills with starting and ending row
       
   207    of each line.  The following global variables are used as parameters
       
   208    in this function.  These are defined in system.cc
       
   209    NoiseTolerance - Rows whose number of pixels is less than  this value
       
   210                 will be considered empty (current val 6). 
       
   211    MinVertSeparation - The minimum number of rows separating lines of text.
       
   212                  Lines will be merged if actual Separation is less than this
       
   213 		 value. (current val 3)
       
   214    MinLineSize - The minimum number of rows in a line of text.  
       
   215                  Any smaller lines are discarded (currentval 5)
       
   216 
       
   217 Constraints: Page::readMap() must be run first to fill fRLEMap 
       
   218 Rev: 10/26 KM
       
   219 ---------------------------------------------------------------*/
       
   220 {
       
   221 
       
   222    int maxrow = fRLEMap->imageLength() - 1;      // maximum row number 
       
   223    int actualSeparation = MinVertSeparation + 1; // must be bigger than min
       
   224                                                  // for line 0
       
   225 
       
   226    int linenum=0;                                // current line number
       
   227    int prvlinenum = 0;
       
   228    int lineSize;                                 // # rows in current line 
       
   229 
       
   230    int maxLines = maxrow/MinLineSize;           // max # of lines of text 
       
   231 
       
   232    if(maxrow == 0) return EMPTY;
       
   233 
       
   234    flineinfo = new LineMarker[maxLines]; 
       
   235 
       
   236    for (int i = 0; i < maxrow;)
       
   237 	{
       
   238 	  LineMarker & thisLine = flineinfo[linenum];
       
   239 	  LineMarker & prevLine = flineinfo[prvlinenum];
       
   240 
       
   241 	  while (i < maxrow && fRLEMap->row(i)->numPixels < NoiseTolerance)
       
   242 	    i++;
       
   243 	  thisLine.fstartrow = i++;
       
   244 	  while (i < maxrow &&fRLEMap->row(i)->numPixels > NoiseTolerance)
       
   245 	    i++;
       
   246 	  
       
   247 
       
   248 	  lineSize = i - thisLine.fstartrow +1;
       
   249 
       
   250 	  // If this line is less than MinVertSeparation away
       
   251 	  //  from the last line.  Join the two together.
       
   252 	  if (linenum > 0)
       
   253 	    {
       
   254 	      actualSeparation = thisLine.fstartrow - prevLine.fendrow;
       
   255 	    }
       
   256 	  if (actualSeparation < MinVertSeparation)
       
   257 	    {
       
   258 	     // If too small of a separation, add into prev row
       
   259 	     prevLine.fendrow = i;
       
   260 	   }
       
   261 	  else if (lineSize >= MinLineSize)
       
   262 	    {
       
   263 	    thisLine.fendrow = i;
       
   264 /*	    printf (" Line %d  Start: %d  End: %d  lineHeight %d\n", 
       
   265 	        linenum,thisLine.fstartrow,
       
   266 	        thisLine.fendrow, 
       
   267 	        thisLine.fendrow  - thisLine.fstartrow +1);
       
   268 */
       
   269 	    prvlinenum = linenum;
       
   270 	    linenum++;
       
   271 
       
   272 	  }
       
   273 	  if (linenum >= maxLines) return OTHERERROR;
       
   274 	}
       
   275 
       
   276    fnumLines = linenum;   // Set number of lines in page class
       
   277 
       
   278 
       
   279    if((ENABLE_USER_INTERFACE) && DISPLAY_LINE_BOUNDARIES)
       
   280      {
       
   281        display_line_boundaries();
       
   282      }
       
   283    /*   printf("Setlines found a total of %d lines.\n", fnumLines); */
       
   284    if(ENABLE_USER_INTERFACE) 
       
   285      update(); 
       
   286    return VALID;
       
   287  }
       
   288 
       
   289 void Page::display_line_boundaries()
       
   290 /*--------------------------------------------------------------
       
   291 Primary Purpose: Display line boundaries in TCL/TK.  Called from
       
   292 setLines if ENABLE_USER_INTERFACE and DISPLAY_LINE_BOUNDARIES are
       
   293 set to TRUE
       
   294 Effects:  Draws a blue line between each line of text
       
   295 Rev:  AR
       
   296 ---------------------------------------------------------------*/
       
   297 {
       
   298   int centerline, width;
       
   299   for(int j=0; j < fnumLines; j++)
       
   300     {
       
   301       centerline = (flineinfo[j].fendrow + flineinfo[j + 1].fstartrow) / 2;
       
   302       width = flineinfo[j + 1].fstartrow - flineinfo[j].fendrow;
       
   303 
       
   304       scale(centerline);
       
   305       scale(width);
       
   306       /* having this pathname here is probably not such a good idea...*/
       
   307       
       
   308       docommand(".main_window.display.work_space create line %d %d %d %d -width %d -fill blue -tags {project_ray IMAGE_TAG} -stipple @/usr/sww/share/tclX-7.3a/tkX/3.6a/demos/bitmaps/grey.25", 0, centerline, bmap()->imageWidth(), centerline, width);
       
   309     }
       
   310 
       
   311 }
       
   312 
       
   313 
       
   314 int test_rlemap_lines(RLEMap* rmap)
       
   315 {
       
   316   int length = rmap->imageLength();
       
   317   for(int i = 0; i < length; i++)
       
   318     printf("On line %d, numpixels = %d\n", i, rmap->fMapData[i]->numPixels);
       
   319 }
       
   320 
       
   321 
       
   322 MapStatus Page::extractComponents(int horizMerge)
       
   323 /*--------------------------------------------------------------
       
   324                      Component extraction routines.
       
   325 *
       
   326 * Given the top and bottom line of a row we want to generate a list of
       
   327 * components. The general method is to find the closest dot, trace its 
       
   328 * connected dots, then project upwards and downwards and add anything we 
       
   329 * find there to the component. We will erase the component from the RLEMap
       
   330 * as it is added to the component list. By projecting up and down 
       
   331 * from the piece we first find we should be able
       
   332 * to completely encompass characters like :;i?|! The only problems are 
       
   333 * italic or ligatured characters where we may pick up two or more 
       
   334 * characters at a time (which would be bad) or characters fragmented 
       
   335 * with a vertical gap.
       
   336 
       
   337 Primary Purpose: Main extraction routine.
       
   338 Effects: Makes new components and puts them in a list. Deletes components 
       
   339          from RLE map. Fills in component boundaries and calls 
       
   340 	 Component::setProperties to set the property vector
       
   341          Lastly convertMap is run to rebuild the RLEMap
       
   342 Constraints: Page::setLines() must be run first 
       
   343 Rev: 4/28/96
       
   344 ---------------------------------------------------------------*/
       
   345 {
       
   346   int currentCol, startRow, endRow, rowHeight;
       
   347   ListElement* intrvl;
       
   348   ListElement* tempintrvl;
       
   349   /*  printf("fnumLines = %d\n", fnumLines); */
       
   350   Component* comp;
       
   351   int  totalSpacing = 0;  // total blank horizontal pixels between components
       
   352   int  baselines[MaxVertSize];     // array for finding the baseline
       
   353   last_status = 0.0;
       
   354   int compCounter = 0;
       
   355   int i;
       
   356   int j;
       
   357   int upwardBound;      // Projection distances different for equations
       
   358   int downwardBound;    // and non-equations
       
   359   
       
   360  
       
   361   bool inEqn;          // Variables for finding if the center of a comp
       
   362   int centerx;         // is in an equation.
       
   363   int centery;
       
   364 
       
   365 
       
   366 
       
   367     printf("Extracting Components\n");
       
   368    fLineComponents = new Components*[fnumLines];
       
   369   for (i = 0; i < fnumLines; i++) {
       
   370     if(ENABLE_USER_INTERFACE)
       
   371       set_component_status(i, fnumLines);
       
   372     currentCol = 0;
       
   373     startRow = flineinfo[i].fstartrow;
       
   374     endRow = flineinfo[i].fendrow;
       
   375     rowHeight = endRow - startRow;
       
   376     assert(rowHeight > 0);
       
   377 
       
   378     for (j=0; j < MaxVertSize; j++)
       
   379       baselines[j] = 0;
       
   380     fLineComponents[i] = new Components();
       
   381 
       
   382 
       
   383     while (currentCol<=fRLEMap->imageWidth()) {  //until we reach the end of the page
       
   384 
       
   385 	//Build component starting with closest black dot
       
   386 	intrvl = fRLEMap->FindNearHorizDot(currentCol, startRow, endRow);
       
   387 	if (intrvl == NULL) {
       
   388 	//  printf("Reached end of line\n");
       
   389 	  break;
       
   390 	}
       
   391 	comp = new Component(); //Make a new component named comp
       
   392 	assert(comp->AddToComponent(intrvl, fRLEMap, horizMerge));
       
   393 
       
   394 	//Now we want to extend upwards 
       
   395 	//First check if there is a blank space to the right 
       
   396 	tempintrvl =
       
   397 	  fRLEMap->FindNearHorizDot(comp->lr().x(), startRow, endRow);
       
   398 
       
   399 	
       
   400 	if (tempintrvl != NULL && ((RLEPair*) tempintrvl->item)->start > 
       
   401 	    comp->lr().x()+horizMerge+1)
       
   402 	  while (comp->ul().y() < endRow) {
       
   403 
       
   404 	// find the center of the component to check if we are in an equation
       
   405 	centerx = (comp->ul().x() + comp->lr().x())/2;
       
   406 	centery = (comp->ul().y() + comp->lr().y())/2;
       
   407  	inEqn = inEquation(centerx, centery);
       
   408 	// Determine projection distance.  Only project for non Equations.
       
   409 	if(inEqn)
       
   410 	  {
       
   411 	    upwardBound = comp->ul().y()+1;
       
   412 	    downwardBound = comp->lr().y() - 1;
       
   413 	  }
       
   414 	else
       
   415 	  {
       
   416 	    upwardBound = startRow;
       
   417 	    downwardBound = endRow;
       
   418 	  }
       
   419 	    intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), 
       
   420 					      comp->lr().x(), comp->lr().y(),
       
   421 					      upwardBound);
       
   422 					      //  startRow);
       
   423 	    if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap,
       
   424 							   horizMerge)))
       
   425 	      break;
       
   426 	    if (intrvl == NULL) break;
       
   427 	  }
       
   428 	else
       
   429 	  while (comp->ul().y() < endRow) {
       
   430 
       
   431        	// find the center of the component to check if we are in an equation
       
   432 	    centerx = (comp->ul().x() + comp->lr().x())/2;
       
   433 	    centery = (comp->ul().y() + comp->lr().y())/2;
       
   434 	    inEqn = inEquation(centerx, centery);
       
   435 	    // Determine projection distance.  Only project for non Equations.
       
   436 	    if(inEqn)
       
   437 	      {
       
   438 		upwardBound = comp->ul().y()+1;
       
   439 		downwardBound = comp->lr().y() - 1;
       
   440 	      }
       
   441 	    else // regular text
       
   442 	      {
       
   443 		upwardBound = startRow;
       
   444 		downwardBound = endRow;
       
   445 	      }
       
   446 
       
   447 	    intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), 
       
   448 					      comp->lr().x(), comp->ul().y(),
       
   449 					      upwardBound);
       
   450 					      // startRow);
       
   451 	    if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap, 
       
   452 							   horizMerge)));
       
   453 	    break;
       
   454 	    if (intrvl == NULL) break;
       
   455 	  }
       
   456 
       
   457 	//Now we want to extend downwards
       
   458 	while (comp->lr().y() > startRow) {
       
   459 	  intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), comp->lr().x(), 
       
   460 						    comp->lr().y(), downwardBound);
       
   461 	  if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap,
       
   462 							 horizMerge)))
       
   463 	    break;
       
   464 	  if (intrvl == NULL) break;
       
   465 	}
       
   466 
       
   467 	// Now we toss out the noise
       
   468 	int size;
       
   469 	if (comp != NULL) {
       
   470 	  if (comp->ul() < Point(0,0))
       
   471 	    printf("Here's a problem. %d, %d\n", comp->ul().x(), comp->ul().y());
       
   472 	  else
       
   473 	    size = fBitMap->pixelsInRegion(comp->ul(), comp->lr());
       
   474 	}
       
   475 	  else
       
   476 	    size = 0;
       
   477 	Component * prev = (Component *)(fLineComponents[i]->last->item);
       
   478 	if (size < MinComponentSize) {
       
   479 //	  printf("Deleting some noise of size %d\n", size);
       
   480 	  // printComponent(comp);
       
   481 	  delete comp;
       
   482 	  comp = NULL;
       
   483 	}
       
   484 	else if (prev != NULL && 
       
   485 		 abs(comp->ul().x() - prev->ul().x()) <= 1 &&
       
   486  	         abs(comp->lr().x() == prev->lr().x()) <= 1)
       
   487 	  {
       
   488 	    // Check and see if this and the previous component have the
       
   489 	    // same x boundaries, if so merge the two.  Good for = and :
       
   490 	    prev->join(comp);
       
   491 	    prev->setProperties(fBitMap);
       
   492 	    delete comp;
       
   493 	    comp ==NULL;
       
   494 	  }
       
   495 	else
       
   496 	    {
       
   497 	      compCounter++;
       
   498 	      // display a rectangle around the component
       
   499 	      if(ENABLE_USER_INTERFACE)
       
   500 		  {
       
   501 		    if(DISPLAY_BOUNDING_BOXES)
       
   502 		      comp->display_bounding_box();
       
   503 		  }
       
   504 	  
       
   505 	  // JMH - make an array of frequency of the y coord of bottom of comp
       
   506 	      int vertOffset = endRow - comp->lr().y();
       
   507 	      if(vertOffset < MaxVertSize && vertOffset >= 0)
       
   508 		baselines[vertOffset]++;
       
   509 
       
   510 	  
       
   511 	      comp->setProperties(fBitMap);
       
   512 	      if(fLineComponents[i]->last != NULL)
       
   513 		{
       
   514 		  int thisSpacing = comp->ul().x() - 
       
   515 		    ((Component *) (fLineComponents[i]->last->item))->lr().x();
       
   516 		  // if a realy big space, make space the width of this comp
       
   517 		  if (thisSpacing > 200) 
       
   518 		  thisSpacing = 2*(comp->lr().x() - comp->ul().x());
       
   519 		totalSpacing += thisSpacing;
       
   520 		}
       
   521 
       
   522 	      fLineComponents[i]->Append(comp);       // add this component to list
       
   523 	      currentCol = (comp->ul()).x() + 1;   // update position on page
       
   524 	    }
       
   525       }
       
   526     
       
   527     // find most popular bottom of comp and call it the baseline
       
   528     int counter = 0;
       
   529     int baseline;
       
   530     for (j=0; j < MaxVertSize; j++) {
       
   531       if (counter < baselines[j]) {
       
   532 	counter = baselines[j];
       
   533 	baseline = endRow - j;
       
   534       }
       
   535     }
       
   536     //    printf("For row %d to %d baseline = %d\n", startRow, endRow, baseline);
       
   537     // Now assign each character a group based on it's location
       
   538     for (ListElement* ptr = fLineComponents[i]->first; ptr != NULL; 
       
   539 	 ptr = ptr->next) {
       
   540       comp = (Component*) ptr->item;
       
   541       comp->charGroup = 0;
       
   542       
       
   543       // if top of char is higher than top - tolerance 
       
   544       if (comp->ul().y() < startRow + (rowHeight/TopLineTolerance)) {
       
   545 	comp->charGroup += 2; //tall like a T
       
   546       }
       
   547       
       
   548       // if bottom of char is lower than base - tolerance
       
   549       if (comp->lr().y() > baseline + (rowHeight/BaseLineTolerance)) {
       
   550 	comp->charGroup += 1; //has a tail like a y
       
   551       } else 
       
   552 	if (comp->lr().y() < (baseline - (2*rowHeight/BaseLineTolerance))) {
       
   553 	  comp->charGroup = 4; //floating like a '
       
   554 	  /*	  printf("bottom at %d < %d\n", comp->lr().y(),
       
   555 		  baseline - (2*rowHeight/BaseLineTolerance)); */
       
   556 	}
       
   557       //      printf("added character in group %d\n", comp->charGroup);
       
   558     }
       
   559   }
       
   560   /*  printf("Found %d components on this page.\n", compCounter); */
       
   561   //  printComponents();
       
   562   last_status = 0.0;
       
   563   if(ENABLE_USER_INTERFACE)
       
   564     set_status("Done extracting characters");
       
   565   if((compCounter - fnumLines) > 0) /* don't want divide by zero */
       
   566     {
       
   567       favgSpacing = totalSpacing / (compCounter - fnumLines);
       
   568     }
       
   569   else
       
   570     {
       
   571       favgSpacing = 1;  
       
   572     }
       
   573   delete fRLEMap;
       
   574   fRLEMap = new RLEMap;
       
   575   convertMap(fBitMap, fRLEMap);
       
   576 }
       
   577 
       
   578 void Page::printComponents()
       
   579 /*--------------------------------------------------------------
       
   580 Primary Purpose: Debugging routine that prints little bitmaps
       
   581 of low confidence characters
       
   582 ---------------------------------------------------------------*/
       
   583 {
       
   584   int compcounter = 0;
       
   585   for (int i = 0; i < fnumLines; i++) {
       
   586     Component* comp;
       
   587     for (ListElement* ptr = fLineComponents[i]->first; ptr != NULL; 
       
   588 	 ptr = ptr->next) {
       
   589       compcounter++;
       
   590       comp = (Component *) ptr->item;
       
   591       if (comp->confid() < (ConfidenceThreshold-20) && comp->asciiId() == 'n')
       
   592       {
       
   593 	printf("Here's a poorly recognized component ul=%d,%d, lr=%d,%d.\n\n", 
       
   594 	   (comp->ul()).x(), (comp->ul()).y(),
       
   595 	   (comp->lr()).x(), (comp->lr()).y());
       
   596 	printComponent(comp);
       
   597 	printf("properties: "); 
       
   598 	printVector(comp->properties(), numProperties);
       
   599 	printf("I think it's a -> %c <-   confidence: %d  line: %d  group: %d Comp#%d\n",
       
   600 	       comp->asciiId(),
       
   601 	       comp->confid(), i+1, comp->charGroup, compcounter);
       
   602 	printf("\n*******************************************************\n");
       
   603       }
       
   604     }
       
   605   }
       
   606 }
       
   607 
       
   608 void Page::printComponent(Component* comp)
       
   609 // Print a single component.
       
   610 {
       
   611   int right = comp->ul().x()+78;
       
   612   if (comp->lr().x() < right) 
       
   613     right = comp->lr().x();
       
   614 
       
   615   for (int r = comp->ul().y(); 
       
   616        r <= comp->lr().y(); r++){
       
   617     for (int c = comp->ul().x();
       
   618 	 c <= right; c++)
       
   619       bitprint(fBitMap->row(r)[c/8], c%8);
       
   620     printf( "\n");
       
   621   }
       
   622 }
       
   623 
       
   624 int spacing(ListElement * compa, ListElement * compb);
       
   625 // helper function for extractWords  (defined below)
       
   626 
       
   627 MapStatus Page::extractWords()
       
   628 /*--------------------------------------------------------------
       
   629 Primary Purpose: Extract words from each lines components
       
   630 Effects: sets the fWordsList to be a list of all of the words
       
   631 in the document.
       
   632 Constraints: extractComponents must be run first
       
   633 Rev: KM 11/7/95
       
   634 ---------------------------------------------------------------*/
       
   635 {
       
   636   bool inWord;
       
   637   ListElement * start;   // word Start
       
   638   int count;   // counts the components in the word
       
   639   int wordlength; // counts the characters in the word
       
   640   int word_count = 0;
       
   641   int spacingThreshold = (int) (1.25 * ((float) (favgSpacing)));
       
   642   fWordList = new Words;
       
   643   last_status = 0.0;
       
   644   for (int i = 0; i < fnumLines; i++)
       
   645       {
       
   646 	if(ENABLE_USER_INTERFACE)
       
   647 	  set_extract_status(i, fnumLines);
       
   648 	inWord = FALSE;
       
   649 	for(ListElement *ptr = line(i)->first; ptr != NULL; ptr = ptr->next) {
       
   650 	  Component * item = (Component *) ptr->item;
       
   651 	  if(!inWord)
       
   652 	      {
       
   653 		start = ptr;
       
   654 		count = 1;
       
   655 		if (item->fasciiId == NULL) 
       
   656 		  wordlength = 1;
       
   657 		else
       
   658 		  wordlength = strlen(item->fasciiId);
       
   659 		inWord = TRUE;
       
   660 	      }
       
   661 	  if( spacing(ptr, ptr->next) > spacingThreshold || 
       
   662 	      inEquation( ptr)) 
       
   663 	      {
       
   664 		Word * newWord = new Word(start,count,wordlength);
       
   665 		(words())->Append(newWord);
       
   666 		if(1)
       
   667 		  printf("%s ",newWord->characters);
       
   668 		inWord = FALSE;
       
   669 		word_count++;
       
   670 	      }
       
   671 	  else
       
   672 	    count++;
       
   673 	    if (item->fasciiId == NULL) wordlength ++;
       
   674 	    else wordlength += strlen(item->fasciiId);
       
   675 	}
       
   676 	// Add in a separate word for new line
       
   677 	Word * newWord = new Word("\n",2);
       
   678         (words())->Append(newWord);
       
   679 	printf("%s", newWord->characters);
       
   680 	word_count++;
       
   681       }
       
   682   last_status = 0.0;
       
   683   fWordList->num_words = word_count;
       
   684   if(ENABLE_USER_INTERFACE)
       
   685     set_status("Done extracting words");
       
   686   return VALID;
       
   687 }
       
   688 
       
   689 void Page::spellcheck()
       
   690 /*--------------------------------------------------------------
       
   691 Primary Purpose: Run spell checker on word list.
       
   692 Constraints: extractWords must be run first
       
   693 Rev: AR
       
   694 ---------------------------------------------------------------*/
       
   695 {
       
   696   int word_count = 0;
       
   697   Word* temp_word;
       
   698   for(ListElement* ptr = (words())->first; ptr != NULL; ptr = ptr->next)
       
   699     {
       
   700       word_count++;
       
   701       if(ENABLE_USER_INTERFACE)
       
   702 	set_spellcheck_status(word_count, fWordList->num_words);
       
   703       temp_word = (Word*)ptr->item;
       
   704       if(0)
       
   705 	printf("Spellchecking word %s\n", temp_word->characters);
       
   706       if(mispelled(temp_word->characters))
       
   707 	{
       
   708 	  temp_word->mispelled = TRUE;
       
   709 	}
       
   710     }
       
   711 }
       
   712 
       
   713 int Page::spacing(ListElement * compa, ListElement * compb)
       
   714 // spacing from end of comp_a to begining of comp_b
       
   715 {
       
   716   int x;
       
   717   if (compb == NULL) return 1000;  // end of line
       
   718 
       
   719   Component * a = ((Component *) (compa)->item);
       
   720   Component * b = ((Component *) (compb)->item);
       
   721   int returnval =  (b->ul().x() - a->lr().x());
       
   722   if (returnval < 0) 
       
   723     {
       
   724       return 0;
       
   725     }
       
   726   assert (returnval >= 0);
       
   727   return returnval;
       
   728 
       
   729 }
       
   730 
       
   731 
       
   732 void Page::printWords()
       
   733 // Prits out each component of each word. This can take a very long time
       
   734 {
       
   735 
       
   736   Word * thisWord;
       
   737   for (ListElement * ptr = words()->first; ptr !=NULL; ptr= ptr->next)
       
   738       {
       
   739 	thisWord = (Word *) ptr->item;
       
   740 	printf("!!!!!! NEW WORD  %s  confid : %d !!!!!\n", thisWord->characters, thisWord->confid);
       
   741 	for(int i = 0; i < thisWord->charCount; i++)
       
   742 	    {
       
   743 	      Component * comp = thisWord->character[i];
       
   744 	      if (comp == NULL) continue;
       
   745 	      printf("Printing a component ul=%d,%d, lr=%d,%d.\n\n", 
       
   746 		     (comp->ul()).x(), (comp->ul()).y(),
       
   747 		     (comp->lr()).x(), (comp->lr()).y());
       
   748 	      for (int r = comp->ul().y(); 
       
   749 		   r <= comp->lr().y(); r++){
       
   750 		for (int c = comp->ul().x();
       
   751 		     c <= comp->lr().x(); c++)
       
   752 		  bitprint(fBitMap->row(r)[c/8], c%8);
       
   753 		printf( "\n");
       
   754 	      }
       
   755 	      printf("properties: "); 
       
   756 	      printVector(comp->properties(), numProperties);
       
   757 	      printf("Identification:  %c distance: %d confidence %d\n",
       
   758 		     comp->asciiId(),
       
   759 		     comp->distance(&LearnedChars[comp->asciiId()]),
       
   760 	             comp->confid());
       
   761 	      printf("\n***********************************************\n");
       
   762 	    }
       
   763       }
       
   764 }
       
   765 
       
   766 MapStatus Page::recognize()
       
   767 /*--------------------------------------------------------------
       
   768 Primary Purpose: Recognize entire page.  Sets font and ascii id of
       
   769 each component
       
   770 Return Value: VALID if no error occurred OTHERERROR otherwise
       
   771 Constraints: extractComponents must be run first.
       
   772 See recognize(line) below for more detailed info
       
   773 Rev: KM
       
   774 ---------------------------------------------------------------*/
       
   775 {
       
   776   printf("Recognizing document\n");
       
   777   last_status = 0.0;
       
   778   for (int i = 0; i< fnumLines; i++)
       
   779       { 
       
   780 	if(ENABLE_USER_INTERFACE)
       
   781 	  set_recognize_status(i, fnumLines);
       
   782 	recognize(i);
       
   783       }
       
   784 
       
   785   last_status = 0.0;
       
   786   return VALID;
       
   787 
       
   788 }
       
   789 
       
   790 
       
   791 MapStatus Page::recognize(int linenum)
       
   792 /*--------------------------------------------------------------
       
   793 Primary Purpose: Recognize a line of connected components
       
   794 Arguments:  linenum is line number to recognize
       
   795 Effects: sets ascii identification fontid and confidence in each component
       
   796 If confidence is low and character is big enough for two characters.
       
   797 divideAndRecognize is called to split up the component.
       
   798 Constraints: extractComponents must be run first
       
   799 Rev: KM 11/9/95
       
   800 ---------------------------------------------------------------*/
       
   801 {
       
   802   Component * comp;
       
   803   Distance d;
       
   804 
       
   805   for(ListElement *ptr = line(linenum)->first; ptr != NULL; ptr = ptr->next) 
       
   806       {
       
   807 	comp = (Component *) ptr->item;
       
   808 
       
   809 	d = comp->recognize(LearnedGroups);
       
   810 	if (comp->confid() < ConfidenceThreshold && 
       
   811 	    comp->width() > 2*MinWidth) // really wide
       
   812 	  divideAndRecognize(line(linenum), ptr, d);
       
   813 
       
   814 	/***	
       
   815 	if (comp->confid() < ConfidenceThreshold || 
       
   816 	    (ptr !=  line(linenum)->first &&
       
   817 	    ((Component *) ptr->previous->item)->confid() < ConfidenceThreshold))
       
   818 	  uniteAndRecognize(line(linenum), ptr, d);
       
   819 	  ***/
       
   820 	  
       
   821       }
       
   822 
       
   823   return VALID;
       
   824 }
       
   825 
       
   826 
       
   827 
       
   828 void Page::divideAndRecognize (Components *list, ListElement * ptr, Distance d)
       
   829 /*--------------------------------------------------------------
       
   830 Primary Purpose: Identify and separate merged characters
       
   831 Arguments:ptr is a pointer to a list element containing a component
       
   832           d is the current recognition distance on the component
       
   833 Effects: Subdivides component into two parts, Division is made at
       
   834          the minimum vertical height of the component.  If the 
       
   835 	 minHeight > JoinTolerance no divison will be made.
       
   836 	 (JoinTolerance is a global var that determines
       
   837 	 the maximum number of merged pixels that are allowed in a
       
   838 	 column for a division to be made)
       
   839 	 When a division is made.  The component's boundaries are 
       
   840 	 adjusted accordingly and a new component is inserted into
       
   841 	 the list.
       
   842 
       
   843 	 Returns if distance is acceptable or width of component
       
   844 	 is <= MinWidth*2
       
   845 Rev: KM 11/24/95
       
   846 ---------------------------------------------------------------*/
       
   847 {
       
   848   Component * comp = (Component *) ptr->item;
       
   849   Component * newComp;
       
   850   bool allGroups = TRUE;
       
   851 
       
   852   // Save the original component boundaries just in case we cant improve
       
   853   Point oldlr = comp->lr();
       
   854   Point oldul = comp->ul();
       
   855   int oldwidth = (int) comp->width();
       
   856 
       
   857   // Some easy access x,y coordinates
       
   858   int ulx = comp->ul().x();
       
   859   int uly = comp->ul().y();
       
   860   int lrx = comp->lr().x();
       
   861   int lry = comp->lr().y();
       
   862 
       
   863   Distance newdist, bestdist;
       
   864   int bestlrx;
       
   865 
       
   866   if (comp->confid() > ConfidenceThreshold)
       
   867     return;
       
   868 
       
   869   if (oldwidth < MinWidth*2)  // cant be split in two
       
   870       {
       
   871 	return;
       
   872       }
       
   873 
       
   874   // Determine where to split.  Split at the thinnest point
       
   875   // within JoinTolerance (maximum number of pixels that might be fused)
       
   876 
       
   877   int minHeight = (int)comp->height();
       
   878   bestlrx = comp->lr().x();
       
   879   for(int i = MinWidth; i < oldwidth - MinWidth; i++)
       
   880       {
       
   881 
       
   882 	int newHeight = 
       
   883 	  fBitMap->pixelsInRegion(Point(ulx+i,uly), Point(ulx+i,lry));
       
   884 	if (newHeight < minHeight)
       
   885 	    {
       
   886 	      minHeight = newHeight;
       
   887 	      bestlrx = ulx+i;
       
   888 	    }
       
   889       }
       
   890 //  printf("bestlrx = %d, minHeight = %d\n", bestlrx, minHeight);
       
   891 
       
   892 
       
   893   if (bestlrx < lrx  && minHeight < JoinTolerance)
       
   894       {
       
   895 	comp->lr().x() = bestlrx;
       
   896 	int shrunk = comp->vertShrink(fBitMap);
       
   897 	comp->setProperties(fBitMap);
       
   898 	if (shrunk)  // ignore group if we had to shrink down
       
   899 	  newdist = comp->recognize(LearnedGroups, allGroups);
       
   900 	else
       
   901 	  newdist = comp->recognize(LearnedGroups);
       
   902 
       
   903 //	printf("Distance = %u  asciiid = %c \n", newdist, comp->asciiId());
       
   904 
       
   905 	Component * newcomp = new Component(Point(bestlrx+1, oldul.y())
       
   906 					    , oldlr);
       
   907 	newcomp->vertShrink(fBitMap);
       
   908 	newcomp->setProperties(fBitMap);
       
   909 	int newcompdist = newcomp->recognize(LearnedGroups,allGroups);
       
   910 
       
   911 	if ((newdist < d) && (newcomp->confid() > ConfidenceThreshold*.6))
       
   912 	  {
       
   913 	    list->insertAfter(ptr, newcomp);
       
   914 	    newcomp->display_bounding_box("red");
       
   915 	    comp->display_bounding_box("red");
       
   916 	  }
       
   917 	else
       
   918 	    {
       
   919 	      	comp->ul() = oldul;
       
   920 		comp->lr() = oldlr;
       
   921 		comp->setProperties(fBitMap);
       
   922 		comp->recognize(LearnedGroups);
       
   923 		delete newcomp;
       
   924 	    }
       
   925 	return;
       
   926       }
       
   927 
       
   928 
       
   929   return;
       
   930 
       
   931 }
       
   932 
       
   933 
       
   934 void Page::uniteAndRecognize (Components *list, ListElement * ptr, Distance d)
       
   935 /*--------------------------------------------------------------
       
   936 Primary Purpose: Identify and merge a separated character
       
   937 Arguments:ptr is a pointer to a list element containing a component
       
   938           d is the current recognition distance on the component
       
   939 Effects: Unite two components into one.
       
   940     
       
   941 Rev: 5/6/96
       
   942 ---------------------------------------------------------------*/
       
   943 {
       
   944   if (ptr->previous == NULL) return;
       
   945   Component * part1 = (Component *) ptr->previous->item;
       
   946   Component * part2 = (Component *) ptr->item;
       
   947 
       
   948 
       
   949   Point ul, lr;
       
   950   ul = part1->ul();
       
   951   lr = part2->lr();
       
   952   if (ul.y() > lr.y() || ul.x() > lr.x())
       
   953     return;
       
   954   Component * newcomp = new Component(ul, lr);
       
   955 
       
   956   newcomp->setProperties(fBitMap);
       
   957   if (part1->charGroup <= 3 && part2->charGroup <= 3)
       
   958     newcomp->charGroup = (part1->charGroup | part2->charGroup);
       
   959   else if (part1->charGroup == 4)
       
   960     newcomp->charGroup = (part2->charGroup | 2);
       
   961   else
       
   962     newcomp->charGroup = (part1->charGroup | 2);
       
   963   if (newcomp->charGroup > 4) newcomp->charGroup = 4;
       
   964 
       
   965   int newdist = newcomp->recognize(LearnedGroups);
       
   966 
       
   967   if (newdist < d && newcomp->confid() > ConfidenceThreshold) 
       
   968     { 
       
   969       list->removeAt(ptr->previous);
       
   970       list->insertAfter(ptr, newcomp); 
       
   971       list->removeAt(ptr); 
       
   972     } else delete newcomp; 
       
   973 
       
   974 return;
       
   975 
       
   976 }
       
   977 
       
   978 
       
   979 int Page::writeWordPos(char * filename)
       
   980 /*--------------------------------------------------------------
       
   981 Primary Purpose: Writes word position, confidence, length and string to file
       
   982 Arguments: output file name
       
   983 Return Value: 1 if successful. 0 if an error occured
       
   984 Effects: Calls fWordList->printWordPos
       
   985 	  // Output format for each word
       
   986 	      "%6d %6d %6d %6d %s\n", word->ul.x(), word->ul.y(),
       
   987 		          word->confid, word->charCount, word->characters 
       
   988 Rev: 11/25/95
       
   989 ---------------------------------------------------------------*/
       
   990 { return fWordList->writeWordPos(filename);};
       
   991 
       
   992 int Page::writeWordbox(char * filename, int xoffset= 0, int yoffset = 0,
       
   993 		       bool equationsOnly = FALSE)
       
   994 /*--------------------------------------------------------------
       
   995 Primary Purpose: Write out word to scanworks wordbox file
       
   996 Arguments: output file, xoffset, yoffset, equationsOnly bool if we only want
       
   997 equations.
       
   998 Return Value: 
       
   999 Effects: calls fWordList->writeWordbox
       
  1000                // output format for each word
       
  1001 	 "%s %d %d %d %d  %d %d %d % \n",
       
  1002 		word->characters,
       
  1003 		word->ul.x(), word->ul.y(),
       
  1004 		word->lr.x(), word->lr.y(),
       
  1005 		word->lr.x(), word->ul.y(),
       
  1006 		word->ul.x(), word->lr.y() );
       
  1007 	  New line between lines of text
       
  1008 Rev: 11/25/95
       
  1009 ---------------------------------------------------------------*/
       
  1010 { return fWordList->writeWordbox(filename, xoffset, yoffset, this, equationsOnly);};
       
  1011 
       
  1012 
       
  1013 int Page::writeAscii(char * filename)
       
  1014 /*--------------------------------------------------------------
       
  1015 Primary Purpose: Write word list to asii file
       
  1016 Arguments: filename to write to
       
  1017 Return Value:  1 if successful 0 if unsuccessful
       
  1018 Effects:  Calss fWordList->writeAscii(filename)
       
  1019 Writes words to fill in text format using MinLineSize
       
  1020 to differentiate lines.
       
  1021 Rev: 11/25 KM
       
  1022 ---------------------------------------------------------------*/
       
  1023 
       
  1024 {return fWordList->writeAscii(filename);};
       
  1025 
       
  1026 
       
  1027 
       
  1028 int Page::addEquation(int startline, int startcol, int endline, int endcol)
       
  1029 /*--------------------------------------------------------------
       
  1030 Primary Purpose: Add an equation to the equation list
       
  1031 Arguments: boundaries of equation
       
  1032 Effects:  Adds an element fEqnList
       
  1033 Rev: 4/21/96
       
  1034 ---------------------------------------------------------------*/
       
  1035 {
       
  1036   EqnMarker * newEqn = new EqnMarker(startline, startcol, endline, endcol);
       
  1037   fEqnList->SortedInsert(newEqn, startline);
       
  1038 }
       
  1039 
       
  1040 int Page::deleteEquation(int col, int row)
       
  1041 /*--------------------------------------------------------------
       
  1042 Primary Purpose: deletes equations with this coordinate.
       
  1043 Arguments:  coordinate of equation to remove
       
  1044 Return Value: 1 if element was remove, 0 otherwise
       
  1045 Effects: removes any equation containing this coordinate
       
  1046 Rev: 4/21/96
       
  1047 ---------------------------------------------------------------*/
       
  1048 {
       
  1049   // first determine line number.
       
  1050   int linenum;
       
  1051 
       
  1052   for (int i = 0; i < fnumLines; i++)
       
  1053     if (flineinfo[i].fstartrow <= row && flineinfo[i].fendrow >= row)
       
  1054       {
       
  1055 	linenum = i;
       
  1056 	break;
       
  1057       }
       
  1058 
       
  1059   for(ListElement *ptr = fEqnList->first; ptr != NULL; ptr = ptr->next) 
       
  1060     {
       
  1061       EqnMarker * eqn =  (EqnMarker *) ptr->item;
       
  1062       if (linenum == eqn->startline &&  linenum == eqn->endline)
       
  1063 	{
       
  1064 	  if	(col >= eqn->startcol && col <= eqn->endcol)
       
  1065 	    {
       
  1066 	    delete eqn;
       
  1067 	    setTclDeleteVars(eqn);
       
  1068 	    fEqnList->removeAt(ptr);
       
  1069 	    return 1;
       
  1070 	    }
       
  1071 	}
       
  1072       else if (linenum == eqn->startline && col >= eqn->startcol)
       
  1073        	    {
       
  1074 	    delete eqn;
       
  1075 	    setTclDeleteVars(eqn);
       
  1076 	    fEqnList->removeAt(ptr);
       
  1077 	    return 1;
       
  1078 	    }
       
  1079       else if (linenum > eqn->startline && linenum < eqn->endline)
       
  1080 	    {
       
  1081 	    delete eqn;
       
  1082 	    setTclDeleteVars(eqn);
       
  1083 	    fEqnList->removeAt(ptr);
       
  1084 	    return 1;
       
  1085 	    }
       
  1086       else if (linenum == eqn->endline && col <= eqn->endcol)
       
  1087 	    {
       
  1088 	    delete eqn;
       
  1089 	    setTclDeleteVars(eqn);
       
  1090 	    fEqnList->removeAt(ptr);
       
  1091 	    return 1;
       
  1092 	    }
       
  1093       
       
  1094   }      
       
  1095   
       
  1096   return 0;
       
  1097      
       
  1098 	  
       
  1099 }
       
  1100 
       
  1101 void Page::setTclDeleteVars(EqnMarker * eqn)
       
  1102 {
       
  1103 	  
       
  1104 if (ENABLE_USER_INTERFACE)
       
  1105   {
       
  1106     docommand("set deleted 1");
       
  1107     docommand("set curline %d",eqn->endline);
       
  1108     docommand("set curline_startrow %d",flineinfo[eqn->endline].fstartrow);
       
  1109     docommand("set curline_endrow %d",flineinfo[eqn->endline].fendrow);
       
  1110     docommand("set curx %d", eqn->endcol);
       
  1111 
       
  1112     // prevlines are actually starting lines but allowed same use of 
       
  1113     // tcl add equation code
       
  1114     docommand("set prevline %d",eqn->startline);
       
  1115     docommand("set prevline_startrow %d",flineinfo[eqn->startline].fstartrow);
       
  1116     docommand("set prevline_endrow %d",flineinfo[eqn->startline].fendrow);
       
  1117     docommand("set prevx %d", eqn->startcol);
       
  1118 
       
  1119     
       
  1120     
       
  1121     // this will change with zoning
       
  1122     docommand("set curline_startcol %d",0);
       
  1123     docommand("set curline_endcol %d",get_width());
       
  1124 	      
       
  1125 
       
  1126   }
       
  1127 
       
  1128 
       
  1129 }
       
  1130 
       
  1131 Component * Page::compAt(Point p)
       
  1132 /*--------------------------------------------------------------
       
  1133 Primary Purpose: Calls Components::compAt to return the smallest
       
  1134                   component containing point p
       
  1135 Return Value: Pointer to the component or null if no component here 
       
  1136 Effects:
       
  1137 Rev: 4/25/96
       
  1138 ---------------------------------------------------------------*/
       
  1139 {
       
  1140   Component * returnComp= NULL;
       
  1141   int linenum = get_linenum(p.x(), p.y() );
       
  1142 
       
  1143   if (linenum >= 0)
       
  1144     {
       
  1145       Components * complist = line(linenum);
       
  1146       returnComp = complist->compAt(p);
       
  1147     }
       
  1148   if (returnComp == NULL)
       
  1149     printf("No component found at ( %d, %d)\n ", p.x(), p.y());
       
  1150   else
       
  1151     printf("Component found at ( %d, %d)\n ul = (%d, %d)  lr = (%d, %d)\n "
       
  1152 	   , p.x(), p.y(),returnComp->ul().x(),returnComp->ul().y(),
       
  1153 	   returnComp->lr().x(),returnComp->lr().y());
       
  1154     
       
  1155 
       
  1156     return returnComp;
       
  1157 }
       
  1158 
       
  1159 
       
  1160 bool Page::inEquation(int col, int row)
       
  1161 /*--------------------------------------------------------------
       
  1162 Primary Purpose: determine if x,y is in an equation
       
  1163 Arguments: x,y coordinates
       
  1164 Return Value: true if in an Equation, false otherwise
       
  1165 Effects: determines if equation with these coordinated is in fEqnList
       
  1166 Rev: 11/25/95
       
  1167 ---------------------------------------------------------------*/
       
  1168 {
       
  1169   // first determine line number.
       
  1170   int linenum = get_linenum(col, row);
       
  1171 
       
  1172   
       
  1173   for(ListElement *ptr = fEqnList->first; ptr != NULL; ptr = ptr->next) 
       
  1174     {
       
  1175       EqnMarker * eqn =  (EqnMarker *) ptr->item;
       
  1176       if (linenum == eqn->startline &&  linenum == eqn->endline)
       
  1177 	{
       
  1178 	  if(col >= eqn->startcol && col <= eqn->endcol)
       
  1179 	    return true;
       
  1180 	}
       
  1181       else if (linenum == eqn->startline && col >= eqn->startcol)
       
  1182 	return true;
       
  1183       else if (linenum > eqn->startline && linenum < eqn->endline)
       
  1184 	return true;
       
  1185       else if (linenum == eqn->endline && col <= eqn->endcol)
       
  1186 	return true;
       
  1187     }
       
  1188   return false;
       
  1189 
       
  1190 }
       
  1191 
       
  1192 bool Page::inEquation(ListElement * comp)
       
  1193 /*--------------------------------------------------------------
       
  1194 Primary Purpose:  determine if the component in this list element 
       
  1195                  is in an equation
       
  1196 Arguments: A list element from a component list
       
  1197 Return Value: true if in equation, false otherwise
       
  1198 Effects:  calls inEquation(x,y) to do the real work
       
  1199 Rev: 4/21/96
       
  1200 ---------------------------------------------------------------*/
       
  1201 {
       
  1202   Component * c = (Component *) comp->item;
       
  1203   return inEquation(c->ul().x(), c->ul().y());
       
  1204 }
       
  1205 
       
  1206 
       
  1207 int Page::writeEquations(char * filename, int lineOffset)
       
  1208 /*--------------------------------------------------------------
       
  1209 Primary Purpose:  Writes boundaries of equations
       
  1210 Arguments: output file name
       
  1211 Return Value: 1 if successful 0 otherwise 
       
  1212 Effects: Outputs to filename for each equation
       
  1213 int startline, int startcol, int endline, int endcol <CR/LF>
       
  1214 Rev: 11/25/95
       
  1215 ---------------------------------------------------------------*/
       
  1216 {
       
  1217   FILE * outfile;
       
  1218   outfile = fopen(filename, "w");
       
  1219   if (outfile == NULL)
       
  1220       {
       
  1221 	printf("Error openning %s", filename);
       
  1222 	return 0;
       
  1223       }
       
  1224 
       
  1225   for(ListElement *ptr = fEqnList->first; ptr != NULL; ptr = ptr->next) 
       
  1226     {
       
  1227       EqnMarker * eqn =  (EqnMarker *) ptr->item;
       
  1228       fprintf(outfile, " %6d %6d %6d %6d\n", eqn->startline+lineOffset, 
       
  1229 	      eqn->startcol, 
       
  1230 	      eqn->endline+lineOffset, eqn->endcol);
       
  1231     }
       
  1232 fclose(outfile);
       
  1233 return 1;
       
  1234 }
       
  1235 
       
  1236 void Page::join(Component * a, Component * b)
       
  1237 {
       
  1238   if (a == b) return;
       
  1239   Component * primary;
       
  1240   Component * secondary;
       
  1241   
       
  1242   primary = (( a < b) ? a : b);
       
  1243   secondary = ((primary == a) ? b : a);
       
  1244   assert(primary != secondary);
       
  1245   assert(get_linenum(a) == get_linenum(b));
       
  1246 
       
  1247   primary->join(secondary);
       
  1248   
       
  1249   // remove secondary component from component list.
       
  1250   int linenum = get_linenum(secondary);
       
  1251   line(linenum)->removeElement(secondary);
       
  1252 
       
  1253 
       
  1254 }
       
  1255 
       
  1256 
       
  1257 
       
  1258 
       
  1259 
       
  1260 int Page::thinnestHorizontalSplit(Components * complist, 
       
  1261 				  ListElement * compptr)
       
  1262 /*--------------------------------------------------------------
       
  1263 Primary Purpose: Splits this component at thinnest point
       
  1264 Arguments: the component list that contains the compoent and
       
  1265             a pointer to its listelement
       
  1266 Return Value: 1 if split performed 0 otherwise.
       
  1267 Effects: Adds a new component to the list
       
  1268 Constraints: 
       
  1269 Rev: 4/26
       
  1270 ---------------------------------------------------------------*/
       
  1271 {
       
  1272   Component * comp = (Component *) compptr->item;
       
  1273   // Some easy access x,y coordinates
       
  1274   int ulx = comp->ul().x();
       
  1275   int uly = comp->ul().y();
       
  1276   int lrx = comp->lr().x();
       
  1277   int lry = comp->lr().y();
       
  1278 
       
  1279   int bestlrx;
       
  1280 
       
  1281   // Determine where to split.  Split at the thinnest point
       
  1282   // within JoinTolerance (maximum number of pixels that might be fused)
       
  1283 
       
  1284 
       
  1285   int minHeight = (int)comp->height();
       
  1286   int oldwidth = (int) comp->width();
       
  1287 
       
  1288   bestlrx = comp->lr().x();
       
  1289   // MinWidth is the minimum width of a learned charcter
       
  1290   for(int i = MinWidth; i < oldwidth - MinWidth; i++)
       
  1291       {
       
  1292 
       
  1293 	int newHeight = 
       
  1294 	  fBitMap->pixelsInRegion(Point(ulx+i,uly), Point(ulx+i,lry));
       
  1295 	if (newHeight < minHeight)
       
  1296 	    {
       
  1297 	      minHeight = newHeight;
       
  1298 	      bestlrx = ulx+i;
       
  1299 	    }
       
  1300       }
       
  1301 //  printf("bestlrx = %d, minHeight = %d\n", bestlrx, minHeight);
       
  1302   horizontalCompSplit(complist, compptr, bestlrx);
       
  1303 
       
  1304 
       
  1305 }
       
  1306 
       
  1307 int Page::thinnestHorizontalSplit(Component * comp)
       
  1308 {
       
  1309 
       
  1310   int i  = get_linenum(comp);
       
  1311   
       
  1312   Components * complist = fLineComponents[i];
       
  1313   ListElement * compptr;
       
  1314 
       
  1315   for (ListElement * ptr = complist->first; ptr != NULL; ptr = ptr->next)
       
  1316     {
       
  1317     if ((Component *) (ptr->item) == comp)
       
  1318       {
       
  1319 	compptr = ptr;
       
  1320 	thinnestHorizontalSplit(complist, compptr);
       
  1321 	return 1;
       
  1322       }
       
  1323     }
       
  1324   return 0;
       
  1325   
       
  1326     }
       
  1327 
       
  1328 int Page::horizontalCompSplit(Components * complist, 
       
  1329 			      ListElement * compptr, int x)
       
  1330 /*--------------------------------------------------------------
       
  1331 Primary Purpose: Split this component in the list into two components
       
  1332              at the indicated x coordinate
       
  1333 Arguments: x coordinate of splite
       
  1334 Return Value: 1 if split is performed 0 otherwise
       
  1335 Effects: Adds a new element to the list. One component is split into two
       
  1336 Constraints: fulx <= x >= flrx
       
  1337 Rev: 4/26/96
       
  1338 ---------------------------------------------------------------*/
       
  1339 
       
  1340 {
       
  1341   Component * comp = (Component *) compptr->item;
       
  1342   bool allGroups = TRUE;
       
  1343   comp->display_bounding_box("white");
       
  1344 
       
  1345   if( x < comp->ul().x() || x > comp->lr().x())
       
  1346     {
       
  1347     cout << " Cant split component " << x << "is not between" 
       
  1348 	 << comp->ul().x() << "and" << comp->lr().x() << endl;
       
  1349     return 0;
       
  1350     }
       
  1351   else
       
  1352     {
       
  1353       Component * newcomp = new Component(Point(x,comp->ul().y()),
       
  1354 					  comp->lr());
       
  1355       comp->lr().x() = x-1;
       
  1356       int compShrunk = comp->vertShrink(fBitMap);
       
  1357       comp->setProperties(fBitMap);
       
  1358       if(compShrunk)
       
  1359 	comp->recognize(LearnedGroups, allGroups);
       
  1360       else
       
  1361 	comp->recognize(LearnedGroups);
       
  1362 
       
  1363 
       
  1364       int newCompShrunk = newcomp->vertShrink(fBitMap);
       
  1365       newcomp->setProperties(fBitMap);
       
  1366 
       
  1367       if(newCompShrunk) // ignore group if shrunk
       
  1368 	newcomp->recognize(LearnedGroups, allGroups);
       
  1369       else
       
  1370 	newcomp->recognize(LearnedGroups);
       
  1371 
       
  1372       complist->insertAfter(compptr, newcomp);
       
  1373       comp->display_bounding_box("blue");
       
  1374       newcomp->display_bounding_box("blue");
       
  1375  
       
  1376      return 1;
       
  1377     }
       
  1378  
       
  1379   
       
  1380 }
       
  1381 
       
  1382 
       
  1383 ZonedPage::ZonedPage()
       
  1384   :Page(){ fzones = new Zones();}
       
  1385 
       
  1386 ZonedPage::~ZonedPage()
       
  1387 { 
       
  1388   ((Page *)this)->~Page();
       
  1389   delete fzones;
       
  1390 } 
       
  1391 
       
  1392 Zones * ZonedPage::zones()
       
  1393  { return fzones; }
       
  1394 
       
  1395 Page * ZonedPage::activate(int x, int y)
       
  1396      // activate the page at Point(x,y)
       
  1397 {
       
  1398   Zone * activeZone = zones()->findZone(x,y);
       
  1399   if (activeZone == NULL) return NULL;
       
  1400   docommand("set cur_xoffset %d", activeZone->ul().x());
       
  1401   docommand("set cur_yoffset %d", activeZone->ul().y());
       
  1402 
       
  1403   if (activeZone->page() == NULL)
       
  1404     {
       
  1405       activeZone->buildPage(this);
       
  1406     }
       
  1407   
       
  1408      return activeZone->page();
       
  1409 
       
  1410 }    
       
  1411 
       
  1412 void ZonedPage::autoZone(int horizMerge, int vertMerge)
       
  1413 { // autoZone tries to automatically zone page
       
  1414   Point curul;
       
  1415   Point curlr;
       
  1416   int changed = 1;
       
  1417   
       
  1418   if (components() != NULL)
       
  1419     delete components();
       
  1420 
       
  1421   extractComponents(horizMerge);
       
  1422   
       
  1423   while(changed)
       
  1424     {
       
  1425     changed = 0;
       
  1426   for (int i=0; i < numLines(); i++)
       
  1427     {
       
  1428     for(ListElement * mptr = line(i)->first; mptr != NULL; mptr=mptr->next)
       
  1429       {
       
  1430        	Component * mainitem = (Component *) mptr->item;
       
  1431 	for (int j= i; j < numLines(); j++)
       
  1432 	  for(ListElement * ptr = line(j)->first; ptr != NULL; ptr=ptr->next)
       
  1433 	      {
       
  1434 		
       
  1435 		Component * item = (Component *) ptr->item;
       
  1436 		if( (item->ul().y() - mainitem->lr().y()) <= vertMerge &&
       
  1437 		    (mainitem != item) &&
       
  1438 		    mainitem->xoverlap(item))
       
  1439 		    {
       
  1440 		      mainitem->join(item);
       
  1441 		      (line(j))->removeAt(ptr);
       
  1442 		      changed = 1;
       
  1443 		    }
       
  1444 	      }
       
  1445       }	
       
  1446      }
       
  1447     }
       
  1448 
       
  1449 
       
  1450   for (int i=0; i < numLines(); i++)
       
  1451     { 
       
  1452     for(ListElement * mptr = line(i)->first; mptr != NULL; mptr=mptr->next)
       
  1453       {
       
  1454        	Component * mainitem = (Component *) mptr->item;
       
  1455 	/*        printf(" (ul(%d,%d) lr(%d,%d)) ", mainitem->ul().x(),
       
  1456                 mainitem->ul().y(),  mainitem->lr().x(), mainitem->lr().y());
       
  1457 		*/
       
  1458 	/*
       
  1459 	mainitem->display_bounding_box("blue", 
       
  1460 				     ZONING_SCALE_FACTOR,
       
  1461 				     ".zoning_window.work_space");
       
  1462 				     */
       
  1463 	Point ul = Point(mainitem->ul().x() -1,mainitem->ul().y() -1);
       
  1464 	Point lr = Point( mainitem->lr().x() +1, mainitem->lr().y() +1);
       
  1465 	docommand("start_region %d %d", (int)(ul.x()*ZONING_SCALE_FACTOR),
       
  1466 		                        (int)(ul.y()*ZONING_SCALE_FACTOR));
       
  1467 
       
  1468 	docommand("end_region %d %d", (int)(lr.x()*ZONING_SCALE_FACTOR), 
       
  1469 		                      (int)(lr.y()*ZONING_SCALE_FACTOR));
       
  1470 
       
  1471 	Zone * newzone = new Zone(ul,lr);
       
  1472         zones()->Append(newzone);
       
  1473       }
       
  1474 
       
  1475     
       
  1476     }
       
  1477 }
       
  1478 
       
  1479 
       
  1480 
       
  1481 
       
  1482 
       
  1483 
       
  1484 
       
  1485 
       
  1486 
       
  1487 
       
  1488 
       
  1489 
       
  1490 
       
  1491 
       
  1492