reference/ocr-new/Page.cc
author viric@llimona
Thu, 18 May 2006 23:12:51 +0200
changeset 0 6b8091ca909a
permissions -rw-r--r--
Init from working directory of svn repository.

/** Page.cc contains the member functions for the primary OCR class Page */
#include "system.h"
#include "Page.h"
#include "convertMap.h"
#include "get_skew.h"
#include "Component.h"
#include "status_message.h"

/*** Member functions of class Page.     ***/

int Page::get_height()
/** Height of the page image, taken from the RLE map's imageLength(). */
{
  const int rows = fRLEMap->imageLength();
  return rows;
}

int Page::get_width()
/** Width of the page image, taken from the RLE map's imageWidth(). */
{
  const int cols = fRLEMap->imageWidth();
  return cols;
}

int Page::get_linenum(int col, int row)
/*--------------------------------------------------------------
Primary Purpose: Map an image coordinate to the index of the text
                 line whose row range contains it.  Only the row is
                 examined; col is currently unused.
Arguments: col - x coordinate (ignored for now)
           row - y coordinate to look up
Return value: index of the first line with fstartrow <= row <= fendrow,
              or -1 when no line covers the row.
Effects: When ENABLE_USER_INTERFACE is set and a line is found, the
         curline* variables are set through docommand().
Requires: setLines must have filled flineinfo first (asserted).
---------------------------------------------------------------*/
{
  assert (flineinfo != NULL);

  // Locate the first line whose vertical extent contains the row.
  int found = -1;
  for (int idx = 0; idx < fnumLines && found < 0; idx++)
    {
      if (flineinfo[idx].fstartrow <= row && flineinfo[idx].fendrow >= row)
	found = idx;
    }

  if (found < 0)
    return -1;

  if (ENABLE_USER_INTERFACE)
    {
      docommand("set curline %d",found);
      docommand("set curline_startrow %d",flineinfo[found].fstartrow);
      docommand("set curline_endrow %d",flineinfo[found].fendrow);

      // Column limits span the whole page; this will change with zoning.
      docommand("set curline_startcol %d",0);
      docommand("set curline_endcol %d",get_width());
    }

  return found;
}

int Page::send_words_to_tcl()
/*--------------------------------------------------------------
Primary Purpose:  Display words in tcl
Return Value: number of words sent to the display.  (The original
              fell off the end of an int function, which is
              undefined behaviour.)
Effects: Issues one "addword" command per word, tagged UNKNOWN_CHAR,
         LOW_PRECISION, MISPELLED or OK according to the word's
         confidence and spell-check flag, and calls update() after
         each word.  When ENABLE_USER_INTERFACE is set, status
         messages including the apparent word accuracy are shown.
Rev - AR
---------------------------------------------------------------*/
{
  int word_count = 0;
  int unknown_char_count = 0;
  int low_precision_count = 0;
  int mispelled_count = 0;

  if(ENABLE_USER_INTERFACE) set_status("Displaying text");
  for(ListElement* ptr = (words())->first; ptr != NULL; ptr = ptr->next)
    {
      word_count++;
      set_text_display_status(word_count, fWordList->num_words);
      Word* temp_word = (Word*)ptr->item;
      // NOTE(review): ownership of the buffer returned by backslashify
      // is not visible from here -- confirm whether it must be freed.
      char* send_chars = backslashify(temp_word->characters);

      // Pick the display tag; the checks are ordered from worst to best.
      const char* tag = "OK";
      if(temp_word->confid < VERY_LOW_CONFIDENCE)
	{
	  tag = "UNKNOWN_CHAR";
	  unknown_char_count++;
	}
      else if(temp_word->confid < LOW_CONFIDENCE)
	{
	  tag = "LOW_PRECISION";
	  low_precision_count++;
	}
      else if((temp_word->mispelled) && SPELLCHECK)
	{
	  tag = "MISPELLED";
	  mispelled_count++;
	}

      docommand("addword \"%s\" %d %d %s", send_chars,
		temp_word->ul.x(), temp_word->ul.y(), tag);
      update();
    }

  if(ENABLE_USER_INTERFACE)
    {
      set_status("Done displaying text");
      // Skip the accuracy figure for an empty word list: the ratio
      // would be 0/0.
      if (word_count > 0)
	{
	  int flagged = mispelled_count + unknown_char_count + low_precision_count;
	  set_status("Apparent word accuracy: %.3lf%%",
		     (100 - (100 * ((double)flagged / (double)word_count))));
	}
    }
  return word_count;
}


int Page::deskew(int deskew_method)
/*--------------------------------------------------------------
Primary Purpose: Straighten the page image.
Arguments: deskew_method - RLE_DESKEW rotates the RLE map directly;
           any other value measures the skew and rotates the bitmap.
Return Value: 1 if the page was rotated, 0 if it was left alone
Effects: After a rotation the other representation (bitmap or RLE
         map) is rebuilt with convertMap so both stay in step.
Constraints: The original author notes that RLE map rotation is not
         reliable and probably should not be used.
Rev: AR
---------------------------------------------------------------*/
{
  if (deskew_method != RLE_DESKEW)
    {
      // Bitmap path: rotate only when the measured skew reaches the
      // minimum angle in either direction.
      double angle = get_skew(fRLEMap);
      bool needsRotation =
	(angle >= MINIMUM_SKEW_ANGLE) || (angle <= - MINIMUM_SKEW_ANGLE);
      if (!needsRotation)
	return 0;

      fBitMap->rotateMap(angle);
      convertMap(fBitMap, fRLEMap);
      return 1;
    }

  // RLE path: the map reports whether it actually rotated itself.
  if (!fRLEMap->deskew())
    return 0;

  convertMap(fRLEMap, fBitMap);
  return 1;
}

Page::Page()
/**Page::Page - constructor allocates the bitmap, the RLE map and the
   equation list, and puts every other field used in this file into a
   known state.  flineinfo and fnumLines were previously left
   uninitialized even though the destructor reads both and
   get_linenum asserts on flineinfo. */
{
  fBitMap = new BitMap;
  fRLEMap = new RLEMap;
  fEqnList = new EqnMarkers;
  fLineComponents = NULL;
  fWordList = NULL;
  flineinfo = NULL;     // allocated by setLines
  fnumLines = 0;        // set by setLines
  favgSpacing = 1;      // same fallback extractComponents uses
}

Page::~Page()
/*--------------------------------------------------------------
Primary Purpose:  Destructor deallocates private fields that
have been created.
Notes: flineinfo and fLineComponents are allocated with new[] in
       this file, so they are released with delete[].  The per-line
       lists are only walked when the array itself exists, since
       setLines can set fnumLines before extractComponents has
       allocated fLineComponents.
---------------------------------------------------------------*/
{
  if (fLineComponents != NULL)
    {
      for (int i = 0; i < fnumLines; i++)
	delete fLineComponents[i];
      delete [] fLineComponents;
    }

  // Deleting a null pointer is a no-op, so no guards are needed here.
  delete [] flineinfo;
  delete fBitMap;
  delete fRLEMap;
  delete fWordList;
  delete fEqnList;
}

Angle Page::skewAngle()
/*--------------------------------------------------------------
Primary Purpose: Report the rotation angle detected in this page's
                 RLE map.
Arguments: none (operates on fRLEMap)
Return Value: the value returned by get_skew for fRLEMap
Notes: the detection code itself lives in get_skew.cc
Rev: AR
---------------------------------------------------------------*/
{
  Angle detected = get_skew(fRLEMap);
  return detected;
}


MapStatus Page::readMap(char * filename)
 // Loads the image through BitMap::readMap, rebuilds the RLE map from
 // the bitmap, and hands back the status reported by the read.
{
  MapStatus loadResult = fBitMap->readMap(filename);
  convertMap(fBitMap, fRLEMap);
  return loadResult;
}



MapStatus Page::setLines()
/*--------------------------------------------------------------
Primary Purpose:  Set flineinfo array in Page class with the
      starting and ending rows of each line of text.
      Also sets fnumLines to the number of lines
Arguments: none
Return Value: A MapStatus: VALID on success, EMPTY when the last row
   index is 0, or OTHERERROR when more lines are found than the
   maxrow/MinLineSize estimate allows.
Effects:  Allocates flineinfo and fills with starting and ending row
   of each line.  The following global variables are used as parameters
   in this function.
   NoiseTolerance - Rows whose number of pixels is less than this value
                 are considered empty.
   MinVertSeparation - The minimum number of rows separating lines of
                 text.  Lines are merged if the actual separation is
                 less than this value.
   MinLineSize - The minimum number of rows in a line of text.
                 Any smaller lines are discarded.
   On the EMPTY and OTHERERROR paths fnumLines is left unchanged.

Fixes relative to the earlier revision:
   * flineinfo gets one spare slot.  When maxrow < MinLineSize the
     estimate maxLines is 0 and the loop still writes flineinfo[0],
     which used to land outside a zero-length array.
   * A row continues the current line when it has at least
     NoiseTolerance pixels (>=), matching the rule above; the old test
     (>) ended the line on a row with exactly NoiseTolerance pixels
     even though such a row is not considered empty.

Constraints: Page::readMap() must be run first to fill fRLEMap
Rev: 10/26 KM
---------------------------------------------------------------*/
{

   int maxrow = fRLEMap->imageLength() - 1;      // maximum row number
   int actualSeparation = MinVertSeparation + 1; // must be bigger than min
                                                 // for line 0

   int linenum=0;                                // current line number
   int prvlinenum = 0;                           // last accepted line
   int lineSize;                                 // # rows in current line

   int maxLines = maxrow/MinLineSize;            // max # of lines of text

   if(maxrow == 0) return EMPTY;

   // +1: the scratch entry at index linenum is always written, even
   // when maxLines is 0.
   flineinfo = new LineMarker[maxLines + 1];

   for (int i = 0; i < maxrow;)
	{
	  LineMarker & thisLine = flineinfo[linenum];
	  LineMarker & prevLine = flineinfo[prvlinenum];

	  // Skip blank rows, then consume rows that carry text.
	  while (i < maxrow && fRLEMap->row(i)->numPixels < NoiseTolerance)
	    i++;
	  thisLine.fstartrow = i++;
	  while (i < maxrow && fRLEMap->row(i)->numPixels >= NoiseTolerance)
	    i++;

	  lineSize = i - thisLine.fstartrow +1;

	  // If this line is less than MinVertSeparation away
	  //  from the last line, join the two together.
	  if (linenum > 0)
	    {
	      actualSeparation = thisLine.fstartrow - prevLine.fendrow;
	    }
	  if (actualSeparation < MinVertSeparation)
	    {
	      // Too small a separation: extend the previous line instead.
	      prevLine.fendrow = i;
	    }
	  else if (lineSize >= MinLineSize)
	    {
	      thisLine.fendrow = i;
	      prvlinenum = linenum;
	      linenum++;
	    }
	  // Lines shorter than MinLineSize are dropped: the scratch entry
	  // is simply overwritten on the next pass.

	  if (linenum >= maxLines) return OTHERERROR;
	}

   fnumLines = linenum;   // Set number of lines in page class


   if((ENABLE_USER_INTERFACE) && DISPLAY_LINE_BOUNDARIES)
     {
       display_line_boundaries();
     }
   if(ENABLE_USER_INTERFACE)
     update();
   return VALID;
 }

void Page::display_line_boundaries()
/*--------------------------------------------------------------
Primary Purpose: Display line boundaries in TCL/TK.  Called from
setLines if ENABLE_USER_INTERFACE and DISPLAY_LINE_BOUNDARIES are
set to TRUE
Effects:  Draws a blue line in the gap between each pair of
consecutive text lines.  The loop stops one short of fnumLines:
the gap after the last line would need flineinfo[fnumLines], which
is not a valid line entry.
Rev:  AR
---------------------------------------------------------------*/
{
  int centerline, width;
  for(int j=0; j + 1 < fnumLines; j++)
    {
      // Middle and thickness of the gap between line j and line j+1.
      centerline = (flineinfo[j].fendrow + flineinfo[j + 1].fstartrow) / 2;
      width = flineinfo[j + 1].fstartrow - flineinfo[j].fendrow;

      scale(centerline);
      scale(width);
      /* having this pathname here is probably not such a good idea...*/

      docommand(".main_window.display.work_space create line %d %d %d %d -width %d -fill blue -tags {project_ray IMAGE_TAG} -stipple @/usr/sww/share/tclX-7.3a/tkX/3.6a/demos/bitmaps/grey.25", 0, centerline, bmap()->imageWidth(), centerline, width);
    }

}


int test_rlemap_lines(RLEMap* rmap)
// Debugging aid: prints the pixel count of every row of rmap.
// Returns the number of rows printed (the function previously had no
// return statement despite being declared int).
{
  int length = rmap->imageLength();
  for(int i = 0; i < length; i++)
    printf("On line %d, numpixels = %d\n", i, rmap->fMapData[i]->numPixels);
  return length;
}


MapStatus Page::extractComponents(int horizMerge)
/*--------------------------------------------------------------
                     Component extraction routines.
*
* Given the top and bottom line of a row we want to generate a list of
* components. The general method is to find the closest dot, trace its
* connected dots, then project upwards and downwards and add anything we
* find there to the component. Per the original author, the component
* is erased from the RLEMap as it is added to the component list. By
* projecting up and down from the piece we first find we should be able
* to completely encompass characters like :;i?|! The only problems are
* italic or ligatured characters where we may pick up two or more
* characters at a time (which would be bad) or characters fragmented
* with a vertical gap.

Primary Purpose: Main extraction routine.
Arguments: horizMerge - passed through to Component::AddToComponent
           and used when testing for blank space to the right of a
           freshly seeded component.
Return Value: VALID (the function previously had no return statement).
Effects: Makes new components and puts them in one list per text line
         (fLineComponents).  Fills in component boundaries and calls
         Component::setProperties to set the property vector.
         Assigns each component a charGroup from its position relative
         to the line top and the detected baseline, sets favgSpacing,
         and finally rebuilds the RLEMap from the bitmap.
Constraints: Page::setLines() must be run first

Fixes relative to the earlier revision:
   * a stray ';' after the AddToComponent test made the upward search
     stop after one step whenever there was no gap to the right;
   * the merge test compared right edges with '==' inside abs(), which
     is always <= 1; it now compares the difference;
   * "comp ==NULL;" was a no-op and is now an assignment;
   * the last list element is checked before its item is read;
   * size, the projection bounds and baseline always have a value;
   * the seeding AddToComponent call is no longer inside assert(), so
     it still runs when NDEBUG is defined.
Rev: 4/28/96
---------------------------------------------------------------*/
{
  int currentCol, startRow, endRow, rowHeight;
  ListElement* intrvl;
  ListElement* tempintrvl;
  Component* comp;
  int  totalSpacing = 0;  // total blank horizontal pixels between components
  int  baselines[MaxVertSize];     // array for finding the baseline
  last_status = 0.0;
  int compCounter = 0;
  int i;
  int j;
  int upwardBound;      // Projection distances different for equations
  int downwardBound;    // and non-equations

  bool inEqn;          // Variables for finding if the center of a comp
  int centerx;         // is in an equation.
  int centery;

  printf("Extracting Components\n");
  fLineComponents = new Components*[fnumLines];
  for (i = 0; i < fnumLines; i++) {
    if(ENABLE_USER_INTERFACE)
      set_component_status(i, fnumLines);
    currentCol = 0;
    startRow = flineinfo[i].fstartrow;
    endRow = flineinfo[i].fendrow;
    rowHeight = endRow - startRow;
    assert(rowHeight > 0);

    for (j=0; j < MaxVertSize; j++)
      baselines[j] = 0;
    fLineComponents[i] = new Components();

    while (currentCol<=fRLEMap->imageWidth()) {  //until we reach the end of the page

      //Build component starting with closest black dot
      intrvl = fRLEMap->FindNearHorizDot(currentCol, startRow, endRow);
      if (intrvl == NULL)
	break;                       // reached end of line

      comp = new Component();
      // Keep the call outside assert(): it has to run in every build.
      bool seeded = comp->AddToComponent(intrvl, fRLEMap, horizMerge);
      assert(seeded);
      (void) seeded;

      // Default projection range is the whole text line; the upward
      // loop narrows it when the component sits inside an equation.
      upwardBound = startRow;
      downwardBound = endRow;

      //Now we want to extend upwards.
      //First check if there is a blank space to the right: that decides
      //which row the vertical search starts from.
      tempintrvl =
	fRLEMap->FindNearHorizDot(comp->lr().x(), startRow, endRow);
      bool gapToRight =
	(tempintrvl != NULL) &&
	(((RLEPair*) tempintrvl->item)->start > comp->lr().x()+horizMerge+1);

      while (comp->ul().y() < endRow) {
	// find the center of the component to check if we are in an equation
	centerx = (comp->ul().x() + comp->lr().x())/2;
	centery = (comp->ul().y() + comp->lr().y())/2;
	inEqn = inEquation(centerx, centery);
	// Determine projection distance.  Only project for non Equations.
	if(inEqn)
	  {
	    upwardBound = comp->ul().y()+1;
	    downwardBound = comp->lr().y() - 1;
	  }
	else // regular text
	  {
	    upwardBound = startRow;
	    downwardBound = endRow;
	  }

	// With a gap to the right the search starts at the bottom edge,
	// otherwise at the top edge (as in the earlier revision).
	int searchFrom = gapToRight ? comp->lr().y() : comp->ul().y();
	intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), comp->lr().x(),
					  searchFrom, upwardBound);
	if (intrvl == NULL) break;
	if (!comp->AddToComponent(intrvl, fRLEMap, horizMerge)) break;
      }

      //Now we want to extend downwards
      while (comp->lr().y() > startRow) {
	intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), comp->lr().x(),
					  comp->lr().y(), downwardBound);
	if (intrvl == NULL) break;
	if (!comp->AddToComponent(intrvl, fRLEMap, horizMerge)) break;
      }

      // Now we toss out the noise.  A component with a negative corner
      // keeps size 0 and is therefore discarded below.
      int size = 0;
      if (comp->ul() < Point(0,0))
	printf("Here's a problem. %d, %d\n", comp->ul().x(), comp->ul().y());
      else
	size = fBitMap->pixelsInRegion(comp->ul(), comp->lr());

      // Previously accepted component on this line, if any.
      ListElement* lastElem = fLineComponents[i]->last;
      Component * prev = (lastElem != NULL) ? (Component *) lastElem->item : NULL;

      if (size < MinComponentSize) {
	delete comp;
	comp = NULL;
      }
      else if (prev != NULL &&
	       abs(comp->ul().x() - prev->ul().x()) <= 1 &&
	       abs(comp->lr().x() - prev->lr().x()) <= 1)
	{
	  // This and the previous component have the same x boundaries
	  // (within a pixel), so merge the two.  Good for = and :
	  prev->join(comp);
	  prev->setProperties(fBitMap);
	  delete comp;
	  comp = NULL;
	}
      else
	{
	  compCounter++;
	  // display a rectangle around the component
	  if(ENABLE_USER_INTERFACE)
	    {
	      if(DISPLAY_BOUNDING_BOXES)
		comp->display_bounding_box();
	    }

	  // JMH - make an array of frequency of the y coord of bottom of comp
	  int vertOffset = endRow - comp->lr().y();
	  if(vertOffset < MaxVertSize && vertOffset >= 0)
	    baselines[vertOffset]++;

	  comp->setProperties(fBitMap);
	  if(lastElem != NULL)
	    {
	      int thisSpacing = comp->ul().x() - prev->lr().x();
	      // if a really big space, make space the width of this comp
	      if (thisSpacing > 200)
		thisSpacing = 2*(comp->lr().x() - comp->ul().x());
	      totalSpacing += thisSpacing;
	    }

	  fLineComponents[i]->Append(comp);       // add this component to list
	  currentCol = (comp->ul()).x() + 1;   // update position on page
	}
    }

    // find most popular bottom of comp and call it the baseline;
    // falls back to the bottom of the line when nothing was counted
    int counter = 0;
    int baseline = endRow;
    for (j=0; j < MaxVertSize; j++) {
      if (counter < baselines[j]) {
	counter = baselines[j];
	baseline = endRow - j;
      }
    }

    // Now assign each character a group based on its location
    for (ListElement* ptr = fLineComponents[i]->first; ptr != NULL;
	 ptr = ptr->next) {
      comp = (Component*) ptr->item;
      comp->charGroup = 0;

      // if top of char is higher than top - tolerance
      if (comp->ul().y() < startRow + (rowHeight/TopLineTolerance)) {
	comp->charGroup += 2; //tall like a T
      }

      // if bottom of char is lower than base - tolerance
      if (comp->lr().y() > baseline + (rowHeight/BaseLineTolerance)) {
	comp->charGroup += 1; //has a tail like a y
      } else
	if (comp->lr().y() < (baseline - (2*rowHeight/BaseLineTolerance))) {
	  comp->charGroup = 4; //floating like a '
	}
    }
  }

  last_status = 0.0;
  if(ENABLE_USER_INTERFACE)
    set_status("Done extracting characters");
  if((compCounter - fnumLines) > 0) /* don't want divide by zero */
    {
      favgSpacing = totalSpacing / (compCounter - fnumLines);
    }
  else
    {
      favgSpacing = 1;
    }

  // Rebuild the RLE map from the bitmap.
  delete fRLEMap;
  fRLEMap = new RLEMap;
  convertMap(fBitMap, fRLEMap);
  return VALID;
}

void Page::printComponents()
/*--------------------------------------------------------------
Primary Purpose: Debugging routine that prints little bitmaps
of low confidence characters
---------------------------------------------------------------*/
{
  int compcounter = 0;
  for (int i = 0; i < fnumLines; i++) {
    Component* comp;
    for (ListElement* ptr = fLineComponents[i]->first; ptr != NULL; 
	 ptr = ptr->next) {
      compcounter++;
      comp = (Component *) ptr->item;
      if (comp->confid() < (ConfidenceThreshold-20) && comp->asciiId() == 'n')
      {
	printf("Here's a poorly recognized component ul=%d,%d, lr=%d,%d.\n\n", 
	   (comp->ul()).x(), (comp->ul()).y(),
	   (comp->lr()).x(), (comp->lr()).y());
	printComponent(comp);
	printf("properties: "); 
	printVector(comp->properties(), numProperties);
	printf("I think it's a -> %c <-   confidence: %d  line: %d  group: %d Comp#%d\n",
	       comp->asciiId(),
	       comp->confid(), i+1, comp->charGroup, compcounter);
	printf("\n*******************************************************\n");
      }
    }
  }
}

void Page::printComponent(Component* comp)
// Dumps the bitmap region of one component through bitprint, one text
// row per image row.  The output stops 78 columns after the left edge
// when the component is wider than that.
{
  // Last column to print: the right edge, capped at left edge + 78.
  int lastCol = comp->lr().x();
  const int cap = comp->ul().x()+78;
  if (cap <= lastCol)
    lastCol = cap;

  int r = comp->ul().y();
  while (r <= comp->lr().y()) {
    int c = comp->ul().x();
    while (c <= lastCol) {
      bitprint(fBitMap->row(r)[c/8], c%8);
      c++;
    }
    printf( "\n");
    r++;
  }
}

int spacing(ListElement * compa, ListElement * compb);
// helper function for extractWords  (defined below)
// NOTE(review): this declares a free function, while the definition
// further down in this file is the member Page::spacing -- confirm
// whether a free spacing() exists elsewhere or this line is a leftover.

MapStatus Page::extractWords()
/*--------------------------------------------------------------
Primary Purpose: Extract words from each line's components
Return Value: VALID
Effects: sets fWordList to be a list of all of the words in the
document and records the total in fWordList->num_words.  A word ends
when the gap to the next component exceeds 1.25 * favgSpacing (the
end of a line counts as a large gap) or when the component lies in an
equation.  A separate "\n" word is appended after every line.  Each
word is also echoed with printf.

Fix relative to the earlier revision: the character count passed to
the Word constructor now sums the length of every component in the
word.  Previously the update ran once per component regardless of the
misleading indentation under the else, so the first component was
counted twice and the last one not at all.
Constraints: extractComponents must be run first
Rev: KM 11/7/95
---------------------------------------------------------------*/
{
  bool inWord;
  ListElement * start = NULL;   // first component of the current word
  int count = 0;        // components in the current word
  int wordlength = 0;   // characters in the current word
  int word_count = 0;
  int spacingThreshold = (int) (1.25 * ((float) (favgSpacing)));
  fWordList = new Words;
  last_status = 0.0;
  for (int i = 0; i < fnumLines; i++)
      {
	if(ENABLE_USER_INTERFACE)
	  set_extract_status(i, fnumLines);
	inWord = FALSE;
	for(ListElement *ptr = line(i)->first; ptr != NULL; ptr = ptr->next) {
	  Component * item = (Component *) ptr->item;
	  if(!inWord)
	      {
		// Open a new word at this component.
		start = ptr;
		count = 0;
		wordlength = 0;
		inWord = TRUE;
	      }

	  // Account for this component exactly once.  A component with
	  // no id string still occupies one character.
	  count++;
	  if (item->fasciiId == NULL)
	    wordlength++;
	  else
	    wordlength += (int) strlen(item->fasciiId);

	  if( spacing(ptr, ptr->next) > spacingThreshold ||
	      inEquation( ptr))
	      {
		// Close the word at this component.
		Word * newWord = new Word(start,count,wordlength);
		(words())->Append(newWord);
		printf("%s ",newWord->characters);
		inWord = FALSE;
		word_count++;
	      }
	}
	// Add in a separate word for new line
	Word * newWord = new Word("\n",2);
        (words())->Append(newWord);
	printf("%s", newWord->characters);
	word_count++;
      }
  last_status = 0.0;
  fWordList->num_words = word_count;
  if(ENABLE_USER_INTERFACE)
    set_status("Done extracting words");
  return VALID;
}

void Page::spellcheck()
/*--------------------------------------------------------------
Primary Purpose: Run spell checker on word list.
Constraints: extractWords must be run first
Rev: AR
---------------------------------------------------------------*/
{
  int word_count = 0;
  Word* temp_word;
  for(ListElement* ptr = (words())->first; ptr != NULL; ptr = ptr->next)
    {
      word_count++;
      if(ENABLE_USER_INTERFACE)
	set_spellcheck_status(word_count, fWordList->num_words);
      temp_word = (Word*)ptr->item;
      if(0)
	printf("Spellchecking word %s\n", temp_word->characters);
      if(mispelled(temp_word->characters))
	{
	  temp_word->mispelled = TRUE;
	}
    }
}

int Page::spacing(ListElement * compa, ListElement * compb)
// Horizontal gap from the right edge of compa's component to the left
// edge of compb's component.  Returns 1000 when compb is NULL (end of
// line) and 0 when the two overlap.
{
  if (compb == NULL)
    return 1000;  // end of line

  Component * left = (Component *) compa->item;
  Component * right = (Component *) compb->item;

  int gap = right->ul().x() - left->lr().x();
  return (gap < 0) ? 0 : gap;
}


void Page::printWords()
// Prits out each component of each word. This can take a very long time
{

  Word * thisWord;
  for (ListElement * ptr = words()->first; ptr !=NULL; ptr= ptr->next)
      {
	thisWord = (Word *) ptr->item;
	printf("!!!!!! NEW WORD  %s  confid : %d !!!!!\n", thisWord->characters, thisWord->confid);
	for(int i = 0; i < thisWord->charCount; i++)
	    {
	      Component * comp = thisWord->character[i];
	      if (comp == NULL) continue;
	      printf("Printing a component ul=%d,%d, lr=%d,%d.\n\n", 
		     (comp->ul()).x(), (comp->ul()).y(),
		     (comp->lr()).x(), (comp->lr()).y());
	      for (int r = comp->ul().y(); 
		   r <= comp->lr().y(); r++){
		for (int c = comp->ul().x();
		     c <= comp->lr().x(); c++)
		  bitprint(fBitMap->row(r)[c/8], c%8);
		printf( "\n");
	      }
	      printf("properties: "); 
	      printVector(comp->properties(), numProperties);
	      printf("Identification:  %c distance: %d confidence %d\n",
		     comp->asciiId(),
		     comp->distance(&LearnedChars[comp->asciiId()]),
	             comp->confid());
	      printf("\n***********************************************\n");
	    }
      }
}

MapStatus Page::recognize()
/*--------------------------------------------------------------
Primary Purpose: Run recognition over the whole page by calling
recognize(line) for every text line in order.
Return Value: always VALID in this implementation
Effects: resets last_status before and after the pass; reports
progress through set_recognize_status when ENABLE_USER_INTERFACE is
set.
Constraints: extractComponents must be run first.
Rev: KM
---------------------------------------------------------------*/
{
  printf("Recognizing document\n");
  last_status = 0.0;

  int lineIdx = 0;
  while (lineIdx < fnumLines)
    {
      if(ENABLE_USER_INTERFACE)
	set_recognize_status(lineIdx, fnumLines);
      recognize(lineIdx);
      ++lineIdx;
    }

  last_status = 0.0;
  return VALID;
}


MapStatus Page::recognize(int linenum)
/*--------------------------------------------------------------
Primary Purpose: Recognize one line of connected components
Arguments:  linenum is the index of the line to recognize
Return Value: always VALID in this implementation
Effects: calls Component::recognize(LearnedGroups) on each component
of the line.  When the resulting confidence is below
ConfidenceThreshold and the component is wider than 2*MinWidth,
divideAndRecognize is called to try splitting it.
Notes: a uniteAndRecognize pass for low-confidence neighbours existed
here but was disabled by the original author; it stays disabled.
Constraints: extractComponents must be run first
Rev: KM 11/9/95
---------------------------------------------------------------*/
{
  for(ListElement *node = line(linenum)->first; node != NULL; node = node->next)
    {
      Component * current = (Component *) node->item;
      Distance dist = current->recognize(LearnedGroups);

      // Width is only examined for components we are unsure about.
      if (current->confid() < ConfidenceThreshold)
	{
	  if (current->width() > 2*MinWidth) // really wide
	    divideAndRecognize(line(linenum), node, dist);
	}
    }

  return VALID;
}



void Page::divideAndRecognize (Components *list, ListElement * ptr, Distance d)
/*--------------------------------------------------------------
Primary Purpose: Identify and separate merged characters
Arguments: list is the component list that ptr belongs to
           ptr is a pointer to a list element containing a component
           d is the current recognition distance on the component
Effects: Tries to split the component in two at its thinnest column,
         searched between MinWidth columns from either edge.  No split
         is attempted when that column holds JoinTolerance pixels or
         more (JoinTolerance is a global giving the maximum number of
         merged pixels allowed in a column for a division).
         The left part is re-recognized in place and a new component
         is built for the right part.  The split is kept -- and the
         new component inserted after ptr -- only when the left part's
         distance is below d and the right part's confidence exceeds
         0.6 * ConfidenceThreshold; otherwise the original boundaries
         are restored and the component is recognized again.

         Returns immediately if the confidence is above
         ConfidenceThreshold or the width is below MinWidth*2.
Rev: KM 11/24/95
---------------------------------------------------------------*/
{
  Component * target = (Component *) ptr->item;
  bool allGroups = TRUE;

  // Remember the original box so a failed split can be undone.
  Point savedLr = target->lr();
  Point savedUl = target->ul();
  int fullWidth = (int) target->width();

  int leftX = target->ul().x();
  int topY = target->ul().y();
  int rightX = target->lr().x();
  int bottomY = target->lr().y();

  if (target->confid() > ConfidenceThreshold)
    return;
  if (fullWidth < MinWidth*2)  // cant be split in two
    return;

  // Find the thinnest column, keeping MinWidth columns on each side.
  int thinnest = (int)target->height();
  int splitX = target->lr().x();
  const int lastCandidate = leftX + fullWidth - MinWidth;
  for (int x = leftX + MinWidth; x < lastCandidate; x++)
    {
      int columnPixels =
	fBitMap->pixelsInRegion(Point(x,topY), Point(x,bottomY));
      if (columnPixels < thinnest)
	{
	  thinnest = columnPixels;
	  splitX = x;
	}
    }

  // Nothing thinner than the full height, or the joint is too thick.
  if (!(splitX < rightX  && thinnest < JoinTolerance))
    return;

  // Left part: shrink the existing component and recognize it again.
  target->lr().x() = splitX;
  int shrunk = target->vertShrink(fBitMap);
  target->setProperties(fBitMap);
  Distance leftDist;
  if (shrunk)  // ignore group if we had to shrink down
    leftDist = target->recognize(LearnedGroups, allGroups);
  else
    leftDist = target->recognize(LearnedGroups);

  // Right part: a fresh component covering the remainder of the box.
  Component * rightPart = new Component(Point(splitX+1, savedUl.y())
					, savedLr);
  rightPart->vertShrink(fBitMap);
  rightPart->setProperties(fBitMap);
  rightPart->recognize(LearnedGroups,allGroups);

  bool keepSplit =
    (leftDist < d) && (rightPart->confid() > ConfidenceThreshold*.6);
  if (!keepSplit)
    {
      // Undo: restore the box and the recognition of the whole piece.
      target->ul() = savedUl;
      target->lr() = savedLr;
      target->setProperties(fBitMap);
      target->recognize(LearnedGroups);
      delete rightPart;
      return;
    }

  list->insertAfter(ptr, rightPart);
  rightPart->display_bounding_box("red");
  target->display_bounding_box("red");
}


void Page::uniteAndRecognize (Components *list, ListElement * ptr, Distance d)
/*--------------------------------------------------------------
Primary Purpose: Identify and merge a separated character
Arguments: list is the component list that ptr belongs to
           ptr is a pointer to a list element containing a component
           d is the current recognition distance on the component
Effects: Builds a trial component spanning from the upper-left corner
         of the previous component to the lower-right corner of this
         one, derives its charGroup from the two parts and recognizes
         it.  If its distance is below d and its confidence above
         ConfidenceThreshold, the two parts are removed from the list
         and the united component takes their place; otherwise the
         trial component is deleted.  Does nothing when ptr has no
         predecessor or the combined box would be inverted.
Rev: 5/6/96
---------------------------------------------------------------*/
{
  ListElement * before = ptr->previous;
  if (before == NULL)
    return;

  Component * leftPart = (Component *) before->item;
  Component * rightPart = (Component *) ptr->item;

  Point topLeft = leftPart->ul();
  Point bottomRight = rightPart->lr();
  if (topLeft.y() > bottomRight.y() || topLeft.x() > bottomRight.x())
    return;

  Component * united = new Component(topLeft, bottomRight);
  united->setProperties(fBitMap);

  // Combine the position groups of the two parts; 4 is the ceiling.
  if (leftPart->charGroup <= 3 && rightPart->charGroup <= 3)
    united->charGroup = (leftPart->charGroup | rightPart->charGroup);
  else if (leftPart->charGroup == 4)
    united->charGroup = (rightPart->charGroup | 2);
  else
    united->charGroup = (leftPart->charGroup | 2);
  if (united->charGroup > 4)
    united->charGroup = 4;

  int unitedDist = united->recognize(LearnedGroups);

  bool better = unitedDist < d && united->confid() > ConfidenceThreshold;
  if (!better)
    {
      delete united;
      return;
    }

  // Swap the two parts for the united component.
  list->removeAt(ptr->previous);
  list->insertAfter(ptr, united);
  list->removeAt(ptr);
}


int Page::writeWordPos(char * filename)
/*--------------------------------------------------------------
Primary Purpose: Writes word position, confidence, length and string to file
Arguments: output file name
Return Value: 1 if successful. 0 if an error occurred, including the
              case where no word list exists yet (fWordList is NULL
              until extractWords has run).
Effects: Calls fWordList->writeWordPos
	  // Output format for each word
	      "%6d %6d %6d %6d %s\n", word->ul.x(), word->ul.y(),
		          word->confid, word->charCount, word->characters
Rev: 11/25/95
---------------------------------------------------------------*/
{
  if (fWordList == NULL)
    return 0;   // nothing to write
  return fWordList->writeWordPos(filename);
}

int Page::writeWordbox(char * filename, int xoffset= 0, int yoffset = 0,
		       bool equationsOnly = FALSE)
/*--------------------------------------------------------------
Primary Purpose: Write out words to a scanworks wordbox file
Arguments: output file, xoffset, yoffset, equationsOnly bool if we only want
equations.
Return Value: 0 when no word list exists yet (fWordList is NULL until
              extractWords has run); otherwise the value returned by
              fWordList->writeWordbox.
Effects: calls fWordList->writeWordbox
               // output format for each word
	 "%s %d %d %d %d  %d %d %d % \n",
		word->characters,
		word->ul.x(), word->ul.y(),
		word->lr.x(), word->lr.y(),
		word->lr.x(), word->ul.y(),
		word->ul.x(), word->lr.y() );
	  New line between lines of text
Rev: 11/25/95
---------------------------------------------------------------*/
{
  if (fWordList == NULL)
    return 0;   // nothing to write
  return fWordList->writeWordbox(filename, xoffset, yoffset, this, equationsOnly);
}


int Page::writeAscii(char * filename)
/*--------------------------------------------------------------
Primary Purpose: Write word list to ascii file
Arguments: filename to write to
Return Value:  1 if successful, 0 if unsuccessful, including the case
               where no word list exists yet (fWordList is NULL until
               extractWords has run).
Effects:  Calls fWordList->writeAscii(filename)
Writes words to file in text format using MinLineSize
to differentiate lines.
Rev: 11/25 KM
---------------------------------------------------------------*/

{
  if (fWordList == NULL)
    return 0;   // nothing to write
  return fWordList->writeAscii(filename);
}



int Page::addEquation(int startline, int startcol, int endline, int endcol)
/*--------------------------------------------------------------
Primary Purpose: Add an equation to the equation list
Arguments: startline, startcol - line index and column where the
                                 equation begins
           endline, endcol     - line index and column where the
                                 equation ends
Return Value: always 1.  (The function was declared int but used
              to fall off the end without returning, which is
              undefined behaviour for any caller reading the
              result.)
Effects: Allocates a new EqnMarker and inserts it into fEqnList
         through SortedInsert, keyed on startline.
Rev: 4/21/96
---------------------------------------------------------------*/
{
  EqnMarker * newEqn = new EqnMarker(startline, startcol, endline, endcol);
  fEqnList->SortedInsert(newEqn, startline);
  return 1;
}

int Page::deleteEquation(int col, int row)
/*--------------------------------------------------------------
Primary Purpose: Delete the first equation that contains the given
                 image coordinate.
Arguments: col, row - pixel coordinate inside the equation to remove
Return Value: 1 if an equation was removed, 0 otherwise (including
              the case where row does not fall inside any line)
Effects: Finds the line whose [fstartrow, fendrow] span contains
         row, then walks fEqnList.  The first marker covering
         (line, col) is reported through setTclDeleteVars, unlinked
         from fEqnList and freed.
Rev: 4/21/96
---------------------------------------------------------------*/
{
  // First determine the line number.  Start from -1 so that a row which
  // lies in no line is detected instead of reading an uninitialised value.
  int linenum = -1;

  for (int i = 0; i < fnumLines; i++)
    if (flineinfo[i].fstartrow <= row && flineinfo[i].fendrow >= row)
      {
        linenum = i;
        break;
      }

  if (linenum < 0)
    return 0;

  for (ListElement *ptr = fEqnList->first; ptr != NULL; ptr = ptr->next)
    {
      EqnMarker * eqn = (EqnMarker *) ptr->item;

      bool hit;
      if (linenum == eqn->startline && linenum == eqn->endline)
        {
          // Equation confined to one line: column must be inside its span.
          hit = (col >= eqn->startcol && col <= eqn->endcol);
        }
      else
        {
          // Multi-line equation: on its first line at or after startcol,
          // strictly between its first and last line, or on its last line
          // at or before endcol.
          hit = (linenum == eqn->startline && col >= eqn->startcol)
             || (linenum > eqn->startline && linenum < eqn->endline)
             || (linenum == eqn->endline && col <= eqn->endcol);
        }

      if (hit)
        {
          // Report while the marker is still alive; it used to be deleted
          // before setTclDeleteVars read its fields.
          setTclDeleteVars(eqn);
          fEqnList->removeAt(ptr);
          delete eqn;
          return 1;
        }
    }

  return 0;
}

void Page::setTclDeleteVars(EqnMarker * eqn)
/*--------------------------------------------------------------
Primary Purpose: Publish the boundaries of an equation to the Tcl
                 side through docommand "set ..." calls.
Arguments: eqn - marker whose start/end line and column are sent;
                 it is dereferenced, so it must still be alive
Effects: Does nothing unless ENABLE_USER_INTERFACE is set.
         Otherwise sets deleted, the cur* variables (from the end
         of the equation), the prev* variables (from its start) and
         the column limits of the current line.
---------------------------------------------------------------*/
{
  if (!ENABLE_USER_INTERFACE)
    return;

  const int lastLine  = eqn->endline;
  const int firstLine = eqn->startline;

  docommand("set deleted 1");

  // "current" position = where the equation ends
  docommand("set curline %d", lastLine);
  docommand("set curline_startrow %d", flineinfo[lastLine].fstartrow);
  docommand("set curline_endrow %d", flineinfo[lastLine].fendrow);
  docommand("set curx %d", eqn->endcol);

  // "previous" position = where the equation starts; the prev* names
  // are reused so the Tcl add-equation code can be shared
  docommand("set prevline %d", firstLine);
  docommand("set prevline_startrow %d", flineinfo[firstLine].fstartrow);
  docommand("set prevline_endrow %d", flineinfo[firstLine].fendrow);
  docommand("set prevx %d", eqn->startcol);

  // full page width for now; this will change with zoning
  docommand("set curline_startcol %d",0);
  docommand("set curline_endcol %d",get_width());
}

Component * Page::compAt(Point p)
/*--------------------------------------------------------------
Primary Purpose: Look up the component under point p by asking the
                 component list of p's line (Components::compAt;
                 per the original notes that returns the smallest
                 component containing p).
Arguments: p - point to look up
Return Value: pointer to the component, or NULL if p is not on any
              line or the line's list reports nothing there
Effects: Calls get_linenum(p.x(), p.y()) and prints the outcome,
         with the bounding box when a component is found, to stdout.
Rev: 4/25/96
---------------------------------------------------------------*/
{
  Component * found = NULL;
  const int lineIdx = get_linenum(p.x(), p.y() );

  // Only a valid line has a component list to search.
  if (lineIdx >= 0)
    found = line(lineIdx)->compAt(p);

  if (found != NULL)
    {
      printf("Component found at ( %d, %d)\n ul = (%d, %d)  lr = (%d, %d)\n "
             , p.x(), p.y(), found->ul().x(), found->ul().y(),
             found->lr().x(), found->lr().y());
    }
  else
    {
      printf("No component found at ( %d, %d)\n ", p.x(), p.y());
    }

  return found;
}


bool Page::inEquation(int col, int row)
/*--------------------------------------------------------------
Primary Purpose: determine if x,y is in an equation
Arguments: x,y coordinates
Return Value: true if in an Equation, false otherwise
Effects: determines if equation with these coordinated is in fEqnList
Rev: 11/25/95
---------------------------------------------------------------*/
{
  // first determine line number.
  int linenum = get_linenum(col, row);

  
  for(ListElement *ptr = fEqnList->first; ptr != NULL; ptr = ptr->next) 
    {
      EqnMarker * eqn =  (EqnMarker *) ptr->item;
      if (linenum == eqn->startline &&  linenum == eqn->endline)
	{
	  if(col >= eqn->startcol && col <= eqn->endcol)
	    return true;
	}
      else if (linenum == eqn->startline && col >= eqn->startcol)
	return true;
      else if (linenum > eqn->startline && linenum < eqn->endline)
	return true;
      else if (linenum == eqn->endline && col <= eqn->endcol)
	return true;
    }
  return false;

}

bool Page::inEquation(ListElement * comp)
/*--------------------------------------------------------------
Primary Purpose: Tell whether the component held by a list element
                 lies inside an equation.
Arguments: comp - list element whose item is a Component
Return Value: true if the component's upper-left corner is inside
              an equation, false otherwise
Effects: Delegates to inEquation(col,row) using the component's
         upper-left corner as the test point.
Rev: 4/21/96
---------------------------------------------------------------*/
{
  Component * held = (Component *) comp->item;
  const int col = held->ul().x();
  const int row = held->ul().y();
  return inEquation(col, row);
}


int Page::writeEquations(char * filename, int lineOffset)
/*--------------------------------------------------------------
Primary Purpose:  Writes boundaries of equations
Arguments: output file name
Return Value: 1 if successful 0 otherwise 
Effects: Outputs to filename for each equation
int startline, int startcol, int endline, int endcol <CR/LF>
Rev: 11/25/95
---------------------------------------------------------------*/
{
  FILE * outfile;
  outfile = fopen(filename, "w");
  if (outfile == NULL)
      {
	printf("Error openning %s", filename);
	return 0;
      }

  for(ListElement *ptr = fEqnList->first; ptr != NULL; ptr = ptr->next) 
    {
      EqnMarker * eqn =  (EqnMarker *) ptr->item;
      fprintf(outfile, " %6d %6d %6d %6d\n", eqn->startline+lineOffset, 
	      eqn->startcol, 
	      eqn->endline+lineOffset, eqn->endcol);
    }
fclose(outfile);
return 1;
}

void Page::join(Component * a, Component * b)
/*--------------------------------------------------------------
Primary Purpose: Merge two components of the same line into one.
Arguments: a, b - the components to merge; a == b is a no-op
Effects: The component at the lower address absorbs the other via
         Component::join, and the absorbed one is then removed from
         its line's component list.
         NOTE(review): "a < b" compares pointer addresses, not
         component positions - confirm that is intended.
Constraints: both components are asserted to be on the same line.
---------------------------------------------------------------*/
{
  if (a == b)
    return;

  // Default to b absorbing a; flip when a has the lower address.
  Component * keeper   = b;
  Component * absorbed = a;
  if (a < b)
    {
      keeper   = a;
      absorbed = b;
    }

  assert(keeper != absorbed);
  assert(get_linenum(a) == get_linenum(b));

  keeper->join(absorbed);

  // drop the absorbed component from its line's component list
  line(get_linenum(absorbed))->removeElement(absorbed);
}





int Page::thinnestHorizontalSplit(Components * complist, 
				  ListElement * compptr)
/*--------------------------------------------------------------
Primary Purpose: Splits this component at its thinnest column
Arguments: complist - the component list that contains the component
           compptr  - the list element holding the component
Return Value: 1 if split performed 0 otherwise (the result of
              horizontalCompSplit; this function used to fall off
              the end without returning anything)
Effects: Scans the columns from MinWidth to width - MinWidth inside
         the component's box, picks the one with the fewest pixels
         according to fBitMap->pixelsInRegion, and calls
         horizontalCompSplit there, which adds a new component to
         the list.  If no scanned column has fewer pixels than the
         component's height, the split x stays at lr().x().
Rev: 4/26
---------------------------------------------------------------*/
{
  Component * comp = (Component *) compptr->item;
  // Some easy access x,y coordinates
  const int ulx = comp->ul().x();
  const int uly = comp->ul().y();
  const int lry = comp->lr().y();

  // Best candidate so far: start from the full height so any thinner
  // column wins, with the right edge as the fallback split position.
  int minHeight = (int) comp->height();
  const int oldwidth = (int) comp->width();
  int bestlrx = comp->lr().x();

  // MinWidth is the minimum width of a learned character (per the
  // original comment); it is kept as a margin on both sides.
  for (int i = MinWidth; i < oldwidth - MinWidth; i++)
    {
      const int newHeight =
        fBitMap->pixelsInRegion(Point(ulx+i,uly), Point(ulx+i,lry));
      if (newHeight < minHeight)
        {
          minHeight = newHeight;
          bestlrx = ulx+i;
        }
    }

  return horizontalCompSplit(complist, compptr, bestlrx);
}

int Page::thinnestHorizontalSplit(Component * comp)
/*--------------------------------------------------------------
Primary Purpose: Convenience overload: find comp in its line's
                 component list and split it at its thinnest point.
Arguments: comp - the component to split
Return Value: 1 if comp was found in the list of the line reported
              by get_linenum(comp) (the split routine is then
              called, whatever it reports), 0 if it was not found
Effects: Calls thinnestHorizontalSplit(list, element) on a match.
---------------------------------------------------------------*/
{
  Components * lineComps = fLineComponents[get_linenum(comp)];

  // walk the line's list until the element holding comp is reached
  ListElement * node = lineComps->first;
  while (node != NULL && (Component *) (node->item) != comp)
    node = node->next;

  if (node == NULL)
    return 0;

  thinnestHorizontalSplit(lineComps, node);
  return 1;
}

int Page::horizontalCompSplit(Components * complist, 
			      ListElement * compptr, int x)
/*--------------------------------------------------------------
Primary Purpose: Cut the component held by compptr into two at
                 column x.
Arguments: complist - list that owns compptr
           compptr  - list element holding the component to cut
           x        - column where the right-hand piece starts
Return Value: 1 if the split is performed, 0 if x is outside the
              component
Effects: The existing component keeps columns up to x-1; a new
         component covering x .. old lr() is inserted right after
         it in complist.  Both pieces are vertically shrunk, get
         their properties recomputed and are re-recognized against
         LearnedGroups (with the allGroups flag when the shrink
         reported a change).  The bounding box is first drawn in
         "white" and, after a successful split, both pieces are
         drawn in "blue".
Constraints: ul().x() <= x <= lr().x()
Rev: 4/26/96
---------------------------------------------------------------*/
{
  Component * leftPart = (Component *) compptr->item;
  bool allGroups = TRUE;
  leftPart->display_bounding_box("white");

  // Reject a split position outside the component's box.
  const bool outsideBox = (x < leftPart->ul().x() || x > leftPart->lr().x());
  if (outsideBox)
    {
      cout << " Cant split component " << x << "is not between" 
           << leftPart->ul().x() << "and" << leftPart->lr().x() << endl;
      return 0;
    }

  // Build the right-hand piece from the still-unmodified lower-right
  // corner, then pull the original's right edge in to x-1.
  Component * rightPart = new Component(Point(x,leftPart->ul().y()),
                                        leftPart->lr());
  leftPart->lr().x() = x-1;

  const int leftShrunk = leftPart->vertShrink(fBitMap);
  leftPart->setProperties(fBitMap);
  if (leftShrunk)
    leftPart->recognize(LearnedGroups, allGroups);
  else
    leftPart->recognize(LearnedGroups);

  const int rightShrunk = rightPart->vertShrink(fBitMap);
  rightPart->setProperties(fBitMap);
  if (rightShrunk)   // ignore group if shrunk
    rightPart->recognize(LearnedGroups, allGroups);
  else
    rightPart->recognize(LearnedGroups);

  complist->insertAfter(compptr, rightPart);
  leftPart->display_bounding_box("blue");
  rightPart->display_bounding_box("blue");

  return 1;
}


// Default constructor: builds the Page base with its default constructor
// and allocates an empty Zones collection for this page.
ZonedPage::ZonedPage()
  : Page()
{
  this->fzones = new Zones();
}

// Destructor: frees the Zones collection allocated by the constructor.
// The Page base sub-object is destroyed automatically once this body
// finishes, so ~Page() must not be invoked by hand here; doing so ran
// the base destructor twice on the same object.
ZonedPage::~ZonedPage()
{ 
  delete fzones;
} 

// Accessor: returns the Zones collection held in fzones.
Zones * ZonedPage::zones()
{
  return this->fzones;
}

Page * ZonedPage::activate(int x, int y)
/*--------------------------------------------------------------
Primary Purpose: Activate the zone found at Point(x,y) and hand
                 back its page.
Arguments: x, y - coordinate used to look the zone up
Return Value: the zone's page, or NULL when zones()->findZone
              finds no zone at (x,y)
Effects: Sets the Tcl variables cur_xoffset / cur_yoffset to the
         zone's upper-left corner and, if the zone has no page yet,
         asks it to build one from this ZonedPage.
---------------------------------------------------------------*/
{
  Zone * hitZone = zones()->findZone(x,y);
  if (!hitZone)
    return NULL;

  // tell the UI where this zone sits
  docommand("set cur_xoffset %d", hitZone->ul().x());
  docommand("set cur_yoffset %d", hitZone->ul().y());

  // the zone's page is built only on first activation
  if (hitZone->page() == NULL)
    hitZone->buildPage(this);

  return hitZone->page();
}

void ZonedPage::autoZone(int horizMerge, int vertMerge)
{ // autoZone tries to automatically zone page
  Point curul;
  Point curlr;
  int changed = 1;
  
  if (components() != NULL)
    delete components();

  extractComponents(horizMerge);
  
  while(changed)
    {
    changed = 0;
  for (int i=0; i < numLines(); i++)
    {
    for(ListElement * mptr = line(i)->first; mptr != NULL; mptr=mptr->next)
      {
       	Component * mainitem = (Component *) mptr->item;
	for (int j= i; j < numLines(); j++)
	  for(ListElement * ptr = line(j)->first; ptr != NULL; ptr=ptr->next)
	      {
		
		Component * item = (Component *) ptr->item;
		if( (item->ul().y() - mainitem->lr().y()) <= vertMerge &&
		    (mainitem != item) &&
		    mainitem->xoverlap(item))
		    {
		      mainitem->join(item);
		      (line(j))->removeAt(ptr);
		      changed = 1;
		    }
	      }
      }	
     }
    }


  for (int i=0; i < numLines(); i++)
    { 
    for(ListElement * mptr = line(i)->first; mptr != NULL; mptr=mptr->next)
      {
       	Component * mainitem = (Component *) mptr->item;
	/*        printf(" (ul(%d,%d) lr(%d,%d)) ", mainitem->ul().x(),
                mainitem->ul().y(),  mainitem->lr().x(), mainitem->lr().y());
		*/
	/*
	mainitem->display_bounding_box("blue", 
				     ZONING_SCALE_FACTOR,
				     ".zoning_window.work_space");
				     */
	Point ul = Point(mainitem->ul().x() -1,mainitem->ul().y() -1);
	Point lr = Point( mainitem->lr().x() +1, mainitem->lr().y() +1);
	docommand("start_region %d %d", (int)(ul.x()*ZONING_SCALE_FACTOR),
		                        (int)(ul.y()*ZONING_SCALE_FACTOR));

	docommand("end_region %d %d", (int)(lr.x()*ZONING_SCALE_FACTOR), 
		                      (int)(lr.y()*ZONING_SCALE_FACTOR));

	Zone * newzone = new Zone(ul,lr);
        zones()->Append(newzone);
      }

    
    }
}