reference/ocr-simple/Page.cc
changeset 0 6b8091ca909a
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/reference/ocr-simple/Page.cc	Thu May 18 23:12:51 2006 +0200
@@ -0,0 +1,873 @@
+/** Page.cc contains the member functions for the primary OCR class Page */
+#include "system.h"
+#include "Page.h"
+#include "convertMap.h"
+#include "get_skew.h"
+#include "Component.h"
+#include "status_message.h"
+
+/*** Member functions of class Page.     ***/
+
+int Page::get_height()
+/** Height of the page image: the image length reported by the RLE map. */
+{
+  const int imageRows = fRLEMap->imageLength();
+  return imageRows;
+}
+
+int Page::get_width()
+/** Width of the page image, as reported by the RLE map. */
+{
+  const int imageCols = fRLEMap->imageWidth();
+  return imageCols;
+}
+
+int Page::send_words_to_tcl()
+/*--------------------------------------------------------------
+Primary Purpose:  Display words in tcl
+Return Value: number of words sent to the display (the function
+   previously fell off the end without returning anything)
+Effects: Issues one "addword" command per entry of the word list.
+   Each word is tagged UNKNOWN_CHAR, LOW_PRECISION, MISPELLED or OK
+   depending on its confidence and spell-check flag.  When the user
+   interface is enabled the share of words tagged OK is reported as
+   the apparent word accuracy; the figure is skipped for an empty
+   word list so no division by zero takes place.
+Constraints: the word list must already exist (fWordList is
+   dereferenced inside the loop)
+Rev - AR
+---------------------------------------------------------------*/
+{
+  int word_count = 0;
+  int unknown_char_count = 0;
+  int low_precision_count = 0;
+  int mispelled_count = 0;
+  if(ENABLE_USER_INTERFACE) set_status("Displaying text");
+  for(ListElement* ptr = (words())->first; ptr != NULL; ptr = ptr->next)
+    {
+      word_count++;
+      set_text_display_status(word_count, fWordList->num_words);
+      Word* temp_word = (Word*)ptr->item;
+      // NOTE(review): ownership of the buffer returned by backslashify is
+      // not visible from this file - confirm whether it has to be released.
+      char* send_chars = backslashify(temp_word->characters);
+      if(temp_word->confid < VERY_LOW_CONFIDENCE)
+        {
+          docommand("addword \"%s\" %d %d UNKNOWN_CHAR", send_chars, temp_word->ul.x(),  temp_word->ul.y());
+          unknown_char_count++;
+        }
+      else if(temp_word->confid < LOW_CONFIDENCE)
+        {
+          docommand("addword \"%s\" %d %d LOW_PRECISION", send_chars, temp_word->ul.x(),  temp_word->ul.y());
+          low_precision_count++;
+        }
+      else if((temp_word->mispelled) && SPELLCHECK)
+        {
+          docommand("addword \"%s\" %d %d MISPELLED", send_chars, temp_word->ul.x(),  temp_word->ul.y());
+          mispelled_count++;
+        }
+      else
+        {
+          docommand("addword \"%s\" %d %d OK", send_chars, temp_word->ul.x(),  temp_word->ul.y());
+        }
+      update();
+    }
+  if(ENABLE_USER_INTERFACE)
+    {
+      set_status("Done displaying text");
+      // With no words the ratio below would be 0/0, so only report the
+      // accuracy when at least one word was displayed.
+      if(word_count > 0)
+        {
+          set_status("Apparent word accuracy: %.3lf%%", (100 - (100 * ((double)(mispelled_count + unknown_char_count + low_precision_count) / (double)word_count))));
+        }
+    }
+  return word_count;
+}
+
+
+int Page::deskew(int deskew_method)
+/*--------------------------------------------------------------
+Primary Purpose: Straighten the page image
+Arguments: deskew_method - RLE_DESKEW rotates the RLE map itself;
+           any other value measures the skew with get_skew and
+           rotates the bitmap
+Return Value: 1 if the page was rotated, 0 if it was left alone
+Effects: whichever map was rotated is converted into the other so
+         the bitmap and the RLE map stay in step
+Constraints: the header of the original routine warns that RLE
+         rotation is not reliable and probably should not be used
+Rev: AR
+---------------------------------------------------------------*/
+{
+  if(deskew_method != RLE_DESKEW)
+    {
+      // Bitmap path: rotate only when the measured skew is at least
+      // MINIMUM_SKEW_ANGLE in either direction.
+      double measured = get_skew(fRLEMap);
+      if(!((measured >= MINIMUM_SKEW_ANGLE)||(measured <= - MINIMUM_SKEW_ANGLE)))
+        return 0;
+      fBitMap->rotateMap(measured);
+      convertMap(fBitMap, fRLEMap);
+      return 1;
+    }
+
+  // RLE path: the map reports whether it actually rotated anything.
+  if(!fRLEMap->deskew())
+    return 0;
+  convertMap(fRLEMap, fBitMap);
+  return 1;
+}
+
+Page::Page()
+/**Page::Page - constructor allocates the bitmap and rlemap and puts every
+   other field into a defined "nothing extracted yet" state.
+   flineinfo and fnumLines are read by the destructor, so they must not be
+   left uninitialised when setLines() was never run.  favgSpacing gets the
+   same fallback value (1) that extractComponents() uses when it cannot
+   compute an average. */
+{
+  fBitMap = new BitMap;
+  fRLEMap = new RLEMap;
+  flineinfo = NULL;
+  fnumLines = 0;
+  favgSpacing = 1;
+  fLineComponents = NULL;
+  fWordList = NULL;
+}
+
+Page::~Page()
+/*--------------------------------------------------------------
+Primary Purpose:  Destructor deallocates private fields that
+have been created.
+Effects: flineinfo and fLineComponents are allocated with new[]
+   in setLines(), so they are released with delete[].  The
+   per-line component lists are only walked when the array of
+   lists exists.  Deleting a null pointer is a no-op, so the
+   remaining fields need no explicit test.
+Rev:
+---------------------------------------------------------------*/
+{
+  delete [] flineinfo;
+  if (fLineComponents)
+    {
+      for (int i = 0; i < fnumLines; i++)
+        delete fLineComponents[i];
+      delete [] fLineComponents;
+    }
+  delete fBitMap;
+  delete fRLEMap;
+  delete fWordList;
+}
+
+Angle Page::skewAngle()
+/*--------------------------------------------------------------
+Primary Purpose: Report the skew of this page's RLE map
+Arguments: none (operates on fRLEMap)
+Return Value: the angle returned by get_skew for fRLEMap
+The detection code itself lives in get_skew.cc
+Rev: AR
+---------------------------------------------------------------*/
+{
+  Angle detected = get_skew(fRLEMap);
+  return detected;
+}
+
+
+MapStatus Page::readMap(char * filename)
+ // Loads the bitmap through BitMap::readMap, rebuilds the RLE map from
+ // it, and hands back the status BitMap::readMap reported.  The
+ // conversion runs whatever that status is.
+{
+  MapStatus readResult = fBitMap->readMap(filename);
+  convertMap(fBitMap, fRLEMap);
+  return readResult;
+}
+
+
+
+MapStatus Page::setLines()
+/*--------------------------------------------------------------
+Primary Purpose:  Set flineinfo array in Page class with the 
+      starting and ending rows of each line of text.
+      Also sets fnumLines to the number of lines
+Arguments: none
+Return Value: A Mapstatus either VALID, EMPTY if there is no
+   data in the RLEMAP, or OTHERERROR if more lines were found than
+   the image height allows for
+Effects:  Allocates flineinfo and fills with starting and ending row
+   of each line, then allocates fLineComponents with one (NULL) slot
+   per line.  On OTHERERROR the line table is released again and
+   fnumLines is set to 0 so the page is left in a consistent state.
+   The following global variables are used as parameters
+   in this function.  These are defined in system.cc
+   NoiseTolerance - Rows whose number of pixels is less than  this value
+                will be considered empty (current val 6). 
+   MinVertSeparation - The minimum number of rows separating lines of text.
+                 Lines will be merged if actual Separation is less than this
+		 value. (current val 3)
+   MinLineSize - The minimum number of rows in a line of text.  
+                 Any smaller lines are discarded (currentval 5)
+
+Constraints: Page::readMap() must be run first to fill fRLEMap 
+Rev: 10/26 KM
+---------------------------------------------------------------*/
+{
+
+   int maxrow = fRLEMap->imageLength() - 1;      // maximum row number 
+   int actualSeparation = MinVertSeparation + 1; // must be bigger than min
+                                                 // for line 0
+
+   int linenum=0;                                // current line number
+   int prvlinenum = 0;
+   int lineSize;                                 // # rows in current line 
+
+   int maxLines = maxrow/MinLineSize;           // max # of lines of text 
+
+   // <= rather than == : an image with no rows at all gives maxrow == -1
+   if(maxrow <= 0) return EMPTY;
+
+   // One spare slot: the loop below fills flineinfo[linenum] before it
+   // tests linenum against maxLines, and maxLines is 0 for images shorter
+   // than MinLineSize.
+   flineinfo = new LineMarker[maxLines + 1]; 
+
+   for (int i = 0; i < maxrow;)
+	{
+	  LineMarker & thisLine = flineinfo[linenum];
+	  LineMarker & prevLine = flineinfo[prvlinenum];
+
+	  // skip blank (noise-only) rows, then run to the end of the text rows
+	  while (i < maxrow && fRLEMap->row(i)->numPixels < NoiseTolerance)
+	    i++;
+	  thisLine.fstartrow = i++;
+	  while (i < maxrow &&fRLEMap->row(i)->numPixels > NoiseTolerance)
+	    i++;
+	  
+
+	  lineSize = i - thisLine.fstartrow +1;
+
+	  // If this line is less than MinVertSeparation away
+	  //  from the last line.  Join the two together.
+	  if (linenum > 0)
+	    {
+	      actualSeparation = thisLine.fstartrow - prevLine.fendrow;
+	    }
+	  if (actualSeparation < MinVertSeparation)
+	    {
+	     // If too small of a separation, add into prev row
+	     prevLine.fendrow = i;
+	   }
+	  else if (lineSize >= MinLineSize)
+	    {
+	    thisLine.fendrow = i;
+	    prvlinenum = linenum;
+	    linenum++;
+
+	  }
+	  if (linenum >= maxLines)
+	    {
+	      // Do not leave a half-built table behind on failure
+	      delete [] flineinfo;
+	      flineinfo = NULL;
+	      fnumLines = 0;
+	      return OTHERERROR;
+	    }
+	}
+
+   fnumLines = linenum;   // Set number of lines in page class
+
+   // Slots start out NULL so the destructor can run safely even when
+   // extractComponents() never fills them in.
+   fLineComponents = new Components*[fnumLines];
+   for (int k = 0; k < fnumLines; k++)
+     fLineComponents[k] = NULL;
+
+   if((ENABLE_USER_INTERFACE) && DISPLAY_LINE_BOUNDARIES)
+     {
+       display_line_boundaries();
+     }
+   /*   printf("Setlines found a total of %d lines.\n", fnumLines); */
+   if(ENABLE_USER_INTERFACE) 
+     update(); 
+   return VALID;
+ }
+
+void Page::display_line_boundaries()
+/*--------------------------------------------------------------
+Primary Purpose: Display line boundaries in TCL/TK.  Called from
+setLines if ENABLE_USER_INTERFACE and DISPLAY_LINE_BOUNDARIES are
+set to TRUE
+Effects:  Draws a blue line in the gap between each pair of
+consecutive text lines.  A boundary needs a line on both sides, so
+only fnumLines - 1 boundaries are drawn; the loop used to run one
+step further and read flineinfo[fnumLines], which holds no valid
+line.
+Rev:  AR
+---------------------------------------------------------------*/
+{
+  int centerline, width;
+  for(int j=0; j + 1 < fnumLines; j++)
+    {
+      // middle and thickness of the gap between line j and line j + 1
+      centerline = (flineinfo[j].fendrow + flineinfo[j + 1].fstartrow) / 2;
+      width = flineinfo[j + 1].fstartrow - flineinfo[j].fendrow;
+
+      scale(centerline);
+      scale(width);
+      /* having this pathname here is probably not such a good idea...*/
+      
+      docommand(".main_window.display.work_space create line %d %d %d %d -width %d -fill blue -tags {project_ray IMAGE_TAG} -stipple @/usr/sww/share/tclX-7.3a/tkX/3.6a/demos/bitmaps/grey.25", 0, centerline, bmap()->imageWidth(), centerline, width);
+    }
+}
+
+
+int test_rlemap_lines(RLEMap* rmap)
+/** Debugging aid: prints the pixel count of every row of rmap.
+    Returns the number of rows printed (the function used to fall off
+    the end without returning a value). */
+{
+  int length = rmap->imageLength();
+  for(int i = 0; i < length; i++)
+    printf("On line %d, numpixels = %d\n", i, rmap->fMapData[i]->numPixels);
+  return length;
+}
+
+
+MapStatus Page::extractComponents()
+/*--------------------------------------------------------------
+                     Component extraction routines.
+*
+* Given the top and bottom line of a row we want to generate a list of
+* components. The general method is to find the closest dot, trace its 
+* connected dots, then project upwards and downwards and add anything we 
+* find there to the component. We will erase the component from the RLEMap
+* as it is added to the component list. By projecting up and down 
+* from the piece we first find we should be able
+* to completely encompass characters like :;i?|! The only problems are 
+* italic or ligatured characters where we may pick up two or more 
+* characters at a time (which would be bad) or characters fragmented 
+* with a vertical gap.
+
+Primary Purpose: Main extraction routine.
+Return Value: VALID once every line has been processed
+Effects: Makes new components and puts them in a list. Deletes components 
+         from RLE map. Fills in component boundaries and calls 
+	 Component::setProperties to set the property vector
+         Sets charGroup of every component relative to the line's
+         baseline and sets favgSpacing.
+         Lastly convertMap is run to rebuild the RLEMap
+Constraints: Page::setLines() must be run first 
+Rev: 11/2 JMH
+     11/8 KM add set properties and
+     avgSpacing;
+---------------------------------------------------------------*/
+{
+  int currentCol, startRow, endRow, rowHeight;
+  ListElement* intrvl;
+  ListElement* tempintrvl;
+  Component* comp;
+  int  totalSpacing = 0;  // total blank horizontal pixels between components
+  int  baselines[MaxVertSize];     // array for finding the baseline
+  last_status = 0.0;
+  int compCounter = 0;
+  int i;
+  int j;
+  printf("Extracting Components\n");
+  for (i = 0; i < fnumLines; i++) {
+    if(ENABLE_USER_INTERFACE)
+      set_component_status(i, fnumLines);
+    currentCol = 0;
+    startRow = flineinfo[i].fstartrow;
+    endRow = flineinfo[i].fendrow;
+    rowHeight = endRow - startRow;
+    assert(rowHeight > 0);
+
+    for (j=0; j < MaxVertSize; j++)
+      baselines[j] = 0;
+    fLineComponents[i] = new Components();
+
+
+    while (currentCol<=fRLEMap->imageWidth()) {  //until we reach the end of the page
+
+      //Build component starting with closest black dot
+      intrvl = fRLEMap->FindNearHorizDot(currentCol, startRow, endRow);
+      if (intrvl == NULL) {
+        // reached end of line
+        break;
+      }
+      comp = new Component(); //Make a new component named comp
+
+      // The call must happen outside assert(): with NDEBUG defined the
+      // assert argument is not evaluated and the seed interval would
+      // never be added.
+      int seeded = comp->AddToComponent(intrvl, fRLEMap);
+      assert(seeded);
+      (void) seeded;
+
+      //Now we want to extend upwards
+      //First check if there is a blank space to the right
+      tempintrvl = fRLEMap->FindNearHorizDot(comp->lr().x(), 
+                                             startRow, endRow);
+      if (tempintrvl != NULL && ((RLEPair*) tempintrvl->item)->start > 
+          comp->lr().x()+MinHorizSeparation+1) {
+        while (comp->ul().y() < endRow) {
+          intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), 
+                                            comp->lr().x(), comp->lr().y(),
+                                            startRow);
+          if (intrvl == NULL) break;
+          if (!comp->AddToComponent(intrvl, fRLEMap)) break;
+        }
+      } else {
+        // A stray ';' after the if used to turn the following break into
+        // an unconditional one, so this loop never ran more than once.
+        // It now keeps extending like the other two loops do.
+        while (comp->ul().y() < endRow) {
+          intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), 
+                                            comp->lr().x(), comp->ul().y(),
+                                            startRow);
+          if (intrvl == NULL) break;
+          if (!comp->AddToComponent(intrvl, fRLEMap)) break;
+        }
+      }
+
+      //Now we want to extend downwards
+      while (comp->lr().y() > startRow) {
+        intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), comp->lr().x(), 
+                                          comp->lr().y(), endRow);
+        if (intrvl == NULL) break;
+        if (!comp->AddToComponent(intrvl, fRLEMap)) break;
+      }
+
+      // Now we toss out the noise.  size starts at 0 so a component with
+      // a bad upper-left corner is discarded as noise instead of being
+      // judged on an uninitialised value.
+      int size = 0;
+      if (comp->ul() < Point(0,0))
+        printf("Here's a problem. %d, %d\n", comp->ul().x(), comp->ul().y());
+      else
+        size = fBitMap->pixelsInRegion(comp->ul(), comp->lr());
+
+      if (size < MinComponentSize) {
+        delete comp;
+        comp = NULL;
+      }
+      else
+        {
+          compCounter++;
+          // display a rectangle around the component
+          if(ENABLE_USER_INTERFACE)
+            {
+              if(DISPLAY_BOUNDING_BOXES)
+                comp->display_bounding_box();
+            }
+
+          // JMH - make an array of frequency of the y coord of bottom of comp
+          int vertOffset = endRow - comp->lr().y();
+          if(vertOffset < MaxVertSize && vertOffset >= 0)
+            baselines[vertOffset]++;
+
+          comp->setProperties(fBitMap);
+          if(fLineComponents[i]->last != NULL)
+            totalSpacing += 
+              comp->ul().x() - 
+                ((Component *) (fLineComponents[i]->last->item))->lr().x();
+
+          fLineComponents[i]->Append(comp);       // add this component to list
+          currentCol = (comp->lr()).x() + 1;   // update position on page
+        }
+    }
+    
+    // find most popular bottom of comp and call it the baseline.
+    // Defaults to the bottom row of the line when no component voted.
+    int counter = 0;
+    int baseline = endRow;
+    for (j=0; j < MaxVertSize; j++) {
+      if (counter < baselines[j]) {
+        counter = baselines[j];
+        baseline = endRow - j;
+      }
+    }
+
+    // Now assign each character a group based on it's location
+    for (ListElement* ptr = fLineComponents[i]->first; ptr != NULL; 
+         ptr = ptr->next) {
+      comp = (Component*) ptr->item;
+      comp->charGroup = 0;
+      
+      // if top of char is higher than top - tolerance 
+      if (comp->ul().y() < startRow + (rowHeight/TopLineTolerance)) {
+        comp->charGroup += 2; //tall like a T
+      }
+      
+      // if bottom of char is lower than base - tolerance
+      if (comp->lr().y() > baseline + (rowHeight/BaseLineTolerance)) {
+        comp->charGroup += 1; //has a tail like a y
+      } else 
+        if (comp->lr().y() < (baseline - (2*rowHeight/BaseLineTolerance))) {
+          comp->charGroup = 4; //floating like a '
+        }
+    }
+  }
+
+  last_status = 0.0;
+  if(ENABLE_USER_INTERFACE)
+    set_status("Done extracting characters");
+  if((compCounter - fnumLines) > 0) /* don't want divide by zero */
+    {
+      favgSpacing = totalSpacing / (compCounter - fnumLines);
+    }
+  else
+    {
+      favgSpacing = 1;  
+    }
+
+  // Extraction erased the components from the RLE map; rebuild it
+  // from the untouched bitmap.
+  delete fRLEMap;
+  fRLEMap = new RLEMap;
+  convertMap(fBitMap, fRLEMap);
+  return VALID;
+}
+
+void Page::printComponents()
+/*--------------------------------------------------------------
+Primary Purpose: Debugging routine.  Walks every component of every
+line and dumps (bitmap, property vector, identification) those whose
+confidence is below ConfidenceThreshold-20 and whose ascii id is 'n'.
+---------------------------------------------------------------*/
+{
+  int seen = 0;   // running component number across the whole page
+  for (int lineIdx = 0; lineIdx < fnumLines; lineIdx++) {
+    ListElement* node = fLineComponents[lineIdx]->first;
+    while (node != NULL) {
+      seen++;
+      Component* current = (Component *) node->item;
+      if (current->confid() < (ConfidenceThreshold-20) && current->asciiId() == 'n')
+      {
+        printf("Here's a poorly recognized component ul=%d,%d, lr=%d,%d.\n\n", 
+           (current->ul()).x(), (current->ul()).y(),
+           (current->lr()).x(), (current->lr()).y());
+        printComponent(current);
+        printf("properties: "); 
+        printVector(current->properties(), numProperties);
+        printf("I think it's a -> %c <-   confidence: %d  line: %d  group: %d Comp#%d\n",
+               current->asciiId(),
+               current->confid(), lineIdx+1, current->charGroup, seen);
+        printf("\n*******************************************************\n");
+      }
+      node = node->next;
+    }
+  }
+}
+
+void Page::printComponent(Component* comp)
+// Dumps the bitmap region covered by one component, one text row per
+// pixel row.  Output stops 78 columns right of the component's left edge.
+{
+  int lastCol = comp->lr().x();
+  int widest = comp->ul().x()+78;
+  if (lastCol >= widest)
+    lastCol = widest;
+
+  int r = comp->ul().y();
+  while (r <= comp->lr().y()) {
+    int c = comp->ul().x();
+    while (c <= lastCol) {
+      bitprint(fBitMap->row(r)[c/8], c%8);
+      c++;
+    }
+    printf( "\n");
+    r++;
+  }
+}
+
+int spacing(ListElement * compa, ListElement * compb);
+// Forward declaration of the spacing helper used by extractWords. NOTE(review): the only definition visible in this file is the member Page::spacing further down - confirm whether this free declaration is still needed.
+
+MapStatus Page::extractWords()
+/*--------------------------------------------------------------
+Primary Purpose: Group each line's components into words
+Return Value: always VALID
+Effects: allocates a fresh word list in fWordList.  A word ends
+wherever the gap to the next component exceeds 1.25 times
+favgSpacing; a separate "\n" word is appended after every line.
+Each word is also echoed to stdout.
+Constraints: extractComponents must be run first
+Rev: KM 11/7/95
+---------------------------------------------------------------*/
+{
+  bool insideWord;
+  ListElement * wordStart;   // first component of the word being built
+  int charsInWord;           // components collected for that word so far
+  int wordsFound = 0;
+  int gapLimit = (int) (1.25 * ((float) (favgSpacing)));
+  fWordList = new Words;
+  last_status = 0.0;
+  for (int lineNo = 0; lineNo < fnumLines; lineNo++)
+    {
+      if(ENABLE_USER_INTERFACE)
+        set_extract_status(lineNo, fnumLines);
+      insideWord = FALSE;
+      ListElement * cur = line(lineNo)->first;
+      while (cur != NULL)
+        {
+          if(!insideWord)
+            {
+              wordStart = cur;
+              charsInWord = 1;
+              insideWord = TRUE;
+            }
+          if( spacing(cur, cur->next) > spacingThresholdGuard(gapLimit) )
+            {
+              // gap is wide enough: close the current word
+              Word * finished = new Word(wordStart,charsInWord);
+              (words())->Append(finished);
+              if(1)
+                printf("%s ",finished->characters);
+              insideWord = FALSE;
+              wordsFound++;
+            }
+          else
+            charsInWord++;
+          cur = cur->next;
+        }
+      // Add in a separate word for new line
+      Word * lineBreak = new Word("\n",2);
+      (words())->Append(lineBreak);
+      printf("%s", lineBreak->characters);
+      wordsFound++;
+    }
+  last_status = 0.0;
+  fWordList->num_words = wordsFound;
+  if(ENABLE_USER_INTERFACE)
+    set_status("Done extracting words");
+  return VALID;
+}
+
+void Page::spellcheck()
+/*--------------------------------------------------------------
+Primary Purpose: Flag every word of the word list that the spell
+checker rejects (sets Word::mispelled to TRUE)
+Constraints: extractWords must be run first
+Rev: AR
+---------------------------------------------------------------*/
+{
+  int checked = 0;
+  ListElement* node = (words())->first;
+  while(node != NULL)
+    {
+      checked++;
+      if(ENABLE_USER_INTERFACE)
+        set_spellcheck_status(checked, fWordList->num_words);
+      Word* candidate = (Word*)node->item;
+      if(0)
+        printf("Spellchecking word %s\n", candidate->characters);
+      if(mispelled(candidate->characters))
+        candidate->mispelled = TRUE;
+      node = node->next;
+    }
+}
+
+int Page::spacing(ListElement * compa, ListElement * compb)
+// Horizontal gap from the right edge of the component in compa to the
+// left edge of the component in compb.
+// Returns 1000 when compb is NULL (compa is the last component of its
+// line) and 0 when the two components overlap horizontally.
+{
+  if (compb == NULL) return 1000;  // end of line
+
+  Component * a = ((Component *) (compa)->item);
+  Component * b = ((Component *) (compb)->item);
+  int gap = (b->ul().x() - a->lr().x());
+  if (gap < 0)
+    return 0;
+  return gap;
+}
+
+
+void Page::printWords()
+// Debug dump: for every word prints its text and confidence, then for
+// each of its components the bounding box, the bitmap, the property
+// vector and the identification.  This can take a very long time.
+{
+  ListElement * node = words()->first;
+  while (node != NULL)
+    {
+      Word * current = (Word *) node->item;
+      printf("!!!!!! NEW WORD  %s  confid : %d !!!!!\n", current->characters, current->confid);
+      for(int k = 0; k < current->charCount; k++)
+        {
+          Component * comp = current->character[k];
+          if (comp != NULL)
+            {
+              printf("Printing a component ul=%d,%d, lr=%d,%d.\n\n", 
+                     (comp->ul()).x(), (comp->ul()).y(),
+                     (comp->lr()).x(), (comp->lr()).y());
+              int r = comp->ul().y();
+              while (r <= comp->lr().y())
+                {
+                  int c = comp->ul().x();
+                  while (c <= comp->lr().x())
+                    {
+                      bitprint(fBitMap->row(r)[c/8], c%8);
+                      c++;
+                    }
+                  printf( "\n");
+                  r++;
+                }
+              printf("properties: "); 
+              printVector(comp->properties(), numProperties);
+              printf("Identification:  %c distance: %d confidence %d\n",
+                     comp->asciiId(),
+                     comp->distance(&LearnedChars[comp->asciiId()]),
+                     comp->confid());
+              printf("\n***********************************************\n");
+            }
+        }
+      node = node->next;
+    }
+}
+
+MapStatus Page::recognize()
+/*--------------------------------------------------------------
+Primary Purpose: Recognize the entire page by running
+recognize(line) on every line in turn
+Return Value: always VALID (the per-line status is not inspected)
+Constraints: extractComponents must be run first.
+See recognize(line) below for more detailed info
+Rev: KM
+---------------------------------------------------------------*/
+{
+  printf("Recognizing document\n");
+  last_status = 0.0;
+
+  int lineNo = 0;
+  while (lineNo < fnumLines)
+    {
+      if(ENABLE_USER_INTERFACE)
+        set_recognize_status(lineNo, fnumLines);
+      recognize(lineNo);
+      lineNo++;
+    }
+
+  last_status = 0.0;
+  return VALID;
+}
+
+
+MapStatus Page::recognize(int linenum)
+/*--------------------------------------------------------------
+Primary Purpose: Recognize one line of connected components
+Arguments:  linenum is line number to recognize
+Return Value: always VALID
+Effects: runs Component::recognize against LearnedGroups on every
+component of the line.  A component whose confidence is below
+ConfidenceThreshold and whose width exceeds 2*MinWidth is handed to
+divideAndRecognize, which may split it in two.
+Constraints: extractComponents must be run first
+Rev: KM 11/9/95
+---------------------------------------------------------------*/
+{
+  ListElement * node = line(linenum)->first;
+  while (node != NULL)
+    {
+      Component * current = (Component *) node->item;
+      Distance dist = current->recognize(LearnedGroups);
+
+      bool unsure = current->confid() < ConfidenceThreshold;
+      if (unsure && current->width() > 2*MinWidth) // really wide
+        divideAndRecognize(line(linenum), node, dist);
+
+      // advance only now: divideAndRecognize may have inserted a new
+      // element right after node
+      node = node->next;
+    }
+
+  return VALID;
+}
+
+
+
+void Page::divideAndRecognize (Components *list, ListElement * ptr, Distance d)
+/*--------------------------------------------------------------
+Primary Purpose: Identify and separate merged characters
+Arguments:list is the component list that contains ptr
+          ptr is a pointer to a list element containing a component
+          d is the current recognition distance on the component
+Effects: Subdivides component into two parts, Division is made at
+         the minimum vertical height of the component.  If the 
+	 minHeight >= JoinTolerance no divison will be made.
+	 (JoinTolerance is a global var that determines
+	 the maximum number of merged pixels that are allowed in a
+	 column for a division to be made)
+	 When a division is made and the left part recognizes with
+	 a smaller distance than d, the component's boundaries are 
+	 adjusted accordingly and a new component is inserted into
+	 the list.  Otherwise the original boundaries are restored
+	 and the component is recognized again.
+
+	 Returns without doing anything if the confidence is above
+	 ConfidenceThreshold or the width of the component
+	 is < MinWidth*2
+Rev: KM 11/24/95
+---------------------------------------------------------------*/
+{
+  Component * comp = (Component *) ptr->item;
+  bool allGroups = TRUE;
+
+  // Save the original component boundaries just in case we cant improve
+  Point oldlr = comp->lr();
+  Point oldul = comp->ul();
+  int oldwidth = (int) comp->width();
+
+  // Some easy access x,y coordinates
+  int ulx = comp->ul().x();
+  int uly = comp->ul().y();
+  int lrx = comp->lr().x();
+  int lry = comp->lr().y();
+
+  if (comp->confid() > ConfidenceThreshold)
+    return;
+
+  if (oldwidth < MinWidth*2)  // cant be split in two
+    return;
+
+  // Determine where to split.  Split at the thinnest point
+  // within JoinTolerance (maximum number of pixels that might be fused)
+
+  int minHeight = (int)comp->height();
+  int bestlrx = comp->lr().x();
+  for(int i = MinWidth; i < oldwidth - MinWidth; i++)
+    {
+      // number of set pixels in the single column ulx+i
+      int newHeight = 
+        fBitMap->pixelsInRegion(Point(ulx+i,uly), Point(ulx+i,lry));
+      if (newHeight < minHeight)
+        {
+          minHeight = newHeight;
+          bestlrx = ulx+i;
+        }
+    }
+
+  if (bestlrx < lrx  && minHeight < JoinTolerance)
+    {
+      // Left part: shrink the existing component to the split column
+      comp->lr().x() = bestlrx;
+      int shrunk = comp->vertShrink(fBitMap);
+      comp->setProperties(fBitMap);
+      Distance newdist;
+      if (shrunk)  // ignore group if we had to shrink down
+        newdist = comp->recognize(LearnedGroups, allGroups);
+      else
+        newdist = comp->recognize(LearnedGroups);
+
+      // Right part: everything right of the split column
+      Component * newcomp = new Component(Point(bestlrx+1, oldul.y())
+                                          , oldlr);
+      newcomp->vertShrink(fBitMap);
+      newcomp->setProperties(fBitMap);
+      // The returned distance is not used in the decision below; the call
+      // is kept for what it records in the component.
+      newcomp->recognize(LearnedGroups,allGroups);
+
+      if (newdist < d)
+        list->insertAfter(ptr, newcomp);
+      else
+        {
+          // No improvement: undo the split
+          comp->ul() = oldul;
+          comp->lr() = oldlr;
+          comp->setProperties(fBitMap);
+          comp->recognize(LearnedGroups);
+          delete newcomp;
+        }
+    }
+}
+
+
+void Page::uniteAndRecognize (Components *list, ListElement * ptr, Distance d)
+/*--------------------------------------------------------------
+Primary Purpose: Identify and merge a separated character
+Arguments:list is the component list that contains ptr
+          ptr is a pointer to a list element containing a component;
+          it is united with the element before it
+          d is the current recognition distance on the component
+Effects: Builds one component spanning from the upper left of the
+    previous component to the lower right of this one.  If it
+    recognizes with a smaller distance than d it replaces the two
+    originals in the list, otherwise it is discarded.
+    Does nothing when ptr has no predecessor or when the combined
+    box would be inverted.
+    
+Rev: JMH 12/10/95
+---------------------------------------------------------------*/
+{
+  // Nothing to unite with at the head of the list
+  if (ptr == NULL || ptr->previous == NULL)
+    return;
+
+  Component * part1 = (Component *) ptr->previous->item;
+  Component * part2 = (Component *) ptr->item;
+  Point ul, lr;
+  ul = part1->ul();
+  lr = part2->lr();
+  if (ul.y() > lr.y() || ul.x() > lr.x())
+    return;
+  Component * newcomp = new Component(part1->ul(), part2->lr());
+
+  newcomp->setProperties(fBitMap);
+  // Combine the group bits of the two parts; 4 marks a floating part
+  if (part1->charGroup <= 3 && part2->charGroup <= 3)
+    newcomp->charGroup = (part1->charGroup | part2->charGroup);
+  else if (part1->charGroup == 4)
+    newcomp->charGroup = (part2->charGroup | 2);
+  else
+    newcomp->charGroup = (part1->charGroup | 2);
+  if (newcomp->charGroup > 4) newcomp->charGroup = 4;
+
+  int newdist = newcomp->recognize(LearnedGroups);
+
+  if (newdist < d) {
+    list->removeAt(ptr->previous);
+    list->insertAfter(ptr, newcomp);
+    list->removeAt(ptr); 
+  } else
+    delete newcomp;
+}
+
+
+int Page::writeWordPos(char * filename)
+/*--------------------------------------------------------------
+Primary Purpose: Writes word position, confidence, length and string to file
+Arguments: output file name
+Return Value: 1 if successful. 0 if an error occured, which includes
+   the case that no word list exists yet (extractWords not run)
+Effects: Calls fWordList->writeWordPos
+	  // Output format for each word
+	      "%6d %6d %6d %6d %s\n", word->ul.x(), word->ul.y(),
+		          word->confid, word->charCount, word->characters 
+Rev: 11/25/95
+---------------------------------------------------------------*/
+{
+  if (fWordList == NULL) return 0;
+  return fWordList->writeWordPos(filename);
+}
+
+int Page::writeAscii(char * filename)
+/*--------------------------------------------------------------
+Primary Purpose: Write word list to ascii file
+Arguments: filename to write to
+Return Value:  1 if successful 0 if unsuccessful, which includes
+   the case that no word list exists yet (extractWords not run)
+Effects:  Calls fWordList->writeAscii(filename)
+Writes words to file in text format using MinLineSize
+to differentiate lines.
+Rev: 11/25 KM
+---------------------------------------------------------------*/
+
+{
+  if (fWordList == NULL) return 0;
+  return fWordList->writeAscii(filename);
+}
+
+
+