--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/reference/ocr-simple/Page.cc Thu May 18 23:12:51 2006 +0200
@@ -0,0 +1,873 @@
+/** Page.cc contains the member functions for the primary OCR class Page */
+#include "system.h"
+#include "Page.h"
+#include "convertMap.h"
+#include "get_skew.h"
+#include "Component.h"
+#include "status_message.h"
+
+/*** Member functions of class Page. ***/
+
+/** Height of the page image in rows, as reported by the RLE map. */
+int Page::get_height() {
+    return this->fRLEMap->imageLength();
+}
+
+/** Width of the page image, as reported by the RLE map. */
+int Page::get_width() {
+    return this->fRLEMap->imageWidth();
+}
+
+int Page::send_words_to_tcl()
+/*--------------------------------------------------------------
+Primary Purpose: Display words in tcl, tagged by recognition quality
+Return Value: 1 once every word has been sent
+Effects: reports apparent word accuracy (skipped for an empty list)
+Rev - AR
+---------------------------------------------------------------*/
+{
+    int word_count = 0;
+    int flagged_count = 0;   // unknown + low precision + mispelled words
+    if(ENABLE_USER_INTERFACE) set_status("Displaying text");
+    for(ListElement* ptr = (words())->first; ptr != NULL; ptr = ptr->next)
+    {
+        word_count++;
+        set_text_display_status(word_count, fWordList->num_words);
+        Word* temp_word = (Word*)ptr->item;
+        // NOTE(review): ownership of the buffer returned by backslashify is
+        // not visible here - confirm whether it has to be freed.
+        char* send_chars = backslashify(temp_word->characters);
+        if(temp_word->confid < VERY_LOW_CONFIDENCE)
+        {
+            docommand("addword \"%s\" %d %d UNKNOWN_CHAR", send_chars, temp_word->ul.x(), temp_word->ul.y());
+            flagged_count++;
+        }
+        else if(temp_word->confid < LOW_CONFIDENCE)
+        {
+            docommand("addword \"%s\" %d %d LOW_PRECISION", send_chars, temp_word->ul.x(), temp_word->ul.y());
+            flagged_count++;
+        }
+        else if((temp_word->mispelled) && SPELLCHECK)
+        {
+            docommand("addword \"%s\" %d %d MISPELLED", send_chars, temp_word->ul.x(), temp_word->ul.y());
+            flagged_count++;
+        }
+        else
+        {
+            docommand("addword \"%s\" %d %d OK", send_chars, temp_word->ul.x(), temp_word->ul.y());
+        }
+        update();
+    }
+    if(ENABLE_USER_INTERFACE)
+    {
+        set_status("Done displaying text");
+        if(word_count > 0)   // an empty list would divide 0 by 0
+            set_status("Apparent word accuracy: %.3lf%%", (100 - (100 * ((double)flagged_count / (double)word_count))));
+    }
+    return 1;   // the original fell off the end of a non-void function
+}
+
+
+int Page::deskew(int deskew_method)
+/*--------------------------------------------------------------
+Primary Purpose: Straighten the page image
+Arguments: deskew_method - RLE_DESKEW rotates the RLE map itself;
+  any other value measures the skew and rotates the bitmap
+Return Value: 1 if the page was rotated, 0 if it was left alone
+Effects: the map that was rotated is converted into the other one,
+  so the bitmap and the rlemap of the page are both updated
+Constraints: RLEMap Rotation is not currently reliable and probably
+should not be used
+Rev: AR
+---------------------------------------------------------------*/
+{
+    if(deskew_method != RLE_DESKEW)
+    {
+        // Bitmap path: rotate only when the measured skew reaches the
+        // minimum angle in either direction.
+        double angle = get_skew(fRLEMap);
+        bool worthRotating =
+            (angle >= MINIMUM_SKEW_ANGLE) || (angle <= - MINIMUM_SKEW_ANGLE);
+        if(!worthRotating)
+        {
+            return 0;
+        }
+        fBitMap->rotateMap(angle);
+        convertMap(fBitMap, fRLEMap);
+        return 1;
+    }
+    // RLE path: the map itself reports whether it rotated.
+    if(!fRLEMap->deskew())
+    {
+        return 0;
+    }
+    convertMap(fRLEMap, fBitMap);
+    return 1;
+}
+
+Page::Page()
+/**Page::Page - allocates bitmap and rlemap; line data starts out empty*/
+{
+    fBitMap = new BitMap;
+    fRLEMap = new RLEMap;
+    flineinfo = NULL;  fnumLines = 0;   // both are read unconditionally by ~Page
+    fLineComponents = NULL;  fWordList = NULL;
+}
+
+Page::~Page()
+/*--------------------------------------------------------------
+Primary Purpose: Destructor deallocates private fields that
+have been created; arrays made with new[] use delete[] and the
+per-line lists are only visited when their array exists.
+---------------------------------------------------------------*/
+{
+    delete [] flineinfo;              // allocated as new LineMarker[...]
+    if (fLineComponents)
+        for (int i = 0; i < fnumLines; i++)
+            delete fLineComponents[i];
+    delete [] fLineComponents;        // allocated as new Components*[...]
+    delete fBitMap;
+    delete fRLEMap;
+    delete fWordList;
+}
+
+Angle Page::skewAngle()
+/*--------------------------------------------------------------
+Primary Purpose: Report the rotation angle detected for this page
+Arguments: none - the page's own RLEMap is examined
+Return Value: the angle get_skew (see get_skew.cc) computes for fRLEMap
+Rev: AR
+---------------------------------------------------------------*/
+{
+    Angle detected = get_skew(fRLEMap);
+    return detected;
+}
+
+
+MapStatus Page::readMap(char * filename)
+ // Loads the bitmap from filename via BitMap::readMap, then rebuilds the
+ // RLE map from it (whatever the read reported) and returns the read status
+{
+    const MapStatus readResult = fBitMap->readMap(filename);
+    convertMap(fBitMap, fRLEMap);
+    return readResult;
+}
+
+
+
+MapStatus Page::setLines()
+/*--------------------------------------------------------------
+Primary Purpose: Set flineinfo array in Page class with the
+ starting and ending rows of each line of text.
+ Also sets fnumLines to the number of lines
+Arguments: none
+Return Value: A Mapstatus either VALID, EMPTY if the RLEMAP is too short
+ to hold one line of text, or OTHERERROR if the line table fills up
+Effects: Allocates flineinfo and fills with starting and ending row
+ of each line. The following global variables are used as parameters
+ in this function. These are defined in system.cc
+ NoiseTolerance - Rows whose number of pixels is less than this value
+ will be considered empty (current val 6).
+ MinVertSeparation - The minimum number of rows separating lines of text.
+ Lines will be merged if actual Separation is less than this
+ value. (current val 3)
+ MinLineSize - The minimum number of rows in a line of text.
+ Any smaller lines are discarded (currentval 5)
+ Also allocates fLineComponents with every entry set to NULL
+Constraints: Page::readMap() must be run first to fill fRLEMap
+Rev: 10/26 KM
+---------------------------------------------------------------*/
+{
+
+    int maxrow = fRLEMap->imageLength() - 1; // maximum row number
+    int actualSeparation = MinVertSeparation + 1; // must be bigger than min
+    // for line 0
+
+    int linenum=0; // current line number
+    int prvlinenum = 0;
+    int lineSize; // # rows in current line
+
+    int maxLines = maxrow/MinLineSize; // max # of lines of text
+
+    if(maxrow <= 0 || maxLines <= 0) return EMPTY; // no room for even one line
+
+    flineinfo = new LineMarker[maxLines];
+
+    for (int i = 0; i < maxrow;)
+    {
+        LineMarker & thisLine = flineinfo[linenum];
+        LineMarker & prevLine = flineinfo[prvlinenum];
+
+        while (i < maxrow && fRLEMap->row(i)->numPixels < NoiseTolerance)
+            i++;
+        thisLine.fstartrow = i++;
+        while (i < maxrow &&fRLEMap->row(i)->numPixels > NoiseTolerance)
+            i++;
+        // NOTE(review): a row with exactly NoiseTolerance pixels can start a line above but ends one here - confirm
+
+        lineSize = i - thisLine.fstartrow +1;
+
+        // If this line is less than MinVertSeparation away
+        // from the last line. Join the two together.
+        if (linenum > 0)
+        {
+            actualSeparation = thisLine.fstartrow - prevLine.fendrow;
+        }
+        if (actualSeparation < MinVertSeparation)
+        {
+            // If too small of a separation, add into prev row
+            prevLine.fendrow = i;
+        }
+        else if (lineSize >= MinLineSize)
+        {
+            thisLine.fendrow = i;
+/* printf (" Line %d Start: %d End: %d lineHeight %d\n",
+ linenum,thisLine.fstartrow,
+ thisLine.fendrow,
+ thisLine.fendrow - thisLine.fstartrow +1);
+*/
+            prvlinenum = linenum;
+            linenum++;
+
+        }
+        if (linenum >= maxLines) return OTHERERROR;
+    }
+
+    fnumLines = linenum; // Set number of lines in page class
+
+    fLineComponents = new Components*[fnumLines](); // () zeroes every entry so ~Page can delete them safely
+    if((ENABLE_USER_INTERFACE) && DISPLAY_LINE_BOUNDARIES)
+    {
+        display_line_boundaries();
+    }
+    /* printf("Setlines found a total of %d lines.\n", fnumLines); */
+    if(ENABLE_USER_INTERFACE)
+        update();
+    return VALID;
+}
+
+void Page::display_line_boundaries()
+/*--------------------------------------------------------------
+Primary Purpose: Display line boundaries in TCL/TK. Called from setLines
+if ENABLE_USER_INTERFACE and DISPLAY_LINE_BOUNDARIES are set to TRUE
+Effects: Draws a blue line in each gap between two consecutive lines of
+text; the last line has no successor, so nothing is drawn below it
+Rev: AR
+---------------------------------------------------------------*/
+{
+    int centerline, width;
+    for(int j=0; j + 1 < fnumLines; j++)   // stop before the last line: flineinfo[j + 1] must exist
+    {
+        centerline = (flineinfo[j].fendrow + flineinfo[j + 1].fstartrow) / 2;
+        width = flineinfo[j + 1].fstartrow - flineinfo[j].fendrow;
+
+        scale(centerline);
+        scale(width);
+        /* having this pathname here is probably not such a good idea...*/
+
+        docommand(".main_window.display.work_space create line %d %d %d %d -width %d -fill blue -tags {project_ray IMAGE_TAG} -stipple @/usr/sww/share/tclX-7.3a/tkX/3.6a/demos/bitmaps/grey.25", 0, centerline, bmap()->imageWidth(), centerline, width);
+    }
+}
+
+
+int test_rlemap_lines(RLEMap* rmap)
+// Debug aid: prints each row's pixel count and returns the number of rows
+{
+    int length = rmap->imageLength();
+    for(int i = 0; i < length; i++)
+        printf("On line %d, numpixels = %d\n", i, rmap->fMapData[i]->numPixels);
+    return length;   // the original fell off the end of a non-void function
+}
+MapStatus Page::extractComponents()
+/*--------------------------------------------------------------
+ Component extraction routines.
+*
+* Given the top and bottom line of a row we want to generate a list of
+* components. The general method is to find the closest dot, trace its
+* connected dots, then project upwards and downwards and add anything we
+* find there to the component. We will erase the component from the RLEMap
+* as it is added to the component list. By projecting up and down
+* from the piece we first find we should be able
+* to completely encompass characters like :;i?|! The only problems are
+* italic or ligatured characters where we may pick up two or more
+* characters at a time (which would be bad) or characters fragmented
+* with a vertical gap.
+
+Primary Purpose: Main extraction routine.
+Return Value: VALID (this routine has no failure path of its own)
+Effects: Makes new components and puts them in a list. Deletes components
+ from RLE map. Fills in component boundaries and calls
+ Component::setProperties to set the property vector
+ Lastly convertMap is run to rebuild the RLEMap
+Constraints: Page::setLines() must be run first
+Rev: 11/2 JMH
+ 11/8 KM add set properties and
+ avgSpacing;
+---------------------------------------------------------------*/
+{
+    int currentCol, startRow, endRow, rowHeight;
+    ListElement* intrvl;
+    ListElement* tempintrvl;
+    /* printf("fnumLines = %d\n", fnumLines); */
+    Component* comp;
+    int totalSpacing = 0; // total blank horizontal pixels between components
+    int baselines[MaxVertSize]; // array for finding the baseline
+    last_status = 0.0;
+    int compCounter = 0;
+    int i;
+    int j;
+    printf("Extracting Components\n");
+    for (i = 0; i < fnumLines; i++) {
+        if(ENABLE_USER_INTERFACE)
+            set_component_status(i, fnumLines);
+        currentCol = 0;
+        startRow = flineinfo[i].fstartrow;
+        endRow = flineinfo[i].fendrow;
+        rowHeight = endRow - startRow;
+        assert(rowHeight > 0);
+
+        for (j=0; j < MaxVertSize; j++)
+            baselines[j] = 0;
+        fLineComponents[i] = new Components();
+
+
+        while (currentCol<=fRLEMap->imageWidth()) { //until we reach the end of the page
+
+            //Build component starting with closest black dot
+            intrvl = fRLEMap->FindNearHorizDot(currentCol, startRow, endRow);
+            if (intrvl == NULL) {
+                // printf("Reached end of line\n");
+                break;
+            }
+            comp = new Component(); //Make a new component named comp
+            // Keep the call outside assert(): NDEBUG builds drop the whole expression
+            if (!comp->AddToComponent(intrvl, fRLEMap))
+                assert(!"seed interval was rejected by AddToComponent");
+
+            //Now we want to extend upwards
+            //First check if there is a blank space to the right
+            tempintrvl = fRLEMap->FindNearHorizDot(comp->lr().x(),
+                                                   startRow, endRow);
+            if (tempintrvl != NULL && ((RLEPair*) tempintrvl->item)->start >
+                comp->lr().x()+MinHorizSeparation+1)
+                while (comp->ul().y() < endRow) {
+                    intrvl = fRLEMap->FindNearVertDot(comp->ul().x(),
+                                                      comp->lr().x(), comp->lr().y(),
+                                                      startRow);
+                    if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap)))
+                        break;
+                    if (intrvl == NULL) break;
+                }
+            else
+                while (comp->ul().y() < endRow) {
+                    intrvl = fRLEMap->FindNearVertDot(comp->ul().x(),
+                                                      comp->lr().x(), comp->ul().y(),
+                                                      startRow);
+                    if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap))) // stray ';' removed: it made the break unconditional
+                        break;
+                    if (intrvl == NULL) break;
+                }
+
+            //Now we want to extend downwards
+            while (comp->lr().y() > startRow) {
+                intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), comp->lr().x(),
+                                                  comp->lr().y(), endRow);
+                if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap)))
+                    break;
+                if (intrvl == NULL) break;
+            }
+
+            // Now we toss out the noise
+            int size = 0; // stays 0, i.e. noise, when the bounds are bad
+            if (comp->ul() < Point(0,0))
+                printf("Here's a problem. %d, %d\n", comp->ul().x(), comp->ul().y());
+            else
+                size = fBitMap->pixelsInRegion(comp->ul(), comp->lr());
+            if (size < MinComponentSize) {
+// printf("Deleting some noise of size %d\n", size);
+                // printComponent(comp);
+                delete comp;
+                comp = NULL;
+            }
+            else
+            {
+                compCounter++;
+                // display a rectangle around the component
+                if(ENABLE_USER_INTERFACE)
+                {
+                    if(DISPLAY_BOUNDING_BOXES)
+                        comp->display_bounding_box();
+                }
+
+                // JMH - make an array of frequency of the y coord of bottom of comp
+                int vertOffset = endRow - comp->lr().y();
+                if(vertOffset < MaxVertSize && vertOffset >= 0)
+                    baselines[vertOffset]++;
+
+
+                comp->setProperties(fBitMap);
+                if(fLineComponents[i]->last != NULL)
+                    totalSpacing +=
+                        comp->ul().x() -
+                        ((Component *) (fLineComponents[i]->last->item))->lr().x();
+
+                fLineComponents[i]->Append(comp); // add this component to list
+                currentCol = (comp->lr()).x() + 1; // update position on page
+            }
+        }
+
+        // find most popular bottom of comp and call it the baseline
+        int counter = 0;
+        int baseline = endRow; // default (offset 0) in case no bottom was counted
+        for (j=0; j < MaxVertSize; j++) {
+            if (counter < baselines[j]) {
+                counter = baselines[j];
+                baseline = endRow - j;
+            }
+        }
+        // printf("For row %d to %d baseline = %d\n", startRow, endRow, baseline);
+        // Now assign each character a group based on it's location
+        for (ListElement* ptr = fLineComponents[i]->first; ptr != NULL;
+             ptr = ptr->next) {
+            comp = (Component*) ptr->item;
+            comp->charGroup = 0;
+
+            // if top of char is higher than top - tolerance
+            if (comp->ul().y() < startRow + (rowHeight/TopLineTolerance)) {
+                comp->charGroup += 2; //tall like a T
+            }
+
+            // if bottom of char is lower than base - tolerance
+            if (comp->lr().y() > baseline + (rowHeight/BaseLineTolerance)) {
+                comp->charGroup += 1; //has a tail like a y
+            } else
+                if (comp->lr().y() < (baseline - (2*rowHeight/BaseLineTolerance))) {
+                    comp->charGroup = 4; //floating like a '
+                    /* printf("bottom at %d < %d\n", comp->lr().y(),
+                    baseline - (2*rowHeight/BaseLineTolerance)); */
+                }
+            // printf("added character in group %d\n", comp->charGroup);
+        }
+    }
+    /* printf("Found %d components on this page.\n", compCounter); */
+    // printComponents();
+    last_status = 0.0;
+    if(ENABLE_USER_INTERFACE)
+        set_status("Done extracting characters");
+    if((compCounter - fnumLines) > 0) /* don't want divide by zero */
+    {
+        favgSpacing = totalSpacing / (compCounter - fnumLines);
+    }
+    else
+    {
+        favgSpacing = 1;
+    }
+    delete fRLEMap;
+    fRLEMap = new RLEMap;
+    convertMap(fBitMap, fRLEMap);
+    return VALID; // the original fell off the end of a non-void function
+}
+
+void Page::printComponents()
+/*--------------------------------------------------------------
+Primary Purpose: Debugging dump. Walks every line's component list and,
+for each component identified as 'n' with confidence more than 20 below
+ConfidenceThreshold, prints its bounding box, bitmap and properties.
+---------------------------------------------------------------*/
+{
+    int seen = 0;   // running component number across the whole page
+    for (int lineIdx = 0; lineIdx < fnumLines; lineIdx++) {
+        ListElement* node = fLineComponents[lineIdx]->first;
+        for (; node != NULL; node = node->next) {
+            seen++;
+            Component* c = (Component *) node->item;
+            const bool suspect =
+                c->confid() < (ConfidenceThreshold-20) && c->asciiId() == 'n';
+            if (!suspect)
+                continue;
+            printf("Here's a poorly recognized component ul=%d,%d, lr=%d,%d.\n\n",
+                   (c->ul()).x(), (c->ul()).y(),
+                   (c->lr()).x(), (c->lr()).y());
+            printComponent(c);
+            printf("properties: ");
+            printVector(c->properties(), numProperties);
+            printf("I think it's a -> %c <- confidence: %d line: %d group: %d Comp#%d\n",
+                   c->asciiId(), c->confid(), lineIdx+1, c->charGroup, seen);
+            printf("\n*******************************************************\n");
+        }
+    }
+}
+
+void Page::printComponent(Component* comp)
+// Print a single component as rows of bits taken from fBitMap, clipped
+// on the right to column ul.x + 78.
+{
+    const int left = comp->ul().x();
+    const int cap = left + 78;
+    const int right = (comp->lr().x() < cap) ? comp->lr().x() : cap;
+    int r = comp->ul().y();
+    while (r <= comp->lr().y()) {
+        for (int c = left; c <= right; c++)
+            bitprint(fBitMap->row(r)[c/8], c%8);
+        printf( "\n");
+        r++;
+    }
+}
+
+int spacing(ListElement * compa, ListElement * compb);
+// helper for extractWords. NOTE(review): the definition further down is the member Page::spacing - confirm this free declaration is still needed
+
+MapStatus Page::extractWords()
+/*--------------------------------------------------------------
+Primary Purpose: Group each line's components into words
+Effects: points fWordList at a new list holding every word of the
+document in order, plus one "\n" word after each line. A word ends where
+spacing() to the next component exceeds 1.25 * favgSpacing.
+Each word is also echoed to stdout.
+Constraints: extractComponents must be run first
+Rev: KM 11/7/95
+---------------------------------------------------------------*/
+{
+    ListElement * wordStart = NULL;   // first component of the open word
+    int lettersInWord = 0;            // components collected so far
+    int wordsFound = 0;
+    const int gapLimit = (int) (1.25 * ((float) (favgSpacing)));
+    fWordList = new Words;
+    last_status = 0.0;
+    for (int lineIdx = 0; lineIdx < fnumLines; lineIdx++)
+    {
+        if(ENABLE_USER_INTERFACE)
+            set_extract_status(lineIdx, fnumLines);
+        wordStart = NULL;             // no word is open at the start of a line
+        for (ListElement * node = line(lineIdx)->first; node != NULL; node = node->next)
+        {
+            if (wordStart == NULL)
+            {
+                wordStart = node;
+                lettersInWord = 1;
+            }
+            if (spacing(node, node->next) <= gapLimit)
+                lettersInWord++;
+            else
+            {
+                Word * finished = new Word(wordStart, lettersInWord);
+                (words())->Append(finished);
+                printf("%s ", finished->characters);
+                wordStart = NULL;
+                wordsFound++;
+            }
+        }
+        // Add in a separate word for new line
+        Word * lineBreak = new Word("\n",2);
+        (words())->Append(lineBreak);
+        printf("%s", lineBreak->characters);
+        wordsFound++;
+    }
+    last_status = 0.0;
+    fWordList->num_words = wordsFound;
+    if(ENABLE_USER_INTERFACE)
+        set_status("Done extracting words");
+    return VALID;
+}
+
+void Page::spellcheck()
+/*--------------------------------------------------------------
+Primary Purpose: Run spell checker on word list, setting the mispelled
+flag of every word for which mispelled() returns true.
+Constraints: extractWords must be run first
+Rev: AR
+---------------------------------------------------------------*/
+{
+    int position = 0;   // 1-based index of the word being checked
+    ListElement* node = (words())->first;
+    while(node != NULL)
+    {
+        position++;
+        if(ENABLE_USER_INTERFACE)
+            set_spellcheck_status(position, fWordList->num_words);
+
+        Word* current = (Word*)node->item;
+        if(mispelled(current->characters))
+            current->mispelled = TRUE;
+
+        node = node->next;
+    }
+}
+
+int Page::spacing(ListElement * compa, ListElement * compb)
+// Horizontal gap from the right edge of compa's component to the left
+// edge of compb's component. Overlapping components count as a gap of 0,
+// and a NULL compb (end of line) yields 1000.
+{
+    if (compb == NULL)
+        return 1000; // end of line
+
+    Component * lhs = (Component *) compa->item;
+    Component * rhs = (Component *) compb->item;
+    int gap = rhs->ul().x() - lhs->lr().x();
+    // A negative difference means the two boxes overlap horizontally.
+    if (gap < 0)
+        gap = 0;
+    assert (gap >= 0);
+    return gap;
+}
+
+
+void Page::printWords()
+// Prints out each component of each word: bounding box, bitmap rows,
+// properties and identification. This can take a very long time
+{
+    ListElement * node = words()->first;
+    while (node != NULL)
+    {
+        Word * word = (Word *) node->item;
+        printf("!!!!!! NEW WORD %s confid : %d !!!!!\n", word->characters, word->confid);
+        for(int k = 0; k < word->charCount; k++)
+        {
+            Component * comp = word->character[k];
+            if (comp != NULL)
+            {
+                printf("Printing a component ul=%d,%d, lr=%d,%d.\n\n",
+                       (comp->ul()).x(), (comp->ul()).y(), (comp->lr()).x(), (comp->lr()).y());
+                for (int r = comp->ul().y(); r <= comp->lr().y(); r++){
+                    for (int c = comp->ul().x(); c <= comp->lr().x(); c++)
+                        bitprint(fBitMap->row(r)[c/8], c%8);
+                    printf( "\n");
+                }
+                printf("properties: ");
+                printVector(comp->properties(), numProperties);
+                printf("Identification: %c distance: %d confidence %d\n",
+                       comp->asciiId(),
+                       comp->distance(&LearnedChars[comp->asciiId()]),
+                       comp->confid());
+                printf("\n***********************************************\n");
+            }
+        }
+        node = node->next;
+    }
+}
+
+MapStatus Page::recognize()
+/*--------------------------------------------------------------
+Primary Purpose: Recognize entire page by running recognize(line) on
+every line in turn, which sets the identification of each component
+Return Value: always VALID; the per-line results are not inspected
+Constraints: extractComponents must be run first.
+See recognize(line) below for more detailed info
+Rev: KM
+---------------------------------------------------------------*/
+{
+    printf("Recognizing document\n");
+    last_status = 0.0;
+    int lineIdx = 0;
+    while (lineIdx < fnumLines)
+    {
+        if(ENABLE_USER_INTERFACE)
+            set_recognize_status(lineIdx, fnumLines);
+        recognize(lineIdx);
+        lineIdx++;
+    }
+    last_status = 0.0;
+    return VALID;
+}
+
+
+MapStatus Page::recognize(int linenum)
+/*--------------------------------------------------------------
+Primary Purpose: Recognize a line of connected components
+Arguments: linenum is line number to recognize
+Effects: runs Component::recognize against LearnedGroups for every
+component of the line. When the resulting confidence is below
+ConfidenceThreshold and the component is wider than 2*MinWidth,
+divideAndRecognize is called to try splitting the component.
+Return Value: always VALID
+Constraints: extractComponents must be run first
+Rev: KM 11/9/95
+---------------------------------------------------------------*/
+{
+    Components * components = line(linenum);
+    ListElement * node = components->first;
+    while (node != NULL)
+    {
+        Component * comp = (Component *) node->item;
+        const Distance d = comp->recognize(LearnedGroups);
+        const bool unsure = comp->confid() < ConfidenceThreshold;
+        if (unsure && comp->width() > 2*MinWidth) // really wide
+            divideAndRecognize(components, node, d);
+        node = node->next;   // read after the call, so a component inserted after node is visited too
+    }
+    return VALID;
+}
+
+
+
+void Page::divideAndRecognize (Components *list, ListElement * ptr, Distance d)
+/*--------------------------------------------------------------
+Primary Purpose: Identify and separate merged characters
+Arguments: list is the component list that holds ptr
+ ptr is a pointer to a list element containing a component
+ d is the current recognition distance on the component
+Effects: Looks for the column, at least MinWidth in from either
+ edge, with the fewest pixels (per fBitMap->pixelsInRegion). If that
+ count is below JoinTolerance the component is cut there: it keeps
+ the left part and a new component is built for the right part.
+ Both parts are re-recognized. When the left part's new distance
+ is below d the right part is inserted into list after ptr;
+ otherwise the original bounds are restored, the component is
+ recognized again and the right part is discarded.
+
+ Returns without doing anything if the component's confidence is
+ above ConfidenceThreshold or its width is below MinWidth*2
+Rev: KM 11/24/95
+---------------------------------------------------------------*/
+{
+ Component * comp = (Component *) ptr->item;
+ Component * newComp; // NOTE(review): never used below
+ bool allGroups = TRUE;
+
+ // Save the original component boundaries so they can be restored if the split does not help
+ Point oldlr = comp->lr();
+ Point oldul = comp->ul();
+ int oldwidth = (int) comp->width();
+
+ // Some easy access x,y coordinates
+ int ulx = comp->ul().x();
+ int uly = comp->ul().y();
+ int lrx = comp->lr().x();
+ int lry = comp->lr().y();
+
+ Distance newdist, bestdist; // NOTE(review): bestdist is never used below
+ int bestlrx;
+
+ if (comp->confid() > ConfidenceThreshold)
+ return;
+
+ if (oldwidth < MinWidth*2) // cant be split in two
+ {
+ return;
+ }
+
+ // Determine where to split. Split at the thinnest point
+ // within JoinTolerance (maximum number of pixels that might be fused)
+
+ int minHeight = (int)comp->height();
+ bestlrx = comp->lr().x();
+ for(int i = MinWidth; i < oldwidth - MinWidth; i++)
+ {
+ int newHeight =
+ fBitMap->pixelsInRegion(Point(ulx+i,uly), Point(ulx+i,lry));
+ if (newHeight < minHeight)
+ {
+ minHeight = newHeight;
+ bestlrx = ulx+i;
+ }
+ }
+// printf("bestlrx = %d, minHeight = %d\n", bestlrx, minHeight);
+
+ // Cut only if a column thinner than the full height was found and it is below JoinTolerance
+ if (bestlrx < lrx && minHeight < JoinTolerance)
+ {
+ comp->lr().x() = bestlrx;
+ int shrunk = comp->vertShrink(fBitMap);
+ comp->setProperties(fBitMap);
+ if (shrunk) // ignore group if we had to shrink down
+ newdist = comp->recognize(LearnedGroups, allGroups);
+ else
+ newdist = comp->recognize(LearnedGroups);
+
+// printf("Distance = %u asciiid = %c \n", newdist, comp->asciiId());
+
+ Component * newcomp = new Component(Point(bestlrx+1, oldul.y())
+ , oldlr);
+ newcomp->vertShrink(fBitMap);
+ newcomp->setProperties(fBitMap);
+ int newcompdist = newcomp->recognize(LearnedGroups,allGroups);
+
+ if (newdist < d)
+ list->insertAfter(ptr, newcomp);
+ else
+ {
+ comp->ul() = oldul;
+ comp->lr() = oldlr;
+ comp->setProperties(fBitMap);
+ comp->recognize(LearnedGroups);
+ delete newcomp;
+ }
+ return;
+ }
+
+
+ return;
+
+}
+
+
+void Page::uniteAndRecognize (Components *list, ListElement * ptr, Distance d)
+/*--------------------------------------------------------------
+Primary Purpose: Identify and merge a separated character
+Arguments:ptr is a pointer to a list element containing a component
+ d is the current recognition distance on the component
+Effects: Tries the component before ptr and the one at ptr as a single
+ component; if that recognizes with a distance below d the pair is
+ replaced in list by the merged component. Does nothing when ptr is
+ NULL or is the first element, since there is nothing to merge with.
+Rev: JMH 12/10/95
+---------------------------------------------------------------*/
+{
+    if (ptr == NULL || ptr->previous == NULL)
+        return;
+    Component * part1 = (Component *) ptr->previous->item;
+    Component * part2 = (Component *) ptr->item;
+    Point ul = part1->ul();
+    Point lr = part2->lr();
+    if (ul.y() > lr.y() || ul.x() > lr.x())   // corners do not form a box
+        return;
+    Component * newcomp = new Component(ul, lr);
+    newcomp->setProperties(fBitMap);
+    if (part1->charGroup <= 3 && part2->charGroup <= 3)
+        newcomp->charGroup = (part1->charGroup | part2->charGroup);
+    else if (part1->charGroup == 4)
+        newcomp->charGroup = (part2->charGroup | 2);
+    else
+        newcomp->charGroup = (part1->charGroup | 2);
+    if (newcomp->charGroup > 4) newcomp->charGroup = 4;
+
+    int newdist = newcomp->recognize(LearnedGroups);
+    if (newdist < d) {
+        // NOTE(review): whether removeAt frees part1/part2 is not visible here.
+        list->removeAt(ptr->previous);
+        list->insertAfter(ptr, newcomp);
+        list->removeAt(ptr);
+    } else
+        delete newcomp;
+}
+
+
+int Page::writeWordPos(char * filename)
+/*--------------------------------------------------------------
+Primary Purpose: Writes word position, confidence, length and string to file
+Arguments: output file name
+Return Value: 1 if successful. 0 if an error occured or there is no word list yet
+Effects: Calls fWordList->writeWordPos
+ // Output format for each word
+ "%6d %6d %6d %6d %s\n", word->ul.x(), word->ul.y(),
+ word->confid, word->charCount, word->characters
+Rev: 11/25/95
+---------------------------------------------------------------*/
+{ return fWordList ? fWordList->writeWordPos(filename) : 0; }
+
+int Page::writeAscii(char * filename)
+/*--------------------------------------------------------------
+Primary Purpose: Write word list to ascii file
+Arguments: filename to write to
+Return Value: 1 if successful 0 if unsuccessful or there is no word list yet
+Effects: Calls fWordList->writeAscii(filename)
+Writes words to file in text format using MinLineSize
+to differentiate lines.
+Rev: 11/25 KM
+---------------------------------------------------------------*/
+
+{ return fWordList ? fWordList->writeAscii(filename) : 0; }
+
+
+