--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/reference/ocr-new/Page.cc Thu May 18 23:12:51 2006 +0200
@@ -0,0 +1,1492 @@
+/** Page.cc contains the member functions for the primary OCR class Page */
+#include "system.h"
+#include "Page.h"
+#include "convertMap.h"
+#include "get_skew.h"
+#include "Component.h"
+#include "status_message.h"
+
+/*** Member functions of class Page. ***/
+
+int Page::get_height()
+/** Returns the height of the page image in rows, as reported by the
+    RLE map (fRLEMap->imageLength()). */
+{ return fRLEMap->imageLength(); }
+
+int Page::get_width()
+/** Returns the width of the page image in columns, as reported by the
+    RLE map (fRLEMap->imageWidth()). */
+{ return fRLEMap->imageWidth(); }
+
+int Page::get_linenum(int col, int row)
+/*--------------------------------------------------------------
+Primary Purpose: Returns the number of the text line that contains the
+        given image coordinate. Only row is examined for now; col is unused.
+        Called from proc equation_mark in new_ui.tcl
+Arguments: col, row - coordinate on the page
+Return value: index of the first line whose fstartrow..fendrow range
+        contains row, or -1 if no line is here.
+Effects: If ENABLE_USER_INTERFACE is set and a line was found, sets the
+        Tcl variables curline, curline_startrow, curline_endrow,
+        curline_startcol and curline_endcol through docommand.
+Requires: setLines be run first
+Rev: 4/21/96
+---------------------------------------------------------------*/
+{
+  assert (flineinfo != NULL);
+
+  // Walk forward until a line's row range covers the requested row.
+  int found = 0;
+  while (found < fnumLines &&
+         !(flineinfo[found].fstartrow <= row && flineinfo[found].fendrow >= row))
+    found++;
+  if (found >= fnumLines)
+    return -1;                  // no line covers this row
+
+  if (ENABLE_USER_INTERFACE)
+  {
+    docommand("set curline %d", found);
+    docommand("set curline_startrow %d", flineinfo[found].fstartrow);
+    docommand("set curline_endrow %d", flineinfo[found].fendrow);
+
+    // Column extent is the whole page for now; this will change with zoning
+    docommand("set curline_startcol %d", 0);
+    docommand("set curline_endcol %d", get_width());
+  }
+  return found;
+}
+
+int Page::send_words_to_tcl()
+/*--------------------------------------------------------------
+Primary Purpose: Display words in tcl
+Return Value: number of words sent. (Previously control reached the
+        end of this int function without a return statement.)
+Effects: Issues one "addword" Tcl command per word in words(), tagged
+        UNKNOWN_CHAR (confid < VERY_LOW_CONFIDENCE), LOW_PRECISION
+        (confid < LOW_CONFIDENCE), MISPELLED (mispelled flag set and
+        SPELLCHECK on) or OK, then reports the apparent word accuracy.
+Constraints: reads fWordList->num_words, so a word list must exist
+Rev - AR
+---------------------------------------------------------------*/
+{
+  int word_count = 0;
+  int unknown_char_count = 0;
+  int low_precision_count = 0;
+  int mispelled_count = 0;
+  if(ENABLE_USER_INTERFACE) set_status("Displaying text");
+  for(ListElement* ptr = (words())->first; ptr != NULL; ptr = ptr->next)
+  {
+    word_count++;
+    set_text_display_status(word_count, fWordList->num_words);
+    Word* temp_word = (Word*)ptr->item;
+    // NOTE(review): who owns the buffer returned by backslashify is not
+    // visible here; confirm whether it has to be released.
+    char* send_chars = backslashify(temp_word->characters);
+    // Choose the display tag; the first matching test wins.
+    const char* tag = "OK";
+    if(temp_word->confid < VERY_LOW_CONFIDENCE)
+      { tag = "UNKNOWN_CHAR"; unknown_char_count++; }
+    else if(temp_word->confid < LOW_CONFIDENCE)
+      { tag = "LOW_PRECISION"; low_precision_count++; }
+    else if((temp_word->mispelled) && SPELLCHECK)
+      { tag = "MISPELLED"; mispelled_count++; }
+    docommand("addword \"%s\" %d %d %s", send_chars,
+              temp_word->ul.x(), temp_word->ul.y(), tag);
+    update();
+  }
+  if(ENABLE_USER_INTERFACE)
+  {
+    set_status("Done displaying text");
+    // With no words the ratio below would be 0/0, so skip the report.
+    int flagged = mispelled_count + unknown_char_count + low_precision_count;
+    if(word_count > 0)
+      set_status("Apparent word accuracy: %.3lf%%", (100 - (100 * ((double)flagged / (double)word_count))));
+  }
+  return word_count;
+}
+
+
+int Page::deskew(int deskew_method)
+/*--------------------------------------------------------------
+Primary Purpose: Deskew the page
+Arguments: deskew_method - RLE_DESKEW asks the RLE map to deskew
+           itself; any other value measures the skew with get_skew
+           and rotates the bitmap by that angle.
+Return Value: 1 if the page was rotated, 0 if it was left as it was
+Effects: updates the bitmap and rlemap of the page; whichever map was
+         rotated is converted into the other with convertMap.
+Constraints: RLEMap Rotation is not currently reliable and probably
+should not be used
+Rev: AR
+---------------------------------------------------------------*/
+{
+  if(deskew_method != RLE_DESKEW)
+  {
+    // Bitmap rotation: act only when the measured angle reaches
+    // MINIMUM_SKEW_ANGLE in either direction.
+    double skew = get_skew(fRLEMap);
+    bool worthRotating = (skew >= MINIMUM_SKEW_ANGLE)||(skew <= - MINIMUM_SKEW_ANGLE);
+    if(!worthRotating)
+      return 0;
+
+    fBitMap->rotateMap(skew);
+    convertMap(fBitMap, fRLEMap);
+    return 1;
+  }
+
+  // RLE rotation: a zero result from fRLEMap->deskew() is reported to the
+  // caller as "not rotated" and the bitmap is left alone.
+  if(!fRLEMap->deskew())
+    return 0;
+
+  convertMap(fRLEMap, fBitMap);
+  return 1;
+}
+
+Page::Page()
+/**Page::Page - allocates bitmap, rlemap and equation list; no lines yet*/
+{
+  fBitMap = new BitMap;
+  fRLEMap = new RLEMap;
+  fEqnList = new EqnMarkers;
+  fLineComponents = NULL; fWordList = NULL;
+  flineinfo = NULL; fnumLines = 0;   // both are read by ~Page; were left unset
+}
+
+Page::~Page()
+/*--------------------------------------------------------------
+Primary Purpose: Destructor deallocates private fields that
+have been created. flineinfo and fLineComponents come from new[]
+and are therefore released with delete[].
+---------------------------------------------------------------*/
+{
+  delete [] flineinfo;          // deleting a null pointer is a no-op
+  if (fLineComponents)          // check before indexing, not after
+    for (int i = 0; i < fnumLines; i++)
+      delete fLineComponents[i];
+  delete [] fLineComponents;
+  delete fBitMap;
+  delete fRLEMap;
+  delete fWordList;
+  delete fEqnList;
+}
+
+Angle Page::skewAngle()
+/*--------------------------------------------------------------
+Primary Purpose: Determine the angle of rotation of this page's RLEMap
+Arguments: none (the page's own fRLEMap is measured)
+Return Value: detected angle of rotation (get_skew, in get_skew.cc)
+Rev: AR
+---------------------------------------------------------------*/
+{
+  Angle detected = get_skew(fRLEMap);
+  return detected;
+}
+
+
+MapStatus Page::readMap(char * filename)
+     // Reads the image with BitMap::readMap, then rebuilds fRLEMap from it.
+     // NOTE(review): the conversion runs whatever status the read returned.
+{
+  MapStatus readResult = fBitMap->readMap(filename);
+  convertMap(fBitMap, fRLEMap);
+  return readResult;
+}
+
+
+
+MapStatus Page::setLines()
+/*--------------------------------------------------------------
+Primary Purpose: Set flineinfo array in Page class with the
+   starting and ending rows of each line of text.
+   Also sets fnumLines to the number of lines
+Arguments: none
+Return Value: A Mapstatus either VALID, EMPTY if there is no
+   data in the RLEMAP, or OTHERERROR if the line table fills up
+Effects: Allocates flineinfo and fills with starting and ending row
+   of each line.  The following global variables are used as parameters
+   in this function. These are defined in system.cc
+   NoiseTolerance - Rows whose number of pixels is less than this value
+                    will be considered empty (current val 6).
+   MinVertSeparation - The minimum number of rows separating lines of text.
+                    Lines will be merged if actual Separation is less than this
+                    value. (current val 3)
+   MinLineSize - The minimum number of rows in a line of text.
+                 Any smaller lines are discarded (currentval 5)
+   NOTE(review): a flineinfo array from an earlier call is not released here.
+Constraints: Page::readMap() must be run first to fill fRLEMap
+Rev: 10/26 KM
+---------------------------------------------------------------*/
+{
+  // Test for an empty map first; an imageLength() of 0 gives maxrow -1.
+  int maxrow = fRLEMap->imageLength() - 1;      // maximum row number
+  int actualSeparation = MinVertSeparation + 1; // must be bigger than min
+                                                // for line 0
+  if(maxrow <= 0) return EMPTY;
+  int linenum=0;                                // current line number
+  int prvlinenum = 0;
+  int lineSize;                                 // # rows in current line
+
+  int maxLines = maxrow/MinLineSize;            // max # of lines of text
+  // A page shorter than MinLineSize would give a zero-length table, yet
+  // the loop below always writes flineinfo[0]; keep at least one slot.
+  if(maxLines < 1) maxLines = 1;
+  flineinfo = new LineMarker[maxLines];
+
+  for (int i = 0; i < maxrow;)
+    {
+      LineMarker & thisLine = flineinfo[linenum];
+      LineMarker & prevLine = flineinfo[prvlinenum];
+
+      while (i < maxrow && fRLEMap->row(i)->numPixels < NoiseTolerance)
+        i++;
+      thisLine.fstartrow = i++;
+      while (i < maxrow &&fRLEMap->row(i)->numPixels > NoiseTolerance)
+        i++;
+
+      // i now sits on the row that ended the run above (or on maxrow).
+      lineSize = i - thisLine.fstartrow +1;
+
+      // If this line is less than MinVertSeparation away
+      // from the last line.  Join the two together.
+      if (linenum > 0)
+        {
+          actualSeparation = thisLine.fstartrow - prevLine.fendrow;
+        }
+      if (actualSeparation < MinVertSeparation)
+        {
+          // If too small of a separation, add into prev row
+          prevLine.fendrow = i;
+        }
+      else if (lineSize >= MinLineSize)
+        {
+          thisLine.fendrow = i;
+/*        printf (" Line %d Start: %d End: %d lineHeight %d\n",
+                  linenum,thisLine.fstartrow,
+                  thisLine.fendrow,
+                  thisLine.fendrow - thisLine.fstartrow +1);
+*/
+          prvlinenum = linenum;
+          linenum++;
+        }
+      // Table full: record how many entries are valid before giving up.
+      if (linenum >= maxLines) { fnumLines = linenum; return OTHERERROR; }
+    }
+
+  fnumLines = linenum;    // Set number of lines in page class
+
+
+  if((ENABLE_USER_INTERFACE) && DISPLAY_LINE_BOUNDARIES)
+    {
+      display_line_boundaries();
+    }
+  /*  printf("Setlines found a total of %d lines.\n", fnumLines); */
+  if(ENABLE_USER_INTERFACE)
+    update();
+  return VALID;
+}
+
+void Page::display_line_boundaries()
+/*--------------------------------------------------------------
+Primary Purpose: Display line boundaries in TCL/TK.  Called from
+setLines if ENABLE_USER_INTERFACE and DISPLAY_LINE_BOUNDARIES are
+set to TRUE
+Effects: Draws a blue stippled line in the gap between each pair of
+adjacent text lines, centred in the gap and as wide as the gap.
+Rev: AR
+---------------------------------------------------------------*/
+{
+  int centerline, width;
+  // A gap needs a following line, so stop one short of fnumLines; the
+  // loop used to read flineinfo[fnumLines], past the last valid entry.
+  for(int j=0; j + 1 < fnumLines; j++)
+    {
+      centerline = (flineinfo[j].fendrow + flineinfo[j + 1].fstartrow) / 2;
+      width = flineinfo[j + 1].fstartrow - flineinfo[j].fendrow;
+      scale(centerline);
+      scale(width);
+      /* having this pathname here is probably not such a good idea...*/
+      docommand(".main_window.display.work_space create line %d %d %d %d -width %d -fill blue -tags {project_ray IMAGE_TAG} -stipple @/usr/sww/share/tclX-7.3a/tkX/3.6a/demos/bitmaps/grey.25", 0, centerline, bmap()->imageWidth(), centerline, width);
+    }
+}
+
+
+int test_rlemap_lines(RLEMap* rmap)
+{ // Debug helper: prints numPixels for each row of rmap; returns the row count
+  int length = rmap->imageLength();
+  for(int i = 0; i < length; i++)
+    printf("On line %d, numpixels = %d\n", i, rmap->fMapData[i]->numPixels);
+  return length;        // declared int, but used to return nothing
+}
+
+MapStatus Page::extractComponents(int horizMerge)
+/*--------------------------------------------------------------
+Primary Purpose: Main component extraction routine.
+*
+* Given the top and bottom line of a row we want to generate a list of
+* components.  The general method is to find the closest dot, trace its
+* connected dots, then project upwards and downwards and add anything we
+* find there to the component.  We will erase the component from the RLEMap
+* as it is added to the component list. By projecting up and down
+* from the piece we first find we should be able
+* to completely encompass characters like :;i?|!  The only problems are
+* italic or ligatured characters where we may pick up two or more
+* characters at a time (which would be bad) or characters fragmented
+* with a vertical gap.
+*
+Arguments: horizMerge - passed to AddToComponent and added to the gap test
+Return Value: VALID
+Effects: Makes new components and puts them in a list.  Deletes components
+         from RLE map. Fills in component boundaries and calls
+         Component::setProperties to set the property vector
+         Lastly convertMap is run to rebuild the RLEMap
+Constraints: Page::setLines() must be run first.  Rev: 4/28/96
+---------------------------------------------------------------*/
+{
+  int currentCol, startRow, endRow, rowHeight;
+  ListElement* intrvl;
+  ListElement* tempintrvl;
+  /* printf("fnumLines = %d\n", fnumLines); */
+  Component* comp;
+  int totalSpacing = 0;   // total blank horizontal pixels between components
+  int baselines[MaxVertSize];  // array for finding the baseline
+  last_status = 0.0;
+  int compCounter = 0;
+  int i;
+  int j;
+  int upwardBound;        // Projection distances different for equations
+  int downwardBound;      // and non-equations
+
+
+  bool inEqn;             // Variables for finding if the center of a comp
+  int centerx;            // is in an equation.
+  int centery;
+
+
+
+  printf("Extracting Components\n");
+  fLineComponents = new Components*[fnumLines];
+  for (i = 0; i < fnumLines; i++) {
+    if(ENABLE_USER_INTERFACE)
+      set_component_status(i, fnumLines);
+    currentCol = 0;
+    startRow = flineinfo[i].fstartrow;
+    endRow = flineinfo[i].fendrow;
+    rowHeight = endRow - startRow;
+    assert(rowHeight > 0);
+
+    for (j=0; j < MaxVertSize; j++)
+      baselines[j] = 0;
+    fLineComponents[i] = new Components();
+
+
+    while (currentCol<=fRLEMap->imageWidth()) { //until we reach the end of the page
+
+      //Build component starting with closest black dot
+      intrvl = fRLEMap->FindNearHorizDot(currentCol, startRow, endRow);
+      if (intrvl == NULL) {
+        // printf("Reached end of line\n");
+        break;
+      }
+      comp = new Component();    //Make a new component named comp
+      bool seeded = comp->AddToComponent(intrvl, fRLEMap, horizMerge);
+      assert(seeded);   // call kept outside assert so NDEBUG builds still run it
+      //Now we want to extend upwards
+      //First check if there is a blank space to the right
+      tempintrvl =
+        fRLEMap->FindNearHorizDot(comp->lr().x(), startRow, endRow);
+      upwardBound = startRow;    // defaults, in case neither upward loop
+      downwardBound = endRow;    // body runs before the downward pass
+      if (tempintrvl != NULL && ((RLEPair*) tempintrvl->item)->start >
+          comp->lr().x()+horizMerge+1)
+        while (comp->ul().y() < endRow) {
+
+          // find the center of the component to check if we are in an equation
+          centerx = (comp->ul().x() + comp->lr().x())/2;
+          centery = (comp->ul().y() + comp->lr().y())/2;
+          inEqn = inEquation(centerx, centery);
+          // Determine projection distance. Only project for non Equations.
+          if(inEqn)
+            {
+              upwardBound = comp->ul().y()+1;
+              downwardBound = comp->lr().y() - 1;
+            }
+          else
+            {
+              upwardBound = startRow;
+              downwardBound = endRow;
+            }
+          intrvl = fRLEMap->FindNearVertDot(comp->ul().x(),
+                                            comp->lr().x(), comp->lr().y(),
+                                            upwardBound);
+          // startRow);
+          if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap,
+                                                         horizMerge)))
+            break;
+          if (intrvl == NULL) break;
+        }
+      else
+        while (comp->ul().y() < endRow) {
+
+          // find the center of the component to check if we are in an equation
+          centerx = (comp->ul().x() + comp->lr().x())/2;
+          centery = (comp->ul().y() + comp->lr().y())/2;
+          inEqn = inEquation(centerx, centery);
+          // Determine projection distance. Only project for non Equations.
+          if(inEqn)
+            {
+              upwardBound = comp->ul().y()+1;
+              downwardBound = comp->lr().y() - 1;
+            }
+          else // regular text
+            {
+              upwardBound = startRow;
+              downwardBound = endRow;
+            }
+
+          intrvl = fRLEMap->FindNearVertDot(comp->ul().x(),
+                                            comp->lr().x(), comp->ul().y(),
+                                            upwardBound);
+          // A stray ';' after this test used to make the break unconditional.
+          if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap,
+                                                         horizMerge)))
+            break;
+          if (intrvl == NULL) break;
+        }
+
+      //Now we want to extend downwards
+      while (comp->lr().y() > startRow) {
+        intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), comp->lr().x(),
+                                          comp->lr().y(), downwardBound);
+        if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap,
+                                                       horizMerge)))
+          break;
+        if (intrvl == NULL) break;
+      }
+
+      // Now we toss out the noise
+      // size stays 0 when the component's corner is invalid; it used to
+      // be read uninitialised on that path.
+      int size = 0;
+      if (comp->ul() < Point(0,0))
+        printf("Here's a problem. %d, %d\n", comp->ul().x(), comp->ul().y());
+      else
+        size = fBitMap->pixelsInRegion(comp->ul(), comp->lr());
+      // The line's list is still empty for its first component: guard last.
+      ListElement * lastElem = fLineComponents[i]->last;
+      Component * prev = (lastElem != NULL) ? (Component *) lastElem->item : NULL;
+      if (size < MinComponentSize) {
+        // printf("Deleting some noise of size %d\n", size);
+        // printComponent(comp);
+        delete comp;
+        comp = NULL;
+      }
+      else if (prev != NULL &&
+               abs(comp->ul().x() - prev->ul().x()) <= 1 &&
+               abs(comp->lr().x() - prev->lr().x()) <= 1)  // was '==': always true
+        {
+          // Check and see if this and the previous component have the
+          // same x boundaries, if so merge the two.  Good for = and :
+          prev->join(comp);
+          prev->setProperties(fBitMap);
+          delete comp;
+          comp = NULL;            // was 'comp ==NULL;', a no-op comparison
+        }
+      else
+        {
+          compCounter++;
+          // display a rectangle around the component
+          if(ENABLE_USER_INTERFACE)
+            {
+              if(DISPLAY_BOUNDING_BOXES)
+                comp->display_bounding_box();
+            }
+
+          // JMH - make an array of frequency of the y coord of bottom of comp
+          int vertOffset = endRow - comp->lr().y();
+          if(vertOffset < MaxVertSize && vertOffset >= 0)
+            baselines[vertOffset]++;
+
+
+          comp->setProperties(fBitMap);
+          if(prev != NULL)
+            {
+              // horizontal gap between the previous component and this one
+              int thisSpacing = comp->ul().x() - prev->lr().x();
+              // if a realy big space, make space the width of this comp
+              if (thisSpacing > 200)
+                thisSpacing = 2*(comp->lr().x() - comp->ul().x());
+              totalSpacing += thisSpacing;
+            }
+
+          fLineComponents[i]->Append(comp);   // add this component to list
+          currentCol = (comp->ul()).x() + 1;  // update position on page
+        }
+    }
+
+    // find most popular bottom of comp and call it the baseline
+    int counter = 0;
+    int baseline = endRow;   // fallback when no component voted (was unset)
+    for (j=0; j < MaxVertSize; j++) {
+      if (counter < baselines[j]) {
+        counter = baselines[j];
+        baseline = endRow - j;
+      }
+    }
+    // printf("For row %d to %d baseline = %d\n", startRow, endRow, baseline);
+    // Now assign each character a group based on it's location
+    for (ListElement* ptr = fLineComponents[i]->first; ptr != NULL;
+         ptr = ptr->next) {
+      comp = (Component*) ptr->item;
+      comp->charGroup = 0;
+
+      // if top of char is higher than top - tolerance
+      if (comp->ul().y() < startRow + (rowHeight/TopLineTolerance)) {
+        comp->charGroup += 2;  //tall like a T
+      }
+
+      // if bottom of char is lower than base - tolerance
+      if (comp->lr().y() > baseline + (rowHeight/BaseLineTolerance)) {
+        comp->charGroup += 1;  //has a tail like a y
+      } else
+        if (comp->lr().y() < (baseline - (2*rowHeight/BaseLineTolerance))) {
+          comp->charGroup = 4;  //floating like a '
+          /*    printf("bottom at %d < %d\n", comp->lr().y(),
+                baseline - (2*rowHeight/BaseLineTolerance)); */
+        }
+      // printf("added character in group %d\n", comp->charGroup);
+    }
+  }
+  /*  printf("Found %d components on this page.\n", compCounter); */
+  last_status = 0.0;
+  if(ENABLE_USER_INTERFACE)
+    set_status("Done extracting characters");
+  if((compCounter - fnumLines) > 0)  /* don't want divide by zero */
+    {
+      favgSpacing = totalSpacing / (compCounter - fnumLines);
+    }
+  else
+    {
+      favgSpacing = 1;
+    }
+  delete fRLEMap;
+  fRLEMap = new RLEMap;
+  convertMap(fBitMap, fRLEMap);
+  return VALID;   // declared MapStatus, but used to fall off the end
+}
+
+void Page::printComponents()
+/*--------------------------------------------------------------
+Primary Purpose: Debugging routine that prints little bitmaps
+of low confidence characters.  As written, only components that were
+identified as 'n' with confidence below ConfidenceThreshold-20 qualify.
+---------------------------------------------------------------*/
+{
+  int seen = 0;       // running component number over the whole page
+  for (int lineIdx = 0; lineIdx < fnumLines; lineIdx++) {
+    ListElement* node = fLineComponents[lineIdx]->first;
+    for (; node != NULL; node = node->next) {
+      seen++;
+      Component* c = (Component *) node->item;
+      bool poorMatch = c->confid() < (ConfidenceThreshold-20) && c->asciiId() == 'n';
+      if (!poorMatch)
+        continue;
+
+      printf("Here's a poorly recognized component ul=%d,%d, lr=%d,%d.\n\n",
+             (c->ul()).x(), (c->ul()).y(),
+             (c->lr()).x(), (c->lr()).y());
+      printComponent(c);
+      printf("properties: ");
+      printVector(c->properties(), numProperties);
+      printf("I think it's a -> %c <- confidence: %d  line: %d  group: %d  Comp#%d\n",
+             c->asciiId(), c->confid(), lineIdx+1, c->charGroup, seen);
+      printf("\n*******************************************************\n");
+    }
+  }
+}
+
+void Page::printComponent(Component* comp)
+// Print a single component, one output line per bitmap row.  Columns
+// beyond ul().x()+78 are not printed.
+{
+  const int left = comp->ul().x();
+  int lastCol = left + 78;
+  if (comp->lr().x() < lastCol) lastCol = comp->lr().x();
+
+  for (int y = comp->ul().y(); y <= comp->lr().y(); y++) {
+    // a pixel is addressed as byte x/8 of the row, position x%8 in it
+    for (int x = left; x <= lastCol; x++)
+      bitprint(fBitMap->row(y)[x/8], x%8);
+    printf( "\n");
+  }
+}
+
+int spacing(ListElement * compa, ListElement * compb);
+// helper function for extractWords.  NOTE(review): the definition further down is the member Page::spacing
+
+MapStatus Page::extractWords()
+/*--------------------------------------------------------------
+Primary Purpose: Extract words from each lines components
+Effects: sets the fWordsList to be a list of all of the words
+in the document.  A word ends where the gap to the next component
+exceeds 1.25 * favgSpacing, at the end of a line, or at a component
+inside an equation.  Each line is followed by a separate "\n" word.
+Return Value: always VALID
+Constraints: extractComponents must be run first
+Rev: KM 11/7/95
+---------------------------------------------------------------*/
+{
+  bool inWord;
+  ListElement * start = NULL;   // word Start
+  int count = 0;                // counts the components in the word
+  int wordlength = 0;           // counts the characters in the word
+  int word_count = 0;
+  int spacingThreshold = (int) (1.25 * ((float) (favgSpacing)));
+  fWordList = new Words;
+  last_status = 0.0;
+  for (int i = 0; i < fnumLines; i++)
+    {
+      if(ENABLE_USER_INTERFACE)
+        set_extract_status(i, fnumLines);
+      inWord = FALSE;
+      for(ListElement *ptr = line(i)->first; ptr != NULL; ptr = ptr->next) {
+        Component * item = (Component *) ptr->item;
+        if(!inWord)
+          {
+            start = ptr;
+            count = 0;
+            wordlength = 0;
+            inWord = TRUE;
+          }
+        // Count this component before testing whether the word ends here.
+        // Before, the first id was added twice and the last one never.
+        count++;
+        if (item->fasciiId == NULL) wordlength ++;
+        else wordlength += strlen(item->fasciiId);
+        if( spacing(ptr, ptr->next) > spacingThreshold ||
+            inEquation( ptr))
+          {
+            Word * newWord = new Word(start,count,wordlength);
+            (words())->Append(newWord);
+            printf("%s ",newWord->characters);
+            inWord = FALSE;
+            word_count++;
+          }
+      }
+      // Add in a separate word for new line
+      Word * newWord = new Word("\n",2);
+      (words())->Append(newWord);
+      printf("%s", newWord->characters);
+      word_count++;
+    }
+  last_status = 0.0;
+  fWordList->num_words = word_count;
+  if(ENABLE_USER_INTERFACE)
+    set_status("Done extracting words");
+  return VALID;
+}
+
+void Page::spellcheck()
+/*--------------------------------------------------------------
+Primary Purpose: Run spell checker on word list, setting the mispelled
+flag of every word for which mispelled() returns true.
+Constraints: extractWords must be run first
+Rev: AR
+---------------------------------------------------------------*/
+{
+  int checked = 0;      // words visited so far, for the status display
+  ListElement* node = (words())->first;
+  while (node != NULL)
+    {
+      checked++;
+      if(ENABLE_USER_INTERFACE)
+        set_spellcheck_status(checked, fWordList->num_words);
+
+      Word* w = (Word*)node->item;
+      if(mispelled(w->characters))
+        w->mispelled = TRUE;
+
+      node = node->next;
+    }
+}
+
+int Page::spacing(ListElement * compa, ListElement * compb)
+// Horizontal gap from the right edge of compa's component to the left
+// edge of compb's component.  Overlapping components give 0 and a
+// NULL compb (end of line) gives 1000.
+{
+  if (compb == NULL)
+    return 1000;        // end of line
+
+  Component * leftComp  = (Component *) compa->item;
+  Component * rightComp = (Component *) compb->item;
+
+  int gap = rightComp->ul().x() - leftComp->lr().x();
+  if (gap < 0)
+    gap = 0;            // overlap counts as no gap
+  assert (gap >= 0);
+  return gap;
+}
+
+
+void Page::printWords()
+// Prints out each component of each word.  This can take a very long time
+{
+  for (ListElement * node = words()->first; node != NULL; node = node->next)
+    {
+      Word * w = (Word *) node->item;
+      printf("!!!!!! NEW WORD  %s confid : %d !!!!!\n", w->characters, w->confid);
+      for(int k = 0; k < w->charCount; k++)
+        {
+          Component * comp = w->character[k];
+          if (comp == NULL) continue;     // nothing stored in this slot
+
+          printf("Printing a component ul=%d,%d, lr=%d,%d.\n\n",
+                 (comp->ul()).x(), (comp->ul()).y(),
+                 (comp->lr()).x(), (comp->lr()).y());
+
+          // Dump the whole bounding box, one output line per bitmap row.
+          for (int y = comp->ul().y(); y <= comp->lr().y(); y++) {
+            for (int x = comp->ul().x(); x <= comp->lr().x(); x++)
+              bitprint(fBitMap->row(y)[x/8], x%8);
+            printf( "\n");
+          }
+
+          printf("properties: ");
+          printVector(comp->properties(), numProperties);
+          printf("Identification: %c  distance: %d  confidence %d\n",
+                 comp->asciiId(),
+                 comp->distance(&LearnedChars[comp->asciiId()]),
+                 comp->confid());
+          printf("\n***********************************************\n");
+        }
+    }
+}
+
+MapStatus Page::recognize()
+/*--------------------------------------------------------------
+Primary Purpose: Recognize entire page.  Sets font and ascii id of
+each component by running recognize(linenum) on every line in order.
+Return Value: always VALID; the per-line results are not inspected
+Constraints: extractComponents must be run first.
+See recognize(line) below for more detailed info
+Rev: KM
+---------------------------------------------------------------*/
+{
+  printf("Recognizing document\n");
+  last_status = 0.0;
+
+  int lineIdx = 0;
+  while (lineIdx < fnumLines)
+    {
+      if(ENABLE_USER_INTERFACE)
+        set_recognize_status(lineIdx, fnumLines);
+      recognize(lineIdx++);
+    }
+  last_status = 0.0;
+  return VALID;
+}
+
+
+MapStatus Page::recognize(int linenum)
+/*--------------------------------------------------------------
+Primary Purpose: Recognize a line of connected components
+Arguments: linenum is line number to recognize
+Effects: sets ascii identification fontid and confidence in each component
+If confidence is low and the component is wider than 2*MinWidth,
+divideAndRecognize is called to try to split up the component.
+Return Value: always VALID
+Constraints: extractComponents must be run first
+Rev: KM 11/9/95
+---------------------------------------------------------------*/
+{
+  ListElement * node = line(linenum)->first;
+  while (node != NULL)
+    {
+      Component * c = (Component *) node->item;
+      Distance dist = c->recognize(LearnedGroups);
+
+      // A poor match on a really wide component may be two characters
+      // run together, so try splitting it.
+      bool lowConfidence = c->confid() < ConfidenceThreshold;
+      if (lowConfidence && c->width() > 2*MinWidth)
+        divideAndRecognize(line(linenum), node, dist);
+
+      /*** uniteAndRecognize pass, currently disabled:
+      if (c->confid() < ConfidenceThreshold ||
+          (node != line(linenum)->first &&
+          ((Component *) node->previous->item)->confid() < ConfidenceThreshold))
+        uniteAndRecognize(line(linenum), node, dist);
+        ***/
+      node = node->next;
+    }
+  return VALID;
+}
+
+
+
+void Page::divideAndRecognize (Components *list, ListElement * ptr, Distance d)
+/*--------------------------------------------------------------
+Primary Purpose: Identify and separate merged characters
+Arguments:list is the component list that ptr belongs to
+ ptr is a pointer to a list element containing a component
+ d is the current recognition distance on the component
+Effects: Subdivides component into two parts, Division is made at
+ the minimum vertical height of the component. If the
+ minHeight >= JoinTolerance no division will be made.
+ (JoinTolerance is a global var that determines
+ the maximum number of merged pixels that are allowed in a
+ column for a division to be made)
+ When a division is made, the component's boundaries are
+ adjusted and a new component is inserted into the list; if
+ recognition does not improve, the original box is restored.
+ Returns without doing anything if confidence is already above
+ ConfidenceThreshold or width of component is < MinWidth*2
+Rev: KM 11/24/95
+---------------------------------------------------------------*/
+{
+ Component * comp = (Component *) ptr->item;
+ Component * newComp;
+ bool allGroups = TRUE;
+
+ // Save the original component boundaries just in case we cant improve
+ Point oldlr = comp->lr();
+ Point oldul = comp->ul();
+ int oldwidth = (int) comp->width();
+
+ // Some easy access x,y coordinates
+ int ulx = comp->ul().x();
+ int uly = comp->ul().y();
+ int lrx = comp->lr().x();
+ int lry = comp->lr().y();
+
+ Distance newdist, bestdist;
+ int bestlrx;
+ // NOTE(review): newComp and bestdist are declared but never used below.
+ if (comp->confid() > ConfidenceThreshold)
+ return;
+
+ if (oldwidth < MinWidth*2) // cant be split in two
+ {
+ return;
+ }
+
+ // Determine where to split. Split at the thinnest point
+ // within JoinTolerance (maximum number of pixels that might be fused)
+
+ int minHeight = (int)comp->height();
+ bestlrx = comp->lr().x();
+ for(int i = MinWidth; i < oldwidth - MinWidth; i++)
+ {
+ // pixel count of the one-column region at x = ulx+i
+ int newHeight =
+ fBitMap->pixelsInRegion(Point(ulx+i,uly), Point(ulx+i,lry));
+ if (newHeight < minHeight)
+ {
+ minHeight = newHeight;
+ bestlrx = ulx+i;
+ }
+ }
+// printf("bestlrx = %d, minHeight = %d\n", bestlrx, minHeight);
+ // Split only if a thinner column was found left of the right edge
+ // and its pixel count is below JoinTolerance.
+ if (bestlrx < lrx && minHeight < JoinTolerance)
+ {
+ comp->lr().x() = bestlrx;
+ int shrunk = comp->vertShrink(fBitMap);
+ comp->setProperties(fBitMap);
+ if (shrunk) // ignore group if we had to shrink down
+ newdist = comp->recognize(LearnedGroups, allGroups);
+ else
+ newdist = comp->recognize(LearnedGroups);
+
+// printf("Distance = %u asciiid = %c \n", newdist, comp->asciiId());
+ // The part to the right of the split becomes a trial component.
+ Component * newcomp = new Component(Point(bestlrx+1, oldul.y())
+ , oldlr);
+ newcomp->vertShrink(fBitMap);
+ newcomp->setProperties(fBitMap);
+ int newcompdist = newcomp->recognize(LearnedGroups,allGroups);
+ // NOTE(review): newcompdist is never read; only newcomp->confid() is used.
+ if ((newdist < d) && (newcomp->confid() > ConfidenceThreshold*.6))
+ {
+ list->insertAfter(ptr, newcomp); // accept the split
+ newcomp->display_bounding_box("red"); // NOTE(review): no ENABLE_USER_INTERFACE test here
+ comp->display_bounding_box("red");
+ }
+ else
+ { // reject the split: restore the original box and identification
+ comp->ul() = oldul;
+ comp->lr() = oldlr;
+ comp->setProperties(fBitMap);
+ comp->recognize(LearnedGroups);
+ delete newcomp;
+ }
+ return;
+ }
+
+
+ return;
+
+}
+
+
+void Page::uniteAndRecognize (Components *list, ListElement * ptr, Distance d)
+/*--------------------------------------------------------------
+Primary Purpose: Identify and merge a separated character
+Arguments:ptr is a pointer to a list element containing a component
+           d is the current recognition distance on the component
+Effects: Tries the box from the previous component's upper left to this
+         component's lower right as one component.  If that is recognized
+         with a distance below d and confidence above ConfidenceThreshold
+         it replaces both elements in list; otherwise it is discarded.
+Rev: 5/6/96
+---------------------------------------------------------------*/
+{
+  ListElement * before = ptr->previous;
+  if (before == NULL) return;
+  Component * part1 = (Component *) before->item;
+  Component * part2 = (Component *) ptr->item;
+
+  Point ul = part1->ul();
+  Point lr = part2->lr();
+  if (ul.y() > lr.y() || ul.x() > lr.x())
+    return;
+
+  Component * merged = new Component(ul, lr);
+  merged->setProperties(fBitMap);
+
+  // Combine the charGroup values of the two parts, capped at 4.
+  if (part1->charGroup <= 3 && part2->charGroup <= 3)
+    merged->charGroup = (part1->charGroup | part2->charGroup);
+  else
+    merged->charGroup = (((part1->charGroup == 4) ? part2->charGroup : part1->charGroup) | 2);
+  if (merged->charGroup > 4) merged->charGroup = 4;
+
+  int newdist = merged->recognize(LearnedGroups);
+  if (!(newdist < d && merged->confid() > ConfidenceThreshold))
+    {
+      delete merged;
+      return;
+    }
+
+  list->removeAt(before);
+  list->insertAfter(ptr, merged);
+  list->removeAt(ptr);
+}
+
+
+int Page::writeWordPos(char * filename)
+/*--------------------------------------------------------------
+Primary Purpose: Writes word position, confidence, length and string to file
+Arguments: output file name
+Return Value: 1 if successful. 0 if an error occured or if there is no
+   word list yet (fWordList is NULL until extractWords creates it)
+Effects: Calls fWordList->writeWordPos.  Output format for each word:
+    "%6d %6d %6d %6d %s\n", word->ul.x(), word->ul.y(),
+    word->confid, word->charCount, word->characters
+Rev: 11/25/95
+---------------------------------------------------------------*/
+{ return (fWordList != NULL) ? fWordList->writeWordPos(filename) : 0; }
+
+int Page::writeWordbox(char * filename, int xoffset= 0, int yoffset = 0,
+ bool equationsOnly = FALSE)
+/*--------------------------------------------------------------
+Primary Purpose: Write out word to scanworks wordbox file
+Arguments: output file, xoffset, yoffset, equationsOnly bool if we only want
+equations.
+Return Value: result of fWordList->writeWordbox; 0 if no word list exists yet
+Effects: calls fWordList->writeWordbox
+    // output format for each word
+    "%s %d %d %d %d %d %d %d % \n",
+    word->characters,
+    word->ul.x(), word->ul.y(),
+    word->lr.x(), word->lr.y(),
+    word->lr.x(), word->ul.y(),
+    word->ul.x(), word->lr.y() );
+    New line between lines of text
+Rev: 11/25/95
+---------------------------------------------------------------*/
+{ return (fWordList != NULL) ? fWordList->writeWordbox(filename, xoffset, yoffset, this, equationsOnly) : 0; }
+
+
+int Page::writeAscii(char * filename)
+/*--------------------------------------------------------------
+Primary Purpose: Write word list to ascii file
+Arguments: filename to write to
+Return Value: 1 if successful 0 if unsuccessful (also 0 when there is
+no word list yet, i.e. fWordList is still NULL)
+Effects: Calls fWordList->writeAscii(filename)
+Writes words to the file in text format using MinLineSize
+to differentiate lines.
+Rev: 11/25 KM
+---------------------------------------------------------------*/
+{ return (fWordList != NULL) ? fWordList->writeAscii(filename) : 0; }
+
+
+
+int Page::addEquation(int startline, int startcol, int endline, int endcol)
+/*--------------------------------------------------------------
+Primary Purpose: Add an equation to the equation list
+Arguments: boundaries of equation.  Return Value: always 1
+Effects: Adds an element to fEqnList, inserted with startline as the sort key
+Rev: 4/21/96
+---------------------------------------------------------------*/
+{
+  fEqnList->SortedInsert(new EqnMarker(startline, startcol, endline, endcol), startline);
+  return 1;     // declared int, but used to return nothing
+}
+
+int Page::deleteEquation(int col, int row)
+/*--------------------------------------------------------------
+Primary Purpose: deletes equations with this coordinate.
+Arguments: coordinate of equation to remove
+Return Value: 1 if element was remove, 0 otherwise
+Effects: removes the first equation in fEqnList containing this
+         coordinate and reports it to the UI through setTclDeleteVars
+Rev: 4/21/96
+---------------------------------------------------------------*/
+{
+  // first determine line number.
+  int linenum = -1;
+  for (int i = 0; i < fnumLines; i++)
+    if (flineinfo[i].fstartrow <= row && flineinfo[i].fendrow >= row)
+      {
+        linenum = i;
+        break;
+      }
+
+  // The row is not on any text line, so no equation can contain it.
+  // (linenum used to be read uninitialised in this case.)
+  if (linenum < 0)
+    return 0;
+
+  for(ListElement *ptr = fEqnList->first; ptr != NULL; ptr = ptr->next)
+    {
+      EqnMarker * eqn = (EqnMarker *) ptr->item;
+      bool hit;
+      if (linenum == eqn->startline && linenum == eqn->endline)
+        {
+          // single-line equation: the column must lie inside it
+          hit = (col >= eqn->startcol && col <= eqn->endcol);
+        }
+      else if (linenum == eqn->startline)
+        {
+          // first line of a multi-line equation: at or right of the start
+          hit = (col >= eqn->startcol);
+        }
+      else if (linenum > eqn->startline && linenum < eqn->endline)
+        {
+          // interior line of a multi-line equation: any column
+          hit = TRUE;
+        }
+      else
+        {
+          // last line of a multi-line equation: at or left of the end
+          hit = (linenum == eqn->endline && col <= eqn->endcol);
+        }
+      if (hit)
+        {
+          // Report to Tcl while eqn is still alive; the marker used to be
+          // deleted first and then read inside setTclDeleteVars.
+          setTclDeleteVars(eqn);
+          delete eqn;
+          fEqnList->removeAt(ptr);
+          return 1;
+        }
+    }
+  return 0;
+}
+
+void Page::setTclDeleteVars(EqnMarker * eqn)
+/*--------------------------------------------------------------
+Primary Purpose: Report the extent of an equation that is being
+   deleted by issuing "set" commands through docommand
+Arguments: eqn - the equation marker whose boundaries are reported
+Effects: Does nothing unless ENABLE_USER_INTERFACE is set.
+   Otherwise sets deleted, the cur* variables (from the equation's
+   end line and end column) and the prev* variables (from its start
+   line and start column).
+---------------------------------------------------------------*/
+{
+  // Nothing to report when the user interface is disabled.
+  if (!ENABLE_USER_INTERFACE)
+    return;
+
+  docommand("set deleted 1");
+
+  // "cur" variables describe where the equation ends.
+  docommand("set curline %d",eqn->endline);
+  docommand("set curline_startrow %d",flineinfo[eqn->endline].fstartrow);
+  docommand("set curline_endrow %d",flineinfo[eqn->endline].fendrow);
+  docommand("set curx %d", eqn->endcol);
+
+  // "prev" variables actually hold the starting line; naming them
+  // this way allows the same tcl add-equation code to be reused.
+  docommand("set prevline %d",eqn->startline);
+  docommand("set prevline_startrow %d",flineinfo[eqn->startline].fstartrow);
+  docommand("set prevline_endrow %d",flineinfo[eqn->startline].fendrow);
+  docommand("set prevx %d", eqn->startcol);
+
+  // Column limits span the full page width; this will change with zoning.
+  docommand("set curline_startcol %d",0);
+  docommand("set curline_endcol %d",get_width());
+}
+
+Component * Page::compAt(Point p)
+/*--------------------------------------------------------------
+Primary Purpose: Look up the line containing point p and ask that
+   line's component list (Components::compAt) for the component
+   at p
+Return Value: Pointer to the component, or NULL if p is on no line
+   or the line's list reports no component there
+Effects: Prints the outcome of the lookup to stdout.  Goes through
+   get_linenum(col, row), so any effects of that call apply too.
+Rev: 4/25/96
+---------------------------------------------------------------*/
+{
+  const int linenum = get_linenum(p.x(), p.y() );
+
+  Component * found = NULL;
+  if (linenum >= 0)
+    found = line(linenum)->compAt(p);
+
+  if (found != NULL)
+    {
+      printf("Component found at ( %d, %d)\n ul = (%d, %d) lr = (%d, %d)\n "
+             , p.x(), p.y(),found->ul().x(),found->ul().y(),
+             found->lr().x(),found->lr().y());
+    }
+  else
+    {
+      printf("No component found at ( %d, %d)\n ", p.x(), p.y());
+    }
+
+  return found;
+}
+
+
+bool Page::inEquation(int col, int row)
+/*--------------------------------------------------------------
+Primary Purpose: Determine whether coordinate (col, row) lies inside
+   a marked equation
+Arguments: col, row - x,y coordinates to test
+Return Value: true if some equation in fEqnList covers the
+   coordinate, false otherwise
+Effects: Uses get_linenum(col, row) to find the line, then scans
+   fEqnList.  Any effects of get_linenum apply here as well.
+Rev: 11/25/95
+---------------------------------------------------------------*/
+{
+  // Translate the row into a line number first.
+  const int linenum = get_linenum(col, row);
+
+  ListElement * ptr = fEqnList->first;
+  while (ptr != NULL)
+    {
+      EqnMarker * eqn = (EqnMarker *) ptr->item;
+      ptr = ptr->next;
+
+      const bool onFirstLine = (linenum == eqn->startline);
+      const bool onLastLine = (linenum == eqn->endline);
+
+      if (onFirstLine && onLastLine)
+        {
+          // Equation confined to one line: only the columns decide.
+          if (col >= eqn->startcol && col <= eqn->endcol)
+            return true;
+          continue;
+        }
+
+      // First line of a multi-line equation, from startcol onwards.
+      if (onFirstLine && col >= eqn->startcol)
+        return true;
+
+      // Any interior line is entirely inside the equation.
+      if (linenum > eqn->startline && linenum < eqn->endline)
+        return true;
+
+      // Last line of a multi-line equation, up to endcol.
+      if (onLastLine && col <= eqn->endcol)
+        return true;
+    }
+
+  return false;
+}
+
+bool Page::inEquation(ListElement * comp)
+/*--------------------------------------------------------------
+Primary Purpose: Determine whether the component held by this list
+   element is in an equation
+Arguments: comp - a list element taken from a component list
+Return Value: true if in an equation, false otherwise
+Effects: Tests the component's upper-left corner with
+   inEquation(col, row), which does the real work
+Rev: 4/21/96
+---------------------------------------------------------------*/
+{
+  Component * target = (Component *) comp->item;
+
+  // Only the upper-left corner of the component is tested.
+  const int col = target->ul().x();
+  const int row = target->ul().y();
+
+  return inEquation(col, row);
+}
+
+
+int Page::writeEquations(char * filename, int lineOffset)
+/*--------------------------------------------------------------
+Primary Purpose: Write the boundaries of every equation to a file
+Arguments: filename - output file name (opened for writing)
+           lineOffset - added to each start and end line number
+              before it is written
+Return Value: 1 if the file was opened and written, 0 if it could
+   not be opened
+Effects: Writes one text line per equation in fEqnList:
+   startline startcol endline endcol
+   each field right-aligned in a width of 6.
+Rev: 11/25/95
+---------------------------------------------------------------*/
+{
+  FILE * outfile = fopen(filename, "w");
+  if (!outfile)
+    {
+      printf("Error openning %s", filename);
+      return 0;
+    }
+
+  ListElement * ptr = fEqnList->first;
+  while (ptr != NULL)
+    {
+      EqnMarker * eqn = (EqnMarker *) ptr->item;
+      fprintf(outfile, " %6d %6d %6d %6d\n",
+              eqn->startline+lineOffset, eqn->startcol,
+              eqn->endline+lineOffset, eqn->endcol);
+      ptr = ptr->next;
+    }
+
+  fclose(outfile);
+  return 1;
+}
+
+void Page::join(Component * a, Component * b)
+/*--------------------------------------------------------------
+Primary Purpose: Merge two components on the same line into one
+Arguments: a, b - the components to merge
+Effects: Does nothing if a and b are the same object.  Otherwise
+   the component with the lower address is kept (it absorbs the
+   other through Component::join) and the other is removed from
+   the component list of its line.
+Constraints: both components must be on the same line (asserted)
+---------------------------------------------------------------*/
+{
+  if (a == b)
+    return;
+
+  // Keep whichever pointer compares lower; the other one is absorbed.
+  Component * primary = a;
+  Component * secondary = b;
+  if (!(a < b))
+    {
+      primary = b;
+      secondary = a;
+    }
+
+  assert(primary != secondary);
+  assert(get_linenum(a) == get_linenum(b));
+
+  primary->join(secondary);
+
+  // The absorbed component no longer belongs in its line's list.
+  int linenum = get_linenum(secondary);
+  line(linenum)->removeElement(secondary);
+}
+
+
+
+
+
+int Page::thinnestHorizontalSplit(Components * complist,
+                                  ListElement * compptr)
+/*--------------------------------------------------------------
+Primary Purpose: Splits this component at its thinnest column
+Arguments: complist - the component list that contains the component
+           compptr - pointer to the component's list element
+Return Value: 1 if the split was performed, 0 otherwise (the result
+   of horizontalCompSplit)
+Effects: Adds a new component to the list via horizontalCompSplit.
+   Candidate columns are those at least MinWidth away from either
+   side of the component.  If no candidate column is thinner than
+   the component's height, the split is requested at the
+   component's right edge.
+Constraints:
+Rev: 4/26
+---------------------------------------------------------------*/
+{
+  Component * comp = (Component *) compptr->item;
+
+  // Some easy access x,y coordinates
+  int ulx = comp->ul().x();
+  int uly = comp->ul().y();
+  int lry = comp->lr().y();
+
+  // Determine where to split: the column with the smallest value
+  // reported by pixelsInRegion wins.
+  int minHeight = (int) comp->height();
+  int oldwidth = (int) comp->width();
+  int bestlrx = comp->lr().x();
+
+  // MinWidth is the minimum width of a learned character
+  for (int i = MinWidth; i < oldwidth - MinWidth; i++)
+    {
+      int newHeight =
+        fBitMap->pixelsInRegion(Point(ulx+i,uly), Point(ulx+i,lry));
+      if (newHeight < minHeight)
+        {
+          minHeight = newHeight;
+          bestlrx = ulx+i;
+        }
+    }
+
+  // printf("bestlrx = %d, minHeight = %d\n", bestlrx, minHeight);
+
+  // The function is declared int and documented to report whether
+  // the split happened, so hand back the result of the split itself
+  // instead of falling off the end (undefined behaviour).
+  return horizontalCompSplit(complist, compptr, bestlrx);
+}
+
+int Page::thinnestHorizontalSplit(Component * comp)
+/*--------------------------------------------------------------
+Primary Purpose: Convenience overload: locate comp in the component
+   list of its line and split it at its thinnest point
+Arguments: comp - the component to split
+Return Value: 1 if comp was found in its line's component list
+   (the split is then attempted), 0 if it was not found
+Effects: Calls thinnestHorizontalSplit(complist, listelement)
+---------------------------------------------------------------*/
+{
+  Components * complist = fLineComponents[get_linenum(comp)];
+
+  // Walk the list until the element holding comp is found.
+  ListElement * ptr = complist->first;
+  while (ptr != NULL && (Component *) (ptr->item) != comp)
+    ptr = ptr->next;
+
+  if (ptr == NULL)
+    return 0;
+
+  thinnestHorizontalSplit(complist, ptr);
+  return 1;
+}
+
+int Page::horizontalCompSplit(Components * complist,
+                              ListElement * compptr, int x)
+/*--------------------------------------------------------------
+Primary Purpose: Split the component in this list element into two
+   components at the indicated x coordinate
+Arguments: complist - list that contains the component
+           compptr - list element holding the component
+           x - x coordinate of the split; the new component
+              starts at x, the old one ends at x-1
+Return Value: 1 if the split is performed, 0 otherwise
+Effects: Adds a new element to the list right after compptr.  Both
+   halves are vertically shrunk, have their properties recomputed
+   and are re-recognized.  Bounding boxes are redrawn.
+Constraints: ul().x() <= x <= lr().x() of the component
+Rev: 4/26/96
+---------------------------------------------------------------*/
+{
+  Component * comp = (Component *) compptr->item;
+  bool allGroups = TRUE;
+  comp->display_bounding_box("white");
+
+  // Refuse a split position outside the component.
+  const bool outOfRange = (x < comp->ul().x() || x > comp->lr().x());
+  if (outOfRange)
+    {
+      cout << " Cant split component " << x << "is not between"
+           << comp->ul().x() << "and" << comp->lr().x() << endl;
+      return 0;
+    }
+
+  // The right-hand piece takes over the old lower-right corner;
+  // the original component is then cut back to end just before x.
+  Component * rightHalf = new Component(Point(x,comp->ul().y()),
+                                        comp->lr());
+  comp->lr().x() = x-1;
+
+  int leftShrunk = comp->vertShrink(fBitMap);
+  comp->setProperties(fBitMap);
+  if (leftShrunk)
+    comp->recognize(LearnedGroups, allGroups);
+  else
+    comp->recognize(LearnedGroups);
+
+  int rightShrunk = rightHalf->vertShrink(fBitMap);
+  rightHalf->setProperties(fBitMap);
+  if (rightShrunk)   // ignore group if shrunk
+    rightHalf->recognize(LearnedGroups, allGroups);
+  else
+    rightHalf->recognize(LearnedGroups);
+
+  complist->insertAfter(compptr, rightHalf);
+  comp->display_bounding_box("blue");
+  rightHalf->display_bounding_box("blue");
+
+  return 1;
+}
+
+
+// Builds a zoned page: default-constructs the Page base and
+// allocates a fresh Zones object for fzones.
+ZonedPage::ZonedPage()
+  : Page()
+{
+  fzones = new Zones();
+}
+
+// Releases the zone list allocated by the constructor.
+ZonedPage::~ZonedPage()
+{
+  // Do not call ~Page() explicitly here: the base-class destructor
+  // runs automatically once this body finishes, so an explicit call
+  // would destroy the Page sub-object twice (or recurse back into
+  // this destructor if ~Page is virtual).
+  delete fzones;
+}
+
+// Accessor for the zone list allocated in the constructor.
+Zones * ZonedPage::zones()
+{
+  return this->fzones;
+}
+
+Page * ZonedPage::activate(int x, int y)
+/*--------------------------------------------------------------
+Primary Purpose: Activate the page of the zone found at Point(x,y)
+Arguments: x, y - coordinates used to look up the zone
+Return Value: the zone's page, or NULL if findZone returns no zone
+Effects: Sets cur_xoffset and cur_yoffset (via docommand) to the
+   zone's upper-left corner, and asks the zone to build its page
+   if it does not have one yet.
+---------------------------------------------------------------*/
+{
+  Zone * activeZone = zones()->findZone(x,y);
+
+  if (activeZone != NULL)
+    {
+      docommand("set cur_xoffset %d", activeZone->ul().x());
+      docommand("set cur_yoffset %d", activeZone->ul().y());
+
+      // Build the zone's page on first activation only.
+      if (activeZone->page() == NULL)
+        activeZone->buildPage(this);
+
+      return activeZone->page();
+    }
+
+  return NULL;
+}
+
+void ZonedPage::autoZone(int horizMerge, int vertMerge)
+/*--------------------------------------------------------------
+Primary Purpose: Tries to zone the page automatically
+Arguments: horizMerge - passed to extractComponents
+           vertMerge - a component is merged into another when
+              (its upper y - the other's lower y) <= vertMerge
+              and the two overlap in x
+Effects: Deletes any existing components and re-extracts them.
+   Repeatedly merges components (on the same or later lines) until
+   a full pass makes no change.  Every remaining component, grown
+   by one pixel on each side, becomes a new Zone appended to
+   zones() and is reported through the start_region / end_region
+   commands, scaled by ZONING_SCALE_FACTOR.
+---------------------------------------------------------------*/
+{
+  int changed = 1;
+
+  if (components() != NULL)
+    delete components();
+
+  extractComponents(horizMerge);
+
+  while(changed)
+    {
+      changed = 0;
+      for (int i=0; i < numLines(); i++)
+        {
+          for(ListElement * mptr = line(i)->first; mptr != NULL; mptr=mptr->next)
+            {
+              Component * mainitem = (Component *) mptr->item;
+              for (int j= i; j < numLines(); j++)
+                {
+                  ListElement * ptr = line(j)->first;
+                  while (ptr != NULL)
+                    {
+                      // Remember the successor before ptr may be
+                      // removed from the list; following ptr->next
+                      // after removeAt(ptr) is not safe.
+                      ListElement * next = ptr->next;
+
+                      Component * item = (Component *) ptr->item;
+                      if( (item->ul().y() - mainitem->lr().y()) <= vertMerge &&
+                          (mainitem != item) &&
+                          mainitem->xoverlap(item))
+                        {
+                          mainitem->join(item);
+                          (line(j))->removeAt(ptr);
+                          changed = 1;
+                        }
+
+                      ptr = next;
+                    }
+                }
+            }
+        }
+    }
+
+
+  for (int i=0; i < numLines(); i++)
+    {
+      for(ListElement * mptr = line(i)->first; mptr != NULL; mptr=mptr->next)
+        {
+          Component * mainitem = (Component *) mptr->item;
+          /* printf(" (ul(%d,%d) lr(%d,%d)) ", mainitem->ul().x(),
+             mainitem->ul().y(), mainitem->lr().x(), mainitem->lr().y());
+          */
+          /*
+          mainitem->display_bounding_box("blue",
+                                         ZONING_SCALE_FACTOR,
+                                         ".zoning_window.work_space");
+          */
+
+          // Zone rectangle: the component grown by one pixel all round.
+          Point ul = Point(mainitem->ul().x() -1,mainitem->ul().y() -1);
+          Point lr = Point( mainitem->lr().x() +1, mainitem->lr().y() +1);
+          docommand("start_region %d %d", (int)(ul.x()*ZONING_SCALE_FACTOR),
+                    (int)(ul.y()*ZONING_SCALE_FACTOR));
+
+          docommand("end_region %d %d", (int)(lr.x()*ZONING_SCALE_FACTOR),
+                    (int)(lr.y()*ZONING_SCALE_FACTOR));
+
+          Zone * newzone = new Zone(ul,lr);
+          zones()->Append(newzone);
+        }
+    }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+