|
1 /** Page.cc contains the member functions for the primary OCR class Page */ |
|
2 #include "system.h" |
|
3 #include "Page.h" |
|
4 #include "convertMap.h" |
|
5 #include "get_skew.h" |
|
6 #include "Component.h" |
|
7 #include "status_message.h" |
|
8 |
|
9 /*** Member functions of class Page. ***/ |
|
10 |
|
11 int Page::get_height() |
|
12 { |
|
13 return fRLEMap->imageLength(); |
|
14 } |
|
15 |
|
16 int Page::get_width() |
|
17 { |
|
18 return fRLEMap->imageWidth(); |
|
19 } |
|
20 |
|
21 int Page::get_linenum(int col, int row) |
|
22 /*-------------------------------------------------------------- |
|
23 Primary Purpose: Returns line number of x,y coordinates (just uses y for now) |
|
24 called from proc equation_mark in new_ui.tcl |
|
25 Return value: line number or -1 if no line is here. |
|
26 Requires: setLines be run first |
|
27 Rev: 4/21/96 |
|
28 ---------------------------------------------------------------*/ |
|
29 { |
|
30 assert (flineinfo != NULL); |
|
31 int linenum= -1; |
|
32 |
|
33 for (int i = 0; i < fnumLines; i++) |
|
34 if (flineinfo[i].fstartrow <= row && flineinfo[i].fendrow >= row) |
|
35 { |
|
36 linenum = i; |
|
37 if (ENABLE_USER_INTERFACE) |
|
38 { |
|
39 // save last mark before it is overwritten |
|
40 |
|
41 docommand("set curline %d",linenum); |
|
42 docommand("set curline_startrow %d",flineinfo[i].fstartrow); |
|
43 docommand("set curline_endrow %d",flineinfo[i].fendrow); |
|
44 |
|
45 |
|
46 // this will change with zoning |
|
47 docommand("set curline_startcol %d",0); |
|
48 docommand("set curline_endcol %d",get_width()); |
|
49 |
|
50 } |
|
51 |
|
52 break; |
|
53 |
|
54 } |
|
55 return linenum; |
|
56 |
|
57 } |
|
58 |
|
59 int Page::send_words_to_tcl() |
|
60 /*-------------------------------------------------------------- |
|
61 Primary Purpose: Display words in tcl |
|
62 Rev - AR |
|
63 ---------------------------------------------------------------*/ |
|
64 { |
|
65 int word_count = 0; |
|
66 int unknown_char_count = 0; |
|
67 int low_precision_count = 0; |
|
68 int mispelled_count = 0; |
|
69 char* send_chars; |
|
70 Word* temp_word; |
|
71 if(ENABLE_USER_INTERFACE) set_status("Displaying text"); |
|
72 for(ListElement* ptr = (words())->first; ptr != NULL; ptr = ptr->next) |
|
73 { |
|
74 word_count++; |
|
75 set_text_display_status(word_count, fWordList->num_words); |
|
76 temp_word = (Word*)ptr->item; |
|
77 send_chars = backslashify(temp_word->characters); |
|
78 /* printf("Added word %s Confidence = %d\n", send_chars, |
|
79 temp_word->confid); */ |
|
80 if(temp_word->confid < VERY_LOW_CONFIDENCE) |
|
81 { |
|
82 docommand("addword \"%s\" %d %d UNKNOWN_CHAR", send_chars, temp_word->ul.x(), temp_word->ul.y()); |
|
83 unknown_char_count++; |
|
84 } |
|
85 else if(temp_word->confid < LOW_CONFIDENCE) |
|
86 { |
|
87 docommand("addword \"%s\" %d %d LOW_PRECISION", send_chars, temp_word->ul.x(), temp_word->ul.y()); |
|
88 low_precision_count++; |
|
89 } |
|
90 else if((temp_word->mispelled) && SPELLCHECK) |
|
91 { |
|
92 docommand("addword \"%s\" %d %d MISPELLED", send_chars, temp_word->ul.x(), temp_word->ul.y()); |
|
93 mispelled_count++; |
|
94 } |
|
95 else |
|
96 { |
|
97 docommand("addword \"%s\" %d %d OK", send_chars, temp_word->ul.x(), temp_word->ul.y()); |
|
98 } |
|
99 update(); |
|
100 } |
|
101 if(ENABLE_USER_INTERFACE) |
|
102 { |
|
103 set_status("Done displaying text"); |
|
104 set_status("Apparent word accuracy: %.3lf%%", (100 - (100 * ((double)(mispelled_count + unknown_char_count + low_precision_count) / (double)word_count)))); |
|
105 } |
|
106 } |
|
107 |
|
108 |
|
109 int Page::deskew(int deskew_method) |
|
110 /*-------------------------------------------------------------- |
|
111 Primary Purpose: Deskew the page |
|
112 Arguments: 1 - RLE Rotation |
|
113 0 - BitMap Rotation |
|
114 Return Value: 1 if successful, 0 if unsuccessful |
|
115 Effects: updates the bitmap and rlemap of the page |
|
116 Constraints: RLEMap Rotation is not currently reliable and probably |
|
117 should not be used |
|
118 Rev: AR |
|
119 ---------------------------------------------------------------*/ |
|
120 { |
|
121 /* a little ugly.... if the page is rotated |
|
122 in here, return 1, else 0 */ |
|
123 |
|
124 if(deskew_method == RLE_DESKEW) |
|
125 { |
|
126 if(fRLEMap->deskew()) |
|
127 { |
|
128 convertMap(fRLEMap, fBitMap); |
|
129 return 1; |
|
130 } |
|
131 return 0; |
|
132 } |
|
133 else |
|
134 { |
|
135 double skew = get_skew(fRLEMap); |
|
136 if((skew >= MINIMUM_SKEW_ANGLE)||(skew <= - MINIMUM_SKEW_ANGLE)) |
|
137 { |
|
138 fBitMap->rotateMap(skew); |
|
139 convertMap(fBitMap, fRLEMap); |
|
140 return 1; |
|
141 } |
|
142 return 0; |
|
143 } |
|
144 } |
|
145 |
|
146 Page::Page() |
|
147 /**Page::Page - constructor allocates bitmap and rlemap*/ |
|
148 { |
|
149 fBitMap = new BitMap; |
|
150 fRLEMap = new RLEMap; |
|
151 fEqnList = new EqnMarkers; |
|
152 fLineComponents = NULL; |
|
153 fWordList = NULL; |
|
154 } |
|
155 |
|
156 Page::~Page() |
|
157 /*-------------------------------------------------------------- |
|
158 Primary Purpose: Destructor deallocates private fields that |
|
159 have been created. |
|
160 Rev: |
|
161 ---------------------------------------------------------------*/ |
|
162 { |
|
163 |
|
164 if (flineinfo) delete flineinfo; |
|
165 for (int i = 0; i <fnumLines; i++) |
|
166 if(fLineComponents[i] != NULL) delete fLineComponents[i]; |
|
167 if(fLineComponents) delete fLineComponents; |
|
168 if (fBitMap) delete fBitMap; |
|
169 if (fRLEMap) delete fRLEMap; |
|
170 if (fWordList) delete fWordList; |
|
171 if (fEqnList) delete fEqnList; |
|
172 } |
|
173 |
|
174 Angle Page::skewAngle() |
|
175 /*-------------------------------------------------------------- |
|
176 Primary Purpose: Determine the angle of rotation of the RLEMap r |
|
177 Arguments: pointer to an RLEMap |
|
178 Return Value: detected angle of rotation |
|
179 Code is in get_skew.cc |
|
180 Rev: AR |
|
181 ---------------------------------------------------------------*/ |
|
182 { |
|
183 return get_skew(fRLEMap); |
|
184 } |
|
185 |
|
186 |
|
187 MapStatus Page::readMap(char * filename) |
|
188 // Calls BitMap::readMap and then converts |
|
189 { |
|
190 MapStatus status; |
|
191 status = fBitMap->readMap(filename); |
|
192 convertMap(fBitMap, fRLEMap); |
|
193 return status; |
|
194 } |
|
195 |
|
196 |
|
197 |
|
198 MapStatus Page::setLines() |
|
199 /*-------------------------------------------------------------- |
|
200 Primary Purpose: Set flineinfo array in Page class with the |
|
201 starting and ending rows of each line of text. |
|
202 Also sets fnumLines to the number of lines |
|
203 Arguments: none |
|
204 Return Value: A Mapstatus either VALID, EMPTY if there is no |
|
205 data in the RLEMAP, or OTHERERROR if there is an unexpected error |
|
206 Effects: Allocates flineinfo and fills with starting and ending row |
|
207 of each line. The following global variables are used as parameters |
|
208 in this function. These are defined in system.cc |
|
209 NoiseTolerance - Rows whose number of pixels is less than this value |
|
210 will be considered empty (current val 6). |
|
211 MinVertSeparation - The minimum number of rows separating lines of text. |
|
212 Lines will be merged if actual Separation is less than this |
|
213 value. (current val 3) |
|
214 MinLineSize - The minimum number of rows in a line of text. |
|
215 Any smaller lines are discarded (currentval 5) |
|
216 |
|
217 Constraints: Page::readMap() must be run first to fill fRLEMap |
|
218 Rev: 10/26 KM |
|
219 ---------------------------------------------------------------*/ |
|
220 { |
|
221 |
|
222 int maxrow = fRLEMap->imageLength() - 1; // maximum row number |
|
223 int actualSeparation = MinVertSeparation + 1; // must be bigger than min |
|
224 // for line 0 |
|
225 |
|
226 int linenum=0; // current line number |
|
227 int prvlinenum = 0; |
|
228 int lineSize; // # rows in current line |
|
229 |
|
230 int maxLines = maxrow/MinLineSize; // max # of lines of text |
|
231 |
|
232 if(maxrow == 0) return EMPTY; |
|
233 |
|
234 flineinfo = new LineMarker[maxLines]; |
|
235 |
|
236 for (int i = 0; i < maxrow;) |
|
237 { |
|
238 LineMarker & thisLine = flineinfo[linenum]; |
|
239 LineMarker & prevLine = flineinfo[prvlinenum]; |
|
240 |
|
241 while (i < maxrow && fRLEMap->row(i)->numPixels < NoiseTolerance) |
|
242 i++; |
|
243 thisLine.fstartrow = i++; |
|
244 while (i < maxrow &&fRLEMap->row(i)->numPixels > NoiseTolerance) |
|
245 i++; |
|
246 |
|
247 |
|
248 lineSize = i - thisLine.fstartrow +1; |
|
249 |
|
250 // If this line is less than MinVertSeparation away |
|
251 // from the last line. Join the two together. |
|
252 if (linenum > 0) |
|
253 { |
|
254 actualSeparation = thisLine.fstartrow - prevLine.fendrow; |
|
255 } |
|
256 if (actualSeparation < MinVertSeparation) |
|
257 { |
|
258 // If too small of a separation, add into prev row |
|
259 prevLine.fendrow = i; |
|
260 } |
|
261 else if (lineSize >= MinLineSize) |
|
262 { |
|
263 thisLine.fendrow = i; |
|
264 /* printf (" Line %d Start: %d End: %d lineHeight %d\n", |
|
265 linenum,thisLine.fstartrow, |
|
266 thisLine.fendrow, |
|
267 thisLine.fendrow - thisLine.fstartrow +1); |
|
268 */ |
|
269 prvlinenum = linenum; |
|
270 linenum++; |
|
271 |
|
272 } |
|
273 if (linenum >= maxLines) return OTHERERROR; |
|
274 } |
|
275 |
|
276 fnumLines = linenum; // Set number of lines in page class |
|
277 |
|
278 |
|
279 if((ENABLE_USER_INTERFACE) && DISPLAY_LINE_BOUNDARIES) |
|
280 { |
|
281 display_line_boundaries(); |
|
282 } |
|
283 /* printf("Setlines found a total of %d lines.\n", fnumLines); */ |
|
284 if(ENABLE_USER_INTERFACE) |
|
285 update(); |
|
286 return VALID; |
|
287 } |
|
288 |
|
289 void Page::display_line_boundaries() |
|
290 /*-------------------------------------------------------------- |
|
291 Primary Purpose: Display line boundaries in TCL/TK. Called from |
|
292 setLines if ENABLE_USER_INTERFACE and DISPLAY_LINE_BOUNDARIES are |
|
293 set to TRUE |
|
294 Effects: Draws a blue line between each line of text |
|
295 Rev: AR |
|
296 ---------------------------------------------------------------*/ |
|
297 { |
|
298 int centerline, width; |
|
299 for(int j=0; j < fnumLines; j++) |
|
300 { |
|
301 centerline = (flineinfo[j].fendrow + flineinfo[j + 1].fstartrow) / 2; |
|
302 width = flineinfo[j + 1].fstartrow - flineinfo[j].fendrow; |
|
303 |
|
304 scale(centerline); |
|
305 scale(width); |
|
306 /* having this pathname here is probably not such a good idea...*/ |
|
307 |
|
308 docommand(".main_window.display.work_space create line %d %d %d %d -width %d -fill blue -tags {project_ray IMAGE_TAG} -stipple @/usr/sww/share/tclX-7.3a/tkX/3.6a/demos/bitmaps/grey.25", 0, centerline, bmap()->imageWidth(), centerline, width); |
|
309 } |
|
310 |
|
311 } |
|
312 |
|
313 |
|
314 int test_rlemap_lines(RLEMap* rmap) |
|
315 { |
|
316 int length = rmap->imageLength(); |
|
317 for(int i = 0; i < length; i++) |
|
318 printf("On line %d, numpixels = %d\n", i, rmap->fMapData[i]->numPixels); |
|
319 } |
|
320 |
|
321 |
|
322 MapStatus Page::extractComponents(int horizMerge) |
|
323 /*-------------------------------------------------------------- |
|
324 Component extraction routines. |
|
325 * |
|
326 * Given the top and bottom line of a row we want to generate a list of |
|
327 * components. The general method is to find the closest dot, trace its |
|
328 * connected dots, then project upwards and downwards and add anything we |
|
329 * find there to the component. We will erase the component from the RLEMap |
|
330 * as it is added to the component list. By projecting up and down |
|
331 * from the piece we first find we should be able |
|
332 * to completely encompass characters like :;i?|! The only problems are |
|
333 * italic or ligatured characters where we may pick up two or more |
|
334 * characters at a time (which would be bad) or characters fragmented |
|
335 * with a vertical gap. |
|
336 |
|
337 Primary Purpose: Main extraction routine. |
|
338 Effects: Makes new components and puts them in a list. Deletes components |
|
339 from RLE map. Fills in component boundaries and calls |
|
340 Component::setProperties to set the property vector |
|
341 Lastly convertMap is run to rebuild the RLEMap |
|
342 Constraints: Page::setLines() must be run first |
|
343 Rev: 4/28/96 |
|
344 ---------------------------------------------------------------*/ |
|
345 { |
|
346 int currentCol, startRow, endRow, rowHeight; |
|
347 ListElement* intrvl; |
|
348 ListElement* tempintrvl; |
|
349 /* printf("fnumLines = %d\n", fnumLines); */ |
|
350 Component* comp; |
|
351 int totalSpacing = 0; // total blank horizontal pixels between components |
|
352 int baselines[MaxVertSize]; // array for finding the baseline |
|
353 last_status = 0.0; |
|
354 int compCounter = 0; |
|
355 int i; |
|
356 int j; |
|
357 int upwardBound; // Projection distances different for equations |
|
358 int downwardBound; // and non-equations |
|
359 |
|
360 |
|
361 bool inEqn; // Variables for finding if the center of a comp |
|
362 int centerx; // is in an equation. |
|
363 int centery; |
|
364 |
|
365 |
|
366 |
|
367 printf("Extracting Components\n"); |
|
368 fLineComponents = new Components*[fnumLines]; |
|
369 for (i = 0; i < fnumLines; i++) { |
|
370 if(ENABLE_USER_INTERFACE) |
|
371 set_component_status(i, fnumLines); |
|
372 currentCol = 0; |
|
373 startRow = flineinfo[i].fstartrow; |
|
374 endRow = flineinfo[i].fendrow; |
|
375 rowHeight = endRow - startRow; |
|
376 assert(rowHeight > 0); |
|
377 |
|
378 for (j=0; j < MaxVertSize; j++) |
|
379 baselines[j] = 0; |
|
380 fLineComponents[i] = new Components(); |
|
381 |
|
382 |
|
383 while (currentCol<=fRLEMap->imageWidth()) { //until we reach the end of the page |
|
384 |
|
385 //Build component starting with closest black dot |
|
386 intrvl = fRLEMap->FindNearHorizDot(currentCol, startRow, endRow); |
|
387 if (intrvl == NULL) { |
|
388 // printf("Reached end of line\n"); |
|
389 break; |
|
390 } |
|
391 comp = new Component(); //Make a new component named comp |
|
392 assert(comp->AddToComponent(intrvl, fRLEMap, horizMerge)); |
|
393 |
|
394 //Now we want to extend upwards |
|
395 //First check if there is a blank space to the right |
|
396 tempintrvl = |
|
397 fRLEMap->FindNearHorizDot(comp->lr().x(), startRow, endRow); |
|
398 |
|
399 |
|
400 if (tempintrvl != NULL && ((RLEPair*) tempintrvl->item)->start > |
|
401 comp->lr().x()+horizMerge+1) |
|
402 while (comp->ul().y() < endRow) { |
|
403 |
|
404 // find the center of the component to check if we are in an equation |
|
405 centerx = (comp->ul().x() + comp->lr().x())/2; |
|
406 centery = (comp->ul().y() + comp->lr().y())/2; |
|
407 inEqn = inEquation(centerx, centery); |
|
408 // Determine projection distance. Only project for non Equations. |
|
409 if(inEqn) |
|
410 { |
|
411 upwardBound = comp->ul().y()+1; |
|
412 downwardBound = comp->lr().y() - 1; |
|
413 } |
|
414 else |
|
415 { |
|
416 upwardBound = startRow; |
|
417 downwardBound = endRow; |
|
418 } |
|
419 intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), |
|
420 comp->lr().x(), comp->lr().y(), |
|
421 upwardBound); |
|
422 // startRow); |
|
423 if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap, |
|
424 horizMerge))) |
|
425 break; |
|
426 if (intrvl == NULL) break; |
|
427 } |
|
428 else |
|
429 while (comp->ul().y() < endRow) { |
|
430 |
|
431 // find the center of the component to check if we are in an equation |
|
432 centerx = (comp->ul().x() + comp->lr().x())/2; |
|
433 centery = (comp->ul().y() + comp->lr().y())/2; |
|
434 inEqn = inEquation(centerx, centery); |
|
435 // Determine projection distance. Only project for non Equations. |
|
436 if(inEqn) |
|
437 { |
|
438 upwardBound = comp->ul().y()+1; |
|
439 downwardBound = comp->lr().y() - 1; |
|
440 } |
|
441 else // regular text |
|
442 { |
|
443 upwardBound = startRow; |
|
444 downwardBound = endRow; |
|
445 } |
|
446 |
|
447 intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), |
|
448 comp->lr().x(), comp->ul().y(), |
|
449 upwardBound); |
|
450 // startRow); |
|
451 if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap, |
|
452 horizMerge))); |
|
453 break; |
|
454 if (intrvl == NULL) break; |
|
455 } |
|
456 |
|
457 //Now we want to extend downwards |
|
458 while (comp->lr().y() > startRow) { |
|
459 intrvl = fRLEMap->FindNearVertDot(comp->ul().x(), comp->lr().x(), |
|
460 comp->lr().y(), downwardBound); |
|
461 if ((intrvl != NULL) && (!comp->AddToComponent(intrvl, fRLEMap, |
|
462 horizMerge))) |
|
463 break; |
|
464 if (intrvl == NULL) break; |
|
465 } |
|
466 |
|
467 // Now we toss out the noise |
|
468 int size; |
|
469 if (comp != NULL) { |
|
470 if (comp->ul() < Point(0,0)) |
|
471 printf("Here's a problem. %d, %d\n", comp->ul().x(), comp->ul().y()); |
|
472 else |
|
473 size = fBitMap->pixelsInRegion(comp->ul(), comp->lr()); |
|
474 } |
|
475 else |
|
476 size = 0; |
|
477 Component * prev = (Component *)(fLineComponents[i]->last->item); |
|
478 if (size < MinComponentSize) { |
|
479 // printf("Deleting some noise of size %d\n", size); |
|
480 // printComponent(comp); |
|
481 delete comp; |
|
482 comp = NULL; |
|
483 } |
|
484 else if (prev != NULL && |
|
485 abs(comp->ul().x() - prev->ul().x()) <= 1 && |
|
486 abs(comp->lr().x() == prev->lr().x()) <= 1) |
|
487 { |
|
488 // Check and see if this and the previous component have the |
|
489 // same x boundaries, if so merge the two. Good for = and : |
|
490 prev->join(comp); |
|
491 prev->setProperties(fBitMap); |
|
492 delete comp; |
|
493 comp ==NULL; |
|
494 } |
|
495 else |
|
496 { |
|
497 compCounter++; |
|
498 // display a rectangle around the component |
|
499 if(ENABLE_USER_INTERFACE) |
|
500 { |
|
501 if(DISPLAY_BOUNDING_BOXES) |
|
502 comp->display_bounding_box(); |
|
503 } |
|
504 |
|
505 // JMH - make an array of frequency of the y coord of bottom of comp |
|
506 int vertOffset = endRow - comp->lr().y(); |
|
507 if(vertOffset < MaxVertSize && vertOffset >= 0) |
|
508 baselines[vertOffset]++; |
|
509 |
|
510 |
|
511 comp->setProperties(fBitMap); |
|
512 if(fLineComponents[i]->last != NULL) |
|
513 { |
|
514 int thisSpacing = comp->ul().x() - |
|
515 ((Component *) (fLineComponents[i]->last->item))->lr().x(); |
|
516 // if a realy big space, make space the width of this comp |
|
517 if (thisSpacing > 200) |
|
518 thisSpacing = 2*(comp->lr().x() - comp->ul().x()); |
|
519 totalSpacing += thisSpacing; |
|
520 } |
|
521 |
|
522 fLineComponents[i]->Append(comp); // add this component to list |
|
523 currentCol = (comp->ul()).x() + 1; // update position on page |
|
524 } |
|
525 } |
|
526 |
|
527 // find most popular bottom of comp and call it the baseline |
|
528 int counter = 0; |
|
529 int baseline; |
|
530 for (j=0; j < MaxVertSize; j++) { |
|
531 if (counter < baselines[j]) { |
|
532 counter = baselines[j]; |
|
533 baseline = endRow - j; |
|
534 } |
|
535 } |
|
536 // printf("For row %d to %d baseline = %d\n", startRow, endRow, baseline); |
|
537 // Now assign each character a group based on it's location |
|
538 for (ListElement* ptr = fLineComponents[i]->first; ptr != NULL; |
|
539 ptr = ptr->next) { |
|
540 comp = (Component*) ptr->item; |
|
541 comp->charGroup = 0; |
|
542 |
|
543 // if top of char is higher than top - tolerance |
|
544 if (comp->ul().y() < startRow + (rowHeight/TopLineTolerance)) { |
|
545 comp->charGroup += 2; //tall like a T |
|
546 } |
|
547 |
|
548 // if bottom of char is lower than base - tolerance |
|
549 if (comp->lr().y() > baseline + (rowHeight/BaseLineTolerance)) { |
|
550 comp->charGroup += 1; //has a tail like a y |
|
551 } else |
|
552 if (comp->lr().y() < (baseline - (2*rowHeight/BaseLineTolerance))) { |
|
553 comp->charGroup = 4; //floating like a ' |
|
554 /* printf("bottom at %d < %d\n", comp->lr().y(), |
|
555 baseline - (2*rowHeight/BaseLineTolerance)); */ |
|
556 } |
|
557 // printf("added character in group %d\n", comp->charGroup); |
|
558 } |
|
559 } |
|
560 /* printf("Found %d components on this page.\n", compCounter); */ |
|
561 // printComponents(); |
|
562 last_status = 0.0; |
|
563 if(ENABLE_USER_INTERFACE) |
|
564 set_status("Done extracting characters"); |
|
565 if((compCounter - fnumLines) > 0) /* don't want divide by zero */ |
|
566 { |
|
567 favgSpacing = totalSpacing / (compCounter - fnumLines); |
|
568 } |
|
569 else |
|
570 { |
|
571 favgSpacing = 1; |
|
572 } |
|
573 delete fRLEMap; |
|
574 fRLEMap = new RLEMap; |
|
575 convertMap(fBitMap, fRLEMap); |
|
576 } |
|
577 |
|
578 void Page::printComponents() |
|
579 /*-------------------------------------------------------------- |
|
580 Primary Purpose: Debugging routine that prints little bitmaps |
|
581 of low confidence characters |
|
582 ---------------------------------------------------------------*/ |
|
583 { |
|
584 int compcounter = 0; |
|
585 for (int i = 0; i < fnumLines; i++) { |
|
586 Component* comp; |
|
587 for (ListElement* ptr = fLineComponents[i]->first; ptr != NULL; |
|
588 ptr = ptr->next) { |
|
589 compcounter++; |
|
590 comp = (Component *) ptr->item; |
|
591 if (comp->confid() < (ConfidenceThreshold-20) && comp->asciiId() == 'n') |
|
592 { |
|
593 printf("Here's a poorly recognized component ul=%d,%d, lr=%d,%d.\n\n", |
|
594 (comp->ul()).x(), (comp->ul()).y(), |
|
595 (comp->lr()).x(), (comp->lr()).y()); |
|
596 printComponent(comp); |
|
597 printf("properties: "); |
|
598 printVector(comp->properties(), numProperties); |
|
599 printf("I think it's a -> %c <- confidence: %d line: %d group: %d Comp#%d\n", |
|
600 comp->asciiId(), |
|
601 comp->confid(), i+1, comp->charGroup, compcounter); |
|
602 printf("\n*******************************************************\n"); |
|
603 } |
|
604 } |
|
605 } |
|
606 } |
|
607 |
|
608 void Page::printComponent(Component* comp) |
|
609 // Print a single component. |
|
610 { |
|
611 int right = comp->ul().x()+78; |
|
612 if (comp->lr().x() < right) |
|
613 right = comp->lr().x(); |
|
614 |
|
615 for (int r = comp->ul().y(); |
|
616 r <= comp->lr().y(); r++){ |
|
617 for (int c = comp->ul().x(); |
|
618 c <= right; c++) |
|
619 bitprint(fBitMap->row(r)[c/8], c%8); |
|
620 printf( "\n"); |
|
621 } |
|
622 } |
|
623 |
|
int spacing(ListElement * compa, ListElement * compb);
// helper function for extractWords (defined below)
// NOTE(review): this declares a free function, while the definition later
// in this file is the member Page::spacing -- confirm whether this
// declaration is still needed.
|
626 |
|
627 MapStatus Page::extractWords() |
|
628 /*-------------------------------------------------------------- |
|
629 Primary Purpose: Extract words from each lines components |
|
630 Effects: sets the fWordsList to be a list of all of the words |
|
631 in the document. |
|
632 Constraints: extractComponents must be run first |
|
633 Rev: KM 11/7/95 |
|
634 ---------------------------------------------------------------*/ |
|
635 { |
|
636 bool inWord; |
|
637 ListElement * start; // word Start |
|
638 int count; // counts the components in the word |
|
639 int wordlength; // counts the characters in the word |
|
640 int word_count = 0; |
|
641 int spacingThreshold = (int) (1.25 * ((float) (favgSpacing))); |
|
642 fWordList = new Words; |
|
643 last_status = 0.0; |
|
644 for (int i = 0; i < fnumLines; i++) |
|
645 { |
|
646 if(ENABLE_USER_INTERFACE) |
|
647 set_extract_status(i, fnumLines); |
|
648 inWord = FALSE; |
|
649 for(ListElement *ptr = line(i)->first; ptr != NULL; ptr = ptr->next) { |
|
650 Component * item = (Component *) ptr->item; |
|
651 if(!inWord) |
|
652 { |
|
653 start = ptr; |
|
654 count = 1; |
|
655 if (item->fasciiId == NULL) |
|
656 wordlength = 1; |
|
657 else |
|
658 wordlength = strlen(item->fasciiId); |
|
659 inWord = TRUE; |
|
660 } |
|
661 if( spacing(ptr, ptr->next) > spacingThreshold || |
|
662 inEquation( ptr)) |
|
663 { |
|
664 Word * newWord = new Word(start,count,wordlength); |
|
665 (words())->Append(newWord); |
|
666 if(1) |
|
667 printf("%s ",newWord->characters); |
|
668 inWord = FALSE; |
|
669 word_count++; |
|
670 } |
|
671 else |
|
672 count++; |
|
673 if (item->fasciiId == NULL) wordlength ++; |
|
674 else wordlength += strlen(item->fasciiId); |
|
675 } |
|
676 // Add in a separate word for new line |
|
677 Word * newWord = new Word("\n",2); |
|
678 (words())->Append(newWord); |
|
679 printf("%s", newWord->characters); |
|
680 word_count++; |
|
681 } |
|
682 last_status = 0.0; |
|
683 fWordList->num_words = word_count; |
|
684 if(ENABLE_USER_INTERFACE) |
|
685 set_status("Done extracting words"); |
|
686 return VALID; |
|
687 } |
|
688 |
|
689 void Page::spellcheck() |
|
690 /*-------------------------------------------------------------- |
|
691 Primary Purpose: Run spell checker on word list. |
|
692 Constraints: extractWords must be run first |
|
693 Rev: AR |
|
694 ---------------------------------------------------------------*/ |
|
695 { |
|
696 int word_count = 0; |
|
697 Word* temp_word; |
|
698 for(ListElement* ptr = (words())->first; ptr != NULL; ptr = ptr->next) |
|
699 { |
|
700 word_count++; |
|
701 if(ENABLE_USER_INTERFACE) |
|
702 set_spellcheck_status(word_count, fWordList->num_words); |
|
703 temp_word = (Word*)ptr->item; |
|
704 if(0) |
|
705 printf("Spellchecking word %s\n", temp_word->characters); |
|
706 if(mispelled(temp_word->characters)) |
|
707 { |
|
708 temp_word->mispelled = TRUE; |
|
709 } |
|
710 } |
|
711 } |
|
712 |
|
713 int Page::spacing(ListElement * compa, ListElement * compb) |
|
714 // spacing from end of comp_a to begining of comp_b |
|
715 { |
|
716 int x; |
|
717 if (compb == NULL) return 1000; // end of line |
|
718 |
|
719 Component * a = ((Component *) (compa)->item); |
|
720 Component * b = ((Component *) (compb)->item); |
|
721 int returnval = (b->ul().x() - a->lr().x()); |
|
722 if (returnval < 0) |
|
723 { |
|
724 return 0; |
|
725 } |
|
726 assert (returnval >= 0); |
|
727 return returnval; |
|
728 |
|
729 } |
|
730 |
|
731 |
|
732 void Page::printWords() |
|
733 // Prits out each component of each word. This can take a very long time |
|
734 { |
|
735 |
|
736 Word * thisWord; |
|
737 for (ListElement * ptr = words()->first; ptr !=NULL; ptr= ptr->next) |
|
738 { |
|
739 thisWord = (Word *) ptr->item; |
|
740 printf("!!!!!! NEW WORD %s confid : %d !!!!!\n", thisWord->characters, thisWord->confid); |
|
741 for(int i = 0; i < thisWord->charCount; i++) |
|
742 { |
|
743 Component * comp = thisWord->character[i]; |
|
744 if (comp == NULL) continue; |
|
745 printf("Printing a component ul=%d,%d, lr=%d,%d.\n\n", |
|
746 (comp->ul()).x(), (comp->ul()).y(), |
|
747 (comp->lr()).x(), (comp->lr()).y()); |
|
748 for (int r = comp->ul().y(); |
|
749 r <= comp->lr().y(); r++){ |
|
750 for (int c = comp->ul().x(); |
|
751 c <= comp->lr().x(); c++) |
|
752 bitprint(fBitMap->row(r)[c/8], c%8); |
|
753 printf( "\n"); |
|
754 } |
|
755 printf("properties: "); |
|
756 printVector(comp->properties(), numProperties); |
|
757 printf("Identification: %c distance: %d confidence %d\n", |
|
758 comp->asciiId(), |
|
759 comp->distance(&LearnedChars[comp->asciiId()]), |
|
760 comp->confid()); |
|
761 printf("\n***********************************************\n"); |
|
762 } |
|
763 } |
|
764 } |
|
765 |
|
766 MapStatus Page::recognize() |
|
767 /*-------------------------------------------------------------- |
|
768 Primary Purpose: Recognize entire page. Sets font and ascii id of |
|
769 each component |
|
770 Return Value: VALID if no error occurred OTHERERROR otherwise |
|
771 Constraints: extractComponents must be run first. |
|
772 See recognize(line) below for more detailed info |
|
773 Rev: KM |
|
774 ---------------------------------------------------------------*/ |
|
775 { |
|
776 printf("Recognizing document\n"); |
|
777 last_status = 0.0; |
|
778 for (int i = 0; i< fnumLines; i++) |
|
779 { |
|
780 if(ENABLE_USER_INTERFACE) |
|
781 set_recognize_status(i, fnumLines); |
|
782 recognize(i); |
|
783 } |
|
784 |
|
785 last_status = 0.0; |
|
786 return VALID; |
|
787 |
|
788 } |
|
789 |
|
790 |
|
791 MapStatus Page::recognize(int linenum) |
|
792 /*-------------------------------------------------------------- |
|
793 Primary Purpose: Recognize a line of connected components |
|
794 Arguments: linenum is line number to recognize |
|
795 Effects: sets ascii identification fontid and confidence in each component |
|
796 If confidence is low and character is big enough for two characters. |
|
797 divideAndRecognize is called to split up the component. |
|
798 Constraints: extractComponents must be run first |
|
799 Rev: KM 11/9/95 |
|
800 ---------------------------------------------------------------*/ |
|
801 { |
|
802 Component * comp; |
|
803 Distance d; |
|
804 |
|
805 for(ListElement *ptr = line(linenum)->first; ptr != NULL; ptr = ptr->next) |
|
806 { |
|
807 comp = (Component *) ptr->item; |
|
808 |
|
809 d = comp->recognize(LearnedGroups); |
|
810 if (comp->confid() < ConfidenceThreshold && |
|
811 comp->width() > 2*MinWidth) // really wide |
|
812 divideAndRecognize(line(linenum), ptr, d); |
|
813 |
|
814 /*** |
|
815 if (comp->confid() < ConfidenceThreshold || |
|
816 (ptr != line(linenum)->first && |
|
817 ((Component *) ptr->previous->item)->confid() < ConfidenceThreshold)) |
|
818 uniteAndRecognize(line(linenum), ptr, d); |
|
819 ***/ |
|
820 |
|
821 } |
|
822 |
|
823 return VALID; |
|
824 } |
|
825 |
|
826 |
|
827 |
|
828 void Page::divideAndRecognize (Components *list, ListElement * ptr, Distance d) |
|
829 /*-------------------------------------------------------------- |
|
830 Primary Purpose: Identify and separate merged characters |
|
831 Arguments:ptr is a pointer to a list element containing a component |
|
832 d is the current recognition distance on the component |
|
833 Effects: Subdivides component into two parts, Division is made at |
|
834 the minimum vertical height of the component. If the |
|
835 minHeight > JoinTolerance no divison will be made. |
|
836 (JoinTolerance is a global var that determines |
|
837 the maximum number of merged pixels that are allowed in a |
|
838 column for a division to be made) |
|
839 When a division is made. The component's boundaries are |
|
840 adjusted accordingly and a new component is inserted into |
|
841 the list. |
|
842 |
|
843 Returns if distance is acceptable or width of component |
|
844 is <= MinWidth*2 |
|
845 Rev: KM 11/24/95 |
|
846 ---------------------------------------------------------------*/ |
|
847 { |
|
848 Component * comp = (Component *) ptr->item; |
|
849 Component * newComp; |
|
850 bool allGroups = TRUE; |
|
851 |
|
852 // Save the original component boundaries just in case we cant improve |
|
853 Point oldlr = comp->lr(); |
|
854 Point oldul = comp->ul(); |
|
855 int oldwidth = (int) comp->width(); |
|
856 |
|
857 // Some easy access x,y coordinates |
|
858 int ulx = comp->ul().x(); |
|
859 int uly = comp->ul().y(); |
|
860 int lrx = comp->lr().x(); |
|
861 int lry = comp->lr().y(); |
|
862 |
|
863 Distance newdist, bestdist; |
|
864 int bestlrx; |
|
865 |
|
866 if (comp->confid() > ConfidenceThreshold) |
|
867 return; |
|
868 |
|
869 if (oldwidth < MinWidth*2) // cant be split in two |
|
870 { |
|
871 return; |
|
872 } |
|
873 |
|
874 // Determine where to split. Split at the thinnest point |
|
875 // within JoinTolerance (maximum number of pixels that might be fused) |
|
876 |
|
877 int minHeight = (int)comp->height(); |
|
878 bestlrx = comp->lr().x(); |
|
879 for(int i = MinWidth; i < oldwidth - MinWidth; i++) |
|
880 { |
|
881 |
|
882 int newHeight = |
|
883 fBitMap->pixelsInRegion(Point(ulx+i,uly), Point(ulx+i,lry)); |
|
884 if (newHeight < minHeight) |
|
885 { |
|
886 minHeight = newHeight; |
|
887 bestlrx = ulx+i; |
|
888 } |
|
889 } |
|
890 // printf("bestlrx = %d, minHeight = %d\n", bestlrx, minHeight); |
|
891 |
|
892 |
|
893 if (bestlrx < lrx && minHeight < JoinTolerance) |
|
894 { |
|
895 comp->lr().x() = bestlrx; |
|
896 int shrunk = comp->vertShrink(fBitMap); |
|
897 comp->setProperties(fBitMap); |
|
898 if (shrunk) // ignore group if we had to shrink down |
|
899 newdist = comp->recognize(LearnedGroups, allGroups); |
|
900 else |
|
901 newdist = comp->recognize(LearnedGroups); |
|
902 |
|
903 // printf("Distance = %u asciiid = %c \n", newdist, comp->asciiId()); |
|
904 |
|
905 Component * newcomp = new Component(Point(bestlrx+1, oldul.y()) |
|
906 , oldlr); |
|
907 newcomp->vertShrink(fBitMap); |
|
908 newcomp->setProperties(fBitMap); |
|
909 int newcompdist = newcomp->recognize(LearnedGroups,allGroups); |
|
910 |
|
911 if ((newdist < d) && (newcomp->confid() > ConfidenceThreshold*.6)) |
|
912 { |
|
913 list->insertAfter(ptr, newcomp); |
|
914 newcomp->display_bounding_box("red"); |
|
915 comp->display_bounding_box("red"); |
|
916 } |
|
917 else |
|
918 { |
|
919 comp->ul() = oldul; |
|
920 comp->lr() = oldlr; |
|
921 comp->setProperties(fBitMap); |
|
922 comp->recognize(LearnedGroups); |
|
923 delete newcomp; |
|
924 } |
|
925 return; |
|
926 } |
|
927 |
|
928 |
|
929 return; |
|
930 |
|
931 } |
|
932 |
|
933 |
|
934 void Page::uniteAndRecognize (Components *list, ListElement * ptr, Distance d) |
|
935 /*-------------------------------------------------------------- |
|
936 Primary Purpose: Identify and merge a separated character |
|
937 Arguments:ptr is a pointer to a list element containing a component |
|
938 d is the current recognition distance on the component |
|
939 Effects: Unite two components into one. |
|
940 |
|
941 Rev: 5/6/96 |
|
942 ---------------------------------------------------------------*/ |
|
943 { |
|
944 if (ptr->previous == NULL) return; |
|
945 Component * part1 = (Component *) ptr->previous->item; |
|
946 Component * part2 = (Component *) ptr->item; |
|
947 |
|
948 |
|
949 Point ul, lr; |
|
950 ul = part1->ul(); |
|
951 lr = part2->lr(); |
|
952 if (ul.y() > lr.y() || ul.x() > lr.x()) |
|
953 return; |
|
954 Component * newcomp = new Component(ul, lr); |
|
955 |
|
956 newcomp->setProperties(fBitMap); |
|
957 if (part1->charGroup <= 3 && part2->charGroup <= 3) |
|
958 newcomp->charGroup = (part1->charGroup | part2->charGroup); |
|
959 else if (part1->charGroup == 4) |
|
960 newcomp->charGroup = (part2->charGroup | 2); |
|
961 else |
|
962 newcomp->charGroup = (part1->charGroup | 2); |
|
963 if (newcomp->charGroup > 4) newcomp->charGroup = 4; |
|
964 |
|
965 int newdist = newcomp->recognize(LearnedGroups); |
|
966 |
|
967 if (newdist < d && newcomp->confid() > ConfidenceThreshold) |
|
968 { |
|
969 list->removeAt(ptr->previous); |
|
970 list->insertAfter(ptr, newcomp); |
|
971 list->removeAt(ptr); |
|
972 } else delete newcomp; |
|
973 |
|
974 return; |
|
975 |
|
976 } |
|
977 |
|
978 |
|
979 int Page::writeWordPos(char * filename) |
|
980 /*-------------------------------------------------------------- |
|
981 Primary Purpose: Writes word position, confidence, length and string to file |
|
982 Arguments: output file name |
|
983 Return Value: 1 if successful. 0 if an error occured |
|
984 Effects: Calls fWordList->printWordPos |
|
985 // Output format for each word |
|
986 "%6d %6d %6d %6d %s\n", word->ul.x(), word->ul.y(), |
|
987 word->confid, word->charCount, word->characters |
|
988 Rev: 11/25/95 |
|
989 ---------------------------------------------------------------*/ |
|
990 { return fWordList->writeWordPos(filename);}; |
|
991 |
|
992 int Page::writeWordbox(char * filename, int xoffset= 0, int yoffset = 0, |
|
993 bool equationsOnly = FALSE) |
|
994 /*-------------------------------------------------------------- |
|
995 Primary Purpose: Write out word to scanworks wordbox file |
|
996 Arguments: output file, xoffset, yoffset, equationsOnly bool if we only want |
|
997 equations. |
|
998 Return Value: |
|
999 Effects: calls fWordList->writeWordbox |
|
1000 // output format for each word |
|
1001 "%s %d %d %d %d %d %d %d % \n", |
|
1002 word->characters, |
|
1003 word->ul.x(), word->ul.y(), |
|
1004 word->lr.x(), word->lr.y(), |
|
1005 word->lr.x(), word->ul.y(), |
|
1006 word->ul.x(), word->lr.y() ); |
|
1007 New line between lines of text |
|
1008 Rev: 11/25/95 |
|
1009 ---------------------------------------------------------------*/ |
|
1010 { return fWordList->writeWordbox(filename, xoffset, yoffset, this, equationsOnly);}; |
|
1011 |
|
1012 |
|
1013 int Page::writeAscii(char * filename) |
|
1014 /*-------------------------------------------------------------- |
|
1015 Primary Purpose: Write word list to asii file |
|
1016 Arguments: filename to write to |
|
1017 Return Value: 1 if successful 0 if unsuccessful |
|
1018 Effects: Calss fWordList->writeAscii(filename) |
|
1019 Writes words to fill in text format using MinLineSize |
|
1020 to differentiate lines. |
|
1021 Rev: 11/25 KM |
|
1022 ---------------------------------------------------------------*/ |
|
1023 |
|
1024 {return fWordList->writeAscii(filename);}; |
|
1025 |
|
1026 |
|
1027 |
|
1028 int Page::addEquation(int startline, int startcol, int endline, int endcol) |
|
1029 /*-------------------------------------------------------------- |
|
1030 Primary Purpose: Add an equation to the equation list |
|
1031 Arguments: boundaries of equation |
|
1032 Effects: Adds an element fEqnList |
|
1033 Rev: 4/21/96 |
|
1034 ---------------------------------------------------------------*/ |
|
1035 { |
|
1036 EqnMarker * newEqn = new EqnMarker(startline, startcol, endline, endcol); |
|
1037 fEqnList->SortedInsert(newEqn, startline); |
|
1038 } |
|
1039 |
|
1040 int Page::deleteEquation(int col, int row) |
|
1041 /*-------------------------------------------------------------- |
|
1042 Primary Purpose: deletes equations with this coordinate. |
|
1043 Arguments: coordinate of equation to remove |
|
1044 Return Value: 1 if element was remove, 0 otherwise |
|
1045 Effects: removes any equation containing this coordinate |
|
1046 Rev: 4/21/96 |
|
1047 ---------------------------------------------------------------*/ |
|
1048 { |
|
1049 // first determine line number. |
|
1050 int linenum; |
|
1051 |
|
1052 for (int i = 0; i < fnumLines; i++) |
|
1053 if (flineinfo[i].fstartrow <= row && flineinfo[i].fendrow >= row) |
|
1054 { |
|
1055 linenum = i; |
|
1056 break; |
|
1057 } |
|
1058 |
|
1059 for(ListElement *ptr = fEqnList->first; ptr != NULL; ptr = ptr->next) |
|
1060 { |
|
1061 EqnMarker * eqn = (EqnMarker *) ptr->item; |
|
1062 if (linenum == eqn->startline && linenum == eqn->endline) |
|
1063 { |
|
1064 if (col >= eqn->startcol && col <= eqn->endcol) |
|
1065 { |
|
1066 delete eqn; |
|
1067 setTclDeleteVars(eqn); |
|
1068 fEqnList->removeAt(ptr); |
|
1069 return 1; |
|
1070 } |
|
1071 } |
|
1072 else if (linenum == eqn->startline && col >= eqn->startcol) |
|
1073 { |
|
1074 delete eqn; |
|
1075 setTclDeleteVars(eqn); |
|
1076 fEqnList->removeAt(ptr); |
|
1077 return 1; |
|
1078 } |
|
1079 else if (linenum > eqn->startline && linenum < eqn->endline) |
|
1080 { |
|
1081 delete eqn; |
|
1082 setTclDeleteVars(eqn); |
|
1083 fEqnList->removeAt(ptr); |
|
1084 return 1; |
|
1085 } |
|
1086 else if (linenum == eqn->endline && col <= eqn->endcol) |
|
1087 { |
|
1088 delete eqn; |
|
1089 setTclDeleteVars(eqn); |
|
1090 fEqnList->removeAt(ptr); |
|
1091 return 1; |
|
1092 } |
|
1093 |
|
1094 } |
|
1095 |
|
1096 return 0; |
|
1097 |
|
1098 |
|
1099 } |
|
1100 |
|
1101 void Page::setTclDeleteVars(EqnMarker * eqn) |
|
1102 { |
|
1103 |
|
1104 if (ENABLE_USER_INTERFACE) |
|
1105 { |
|
1106 docommand("set deleted 1"); |
|
1107 docommand("set curline %d",eqn->endline); |
|
1108 docommand("set curline_startrow %d",flineinfo[eqn->endline].fstartrow); |
|
1109 docommand("set curline_endrow %d",flineinfo[eqn->endline].fendrow); |
|
1110 docommand("set curx %d", eqn->endcol); |
|
1111 |
|
1112 // prevlines are actually starting lines but allowed same use of |
|
1113 // tcl add equation code |
|
1114 docommand("set prevline %d",eqn->startline); |
|
1115 docommand("set prevline_startrow %d",flineinfo[eqn->startline].fstartrow); |
|
1116 docommand("set prevline_endrow %d",flineinfo[eqn->startline].fendrow); |
|
1117 docommand("set prevx %d", eqn->startcol); |
|
1118 |
|
1119 |
|
1120 |
|
1121 // this will change with zoning |
|
1122 docommand("set curline_startcol %d",0); |
|
1123 docommand("set curline_endcol %d",get_width()); |
|
1124 |
|
1125 |
|
1126 } |
|
1127 |
|
1128 |
|
1129 } |
|
1130 |
|
1131 Component * Page::compAt(Point p) |
|
1132 /*-------------------------------------------------------------- |
|
1133 Primary Purpose: Calls Components::compAt to return the smallest |
|
1134 component containing point p |
|
1135 Return Value: Pointer to the component or null if no component here |
|
1136 Effects: |
|
1137 Rev: 4/25/96 |
|
1138 ---------------------------------------------------------------*/ |
|
1139 { |
|
1140 Component * returnComp= NULL; |
|
1141 int linenum = get_linenum(p.x(), p.y() ); |
|
1142 |
|
1143 if (linenum >= 0) |
|
1144 { |
|
1145 Components * complist = line(linenum); |
|
1146 returnComp = complist->compAt(p); |
|
1147 } |
|
1148 if (returnComp == NULL) |
|
1149 printf("No component found at ( %d, %d)\n ", p.x(), p.y()); |
|
1150 else |
|
1151 printf("Component found at ( %d, %d)\n ul = (%d, %d) lr = (%d, %d)\n " |
|
1152 , p.x(), p.y(),returnComp->ul().x(),returnComp->ul().y(), |
|
1153 returnComp->lr().x(),returnComp->lr().y()); |
|
1154 |
|
1155 |
|
1156 return returnComp; |
|
1157 } |
|
1158 |
|
1159 |
|
1160 bool Page::inEquation(int col, int row) |
|
1161 /*-------------------------------------------------------------- |
|
1162 Primary Purpose: determine if x,y is in an equation |
|
1163 Arguments: x,y coordinates |
|
1164 Return Value: true if in an Equation, false otherwise |
|
1165 Effects: determines if equation with these coordinated is in fEqnList |
|
1166 Rev: 11/25/95 |
|
1167 ---------------------------------------------------------------*/ |
|
1168 { |
|
1169 // first determine line number. |
|
1170 int linenum = get_linenum(col, row); |
|
1171 |
|
1172 |
|
1173 for(ListElement *ptr = fEqnList->first; ptr != NULL; ptr = ptr->next) |
|
1174 { |
|
1175 EqnMarker * eqn = (EqnMarker *) ptr->item; |
|
1176 if (linenum == eqn->startline && linenum == eqn->endline) |
|
1177 { |
|
1178 if(col >= eqn->startcol && col <= eqn->endcol) |
|
1179 return true; |
|
1180 } |
|
1181 else if (linenum == eqn->startline && col >= eqn->startcol) |
|
1182 return true; |
|
1183 else if (linenum > eqn->startline && linenum < eqn->endline) |
|
1184 return true; |
|
1185 else if (linenum == eqn->endline && col <= eqn->endcol) |
|
1186 return true; |
|
1187 } |
|
1188 return false; |
|
1189 |
|
1190 } |
|
1191 |
|
1192 bool Page::inEquation(ListElement * comp) |
|
1193 /*-------------------------------------------------------------- |
|
1194 Primary Purpose: determine if the component in this list element |
|
1195 is in an equation |
|
1196 Arguments: A list element from a component list |
|
1197 Return Value: true if in equation, false otherwise |
|
1198 Effects: calls inEquation(x,y) to do the real work |
|
1199 Rev: 4/21/96 |
|
1200 ---------------------------------------------------------------*/ |
|
1201 { |
|
1202 Component * c = (Component *) comp->item; |
|
1203 return inEquation(c->ul().x(), c->ul().y()); |
|
1204 } |
|
1205 |
|
1206 |
|
1207 int Page::writeEquations(char * filename, int lineOffset) |
|
1208 /*-------------------------------------------------------------- |
|
1209 Primary Purpose: Writes boundaries of equations |
|
1210 Arguments: output file name |
|
1211 Return Value: 1 if successful 0 otherwise |
|
1212 Effects: Outputs to filename for each equation |
|
1213 int startline, int startcol, int endline, int endcol <CR/LF> |
|
1214 Rev: 11/25/95 |
|
1215 ---------------------------------------------------------------*/ |
|
1216 { |
|
1217 FILE * outfile; |
|
1218 outfile = fopen(filename, "w"); |
|
1219 if (outfile == NULL) |
|
1220 { |
|
1221 printf("Error openning %s", filename); |
|
1222 return 0; |
|
1223 } |
|
1224 |
|
1225 for(ListElement *ptr = fEqnList->first; ptr != NULL; ptr = ptr->next) |
|
1226 { |
|
1227 EqnMarker * eqn = (EqnMarker *) ptr->item; |
|
1228 fprintf(outfile, " %6d %6d %6d %6d\n", eqn->startline+lineOffset, |
|
1229 eqn->startcol, |
|
1230 eqn->endline+lineOffset, eqn->endcol); |
|
1231 } |
|
1232 fclose(outfile); |
|
1233 return 1; |
|
1234 } |
|
1235 |
|
1236 void Page::join(Component * a, Component * b) |
|
1237 { |
|
1238 if (a == b) return; |
|
1239 Component * primary; |
|
1240 Component * secondary; |
|
1241 |
|
1242 primary = (( a < b) ? a : b); |
|
1243 secondary = ((primary == a) ? b : a); |
|
1244 assert(primary != secondary); |
|
1245 assert(get_linenum(a) == get_linenum(b)); |
|
1246 |
|
1247 primary->join(secondary); |
|
1248 |
|
1249 // remove secondary component from component list. |
|
1250 int linenum = get_linenum(secondary); |
|
1251 line(linenum)->removeElement(secondary); |
|
1252 |
|
1253 |
|
1254 } |
|
1255 |
|
1256 |
|
1257 |
|
1258 |
|
1259 |
|
1260 int Page::thinnestHorizontalSplit(Components * complist, |
|
1261 ListElement * compptr) |
|
1262 /*-------------------------------------------------------------- |
|
1263 Primary Purpose: Splits this component at thinnest point |
|
1264 Arguments: the component list that contains the compoent and |
|
1265 a pointer to its listelement |
|
1266 Return Value: 1 if split performed 0 otherwise. |
|
1267 Effects: Adds a new component to the list |
|
1268 Constraints: |
|
1269 Rev: 4/26 |
|
1270 ---------------------------------------------------------------*/ |
|
1271 { |
|
1272 Component * comp = (Component *) compptr->item; |
|
1273 // Some easy access x,y coordinates |
|
1274 int ulx = comp->ul().x(); |
|
1275 int uly = comp->ul().y(); |
|
1276 int lrx = comp->lr().x(); |
|
1277 int lry = comp->lr().y(); |
|
1278 |
|
1279 int bestlrx; |
|
1280 |
|
1281 // Determine where to split. Split at the thinnest point |
|
1282 // within JoinTolerance (maximum number of pixels that might be fused) |
|
1283 |
|
1284 |
|
1285 int minHeight = (int)comp->height(); |
|
1286 int oldwidth = (int) comp->width(); |
|
1287 |
|
1288 bestlrx = comp->lr().x(); |
|
1289 // MinWidth is the minimum width of a learned charcter |
|
1290 for(int i = MinWidth; i < oldwidth - MinWidth; i++) |
|
1291 { |
|
1292 |
|
1293 int newHeight = |
|
1294 fBitMap->pixelsInRegion(Point(ulx+i,uly), Point(ulx+i,lry)); |
|
1295 if (newHeight < minHeight) |
|
1296 { |
|
1297 minHeight = newHeight; |
|
1298 bestlrx = ulx+i; |
|
1299 } |
|
1300 } |
|
1301 // printf("bestlrx = %d, minHeight = %d\n", bestlrx, minHeight); |
|
1302 horizontalCompSplit(complist, compptr, bestlrx); |
|
1303 |
|
1304 |
|
1305 } |
|
1306 |
|
1307 int Page::thinnestHorizontalSplit(Component * comp) |
|
1308 { |
|
1309 |
|
1310 int i = get_linenum(comp); |
|
1311 |
|
1312 Components * complist = fLineComponents[i]; |
|
1313 ListElement * compptr; |
|
1314 |
|
1315 for (ListElement * ptr = complist->first; ptr != NULL; ptr = ptr->next) |
|
1316 { |
|
1317 if ((Component *) (ptr->item) == comp) |
|
1318 { |
|
1319 compptr = ptr; |
|
1320 thinnestHorizontalSplit(complist, compptr); |
|
1321 return 1; |
|
1322 } |
|
1323 } |
|
1324 return 0; |
|
1325 |
|
1326 } |
|
1327 |
|
1328 int Page::horizontalCompSplit(Components * complist, |
|
1329 ListElement * compptr, int x) |
|
1330 /*-------------------------------------------------------------- |
|
1331 Primary Purpose: Split this component in the list into two components |
|
1332 at the indicated x coordinate |
|
1333 Arguments: x coordinate of splite |
|
1334 Return Value: 1 if split is performed 0 otherwise |
|
1335 Effects: Adds a new element to the list. One component is split into two |
|
1336 Constraints: fulx <= x >= flrx |
|
1337 Rev: 4/26/96 |
|
1338 ---------------------------------------------------------------*/ |
|
1339 |
|
1340 { |
|
1341 Component * comp = (Component *) compptr->item; |
|
1342 bool allGroups = TRUE; |
|
1343 comp->display_bounding_box("white"); |
|
1344 |
|
1345 if( x < comp->ul().x() || x > comp->lr().x()) |
|
1346 { |
|
1347 cout << " Cant split component " << x << "is not between" |
|
1348 << comp->ul().x() << "and" << comp->lr().x() << endl; |
|
1349 return 0; |
|
1350 } |
|
1351 else |
|
1352 { |
|
1353 Component * newcomp = new Component(Point(x,comp->ul().y()), |
|
1354 comp->lr()); |
|
1355 comp->lr().x() = x-1; |
|
1356 int compShrunk = comp->vertShrink(fBitMap); |
|
1357 comp->setProperties(fBitMap); |
|
1358 if(compShrunk) |
|
1359 comp->recognize(LearnedGroups, allGroups); |
|
1360 else |
|
1361 comp->recognize(LearnedGroups); |
|
1362 |
|
1363 |
|
1364 int newCompShrunk = newcomp->vertShrink(fBitMap); |
|
1365 newcomp->setProperties(fBitMap); |
|
1366 |
|
1367 if(newCompShrunk) // ignore group if shrunk |
|
1368 newcomp->recognize(LearnedGroups, allGroups); |
|
1369 else |
|
1370 newcomp->recognize(LearnedGroups); |
|
1371 |
|
1372 complist->insertAfter(compptr, newcomp); |
|
1373 comp->display_bounding_box("blue"); |
|
1374 newcomp->display_bounding_box("blue"); |
|
1375 |
|
1376 return 1; |
|
1377 } |
|
1378 |
|
1379 |
|
1380 } |
|
1381 |
|
1382 |
|
1383 ZonedPage::ZonedPage() |
|
1384 :Page(){ fzones = new Zones();} |
|
1385 |
|
1386 ZonedPage::~ZonedPage() |
|
1387 { |
|
1388 ((Page *)this)->~Page(); |
|
1389 delete fzones; |
|
1390 } |
|
1391 |
|
1392 Zones * ZonedPage::zones() |
|
1393 { return fzones; } |
|
1394 |
|
1395 Page * ZonedPage::activate(int x, int y) |
|
1396 // activate the page at Point(x,y) |
|
1397 { |
|
1398 Zone * activeZone = zones()->findZone(x,y); |
|
1399 if (activeZone == NULL) return NULL; |
|
1400 docommand("set cur_xoffset %d", activeZone->ul().x()); |
|
1401 docommand("set cur_yoffset %d", activeZone->ul().y()); |
|
1402 |
|
1403 if (activeZone->page() == NULL) |
|
1404 { |
|
1405 activeZone->buildPage(this); |
|
1406 } |
|
1407 |
|
1408 return activeZone->page(); |
|
1409 |
|
1410 } |
|
1411 |
|
1412 void ZonedPage::autoZone(int horizMerge, int vertMerge) |
|
1413 { // autoZone tries to automatically zone page |
|
1414 Point curul; |
|
1415 Point curlr; |
|
1416 int changed = 1; |
|
1417 |
|
1418 if (components() != NULL) |
|
1419 delete components(); |
|
1420 |
|
1421 extractComponents(horizMerge); |
|
1422 |
|
1423 while(changed) |
|
1424 { |
|
1425 changed = 0; |
|
1426 for (int i=0; i < numLines(); i++) |
|
1427 { |
|
1428 for(ListElement * mptr = line(i)->first; mptr != NULL; mptr=mptr->next) |
|
1429 { |
|
1430 Component * mainitem = (Component *) mptr->item; |
|
1431 for (int j= i; j < numLines(); j++) |
|
1432 for(ListElement * ptr = line(j)->first; ptr != NULL; ptr=ptr->next) |
|
1433 { |
|
1434 |
|
1435 Component * item = (Component *) ptr->item; |
|
1436 if( (item->ul().y() - mainitem->lr().y()) <= vertMerge && |
|
1437 (mainitem != item) && |
|
1438 mainitem->xoverlap(item)) |
|
1439 { |
|
1440 mainitem->join(item); |
|
1441 (line(j))->removeAt(ptr); |
|
1442 changed = 1; |
|
1443 } |
|
1444 } |
|
1445 } |
|
1446 } |
|
1447 } |
|
1448 |
|
1449 |
|
1450 for (int i=0; i < numLines(); i++) |
|
1451 { |
|
1452 for(ListElement * mptr = line(i)->first; mptr != NULL; mptr=mptr->next) |
|
1453 { |
|
1454 Component * mainitem = (Component *) mptr->item; |
|
1455 /* printf(" (ul(%d,%d) lr(%d,%d)) ", mainitem->ul().x(), |
|
1456 mainitem->ul().y(), mainitem->lr().x(), mainitem->lr().y()); |
|
1457 */ |
|
1458 /* |
|
1459 mainitem->display_bounding_box("blue", |
|
1460 ZONING_SCALE_FACTOR, |
|
1461 ".zoning_window.work_space"); |
|
1462 */ |
|
1463 Point ul = Point(mainitem->ul().x() -1,mainitem->ul().y() -1); |
|
1464 Point lr = Point( mainitem->lr().x() +1, mainitem->lr().y() +1); |
|
1465 docommand("start_region %d %d", (int)(ul.x()*ZONING_SCALE_FACTOR), |
|
1466 (int)(ul.y()*ZONING_SCALE_FACTOR)); |
|
1467 |
|
1468 docommand("end_region %d %d", (int)(lr.x()*ZONING_SCALE_FACTOR), |
|
1469 (int)(lr.y()*ZONING_SCALE_FACTOR)); |
|
1470 |
|
1471 Zone * newzone = new Zone(ul,lr); |
|
1472 zones()->Append(newzone); |
|
1473 } |
|
1474 |
|
1475 |
|
1476 } |
|
1477 } |
|
1478 |
|
1479 |
|
1480 |
|
1481 |
|
1482 |
|
1483 |
|
1484 |
|
1485 |
|
1486 |
|
1487 |
|
1488 |
|
1489 |
|
1490 |
|
1491 |
|
1492 |