|
1 /*-------------------------------------------------------------- |
|
2 Learn.cc - |
|
3 readlearnfiles - sources the tcl file to change learn files |
|
4 learn(char * tifffile, char * asciifile) |
|
5 Performs character learning by reading tiff and ascii translation |
|
6 Characters are partitioned into character groups as described |
|
7 in system.cc. See learn function for more details |
|
8 |
|
9 writeLearnedGroups(char * filename) Writes learned character to file |
|
10 readLearnedGroups(char * filename) Reads saved learned characters |
|
11 from file. |
|
12 ---------------------------------------------------------------*/ |
|
13 #include "tcl_interface.h" |
|
14 #include "system.h" |
|
15 #include "learn.h" |
|
16 #include "Page.h" |
|
17 #include "list.h" |
|
18 |
|
19 void readLearnFiles() |
|
20 /*-------------------------------------------------------------- |
|
21 Primary Purpose: Sources learnfile.tcl where new learn files can be |
|
22 specified without recompiling **/ |
|
23 { |
|
24 docommand("source learnfile.tcl"); |
|
25 } |
|
26 |
|
27 bool whitespace(char c) |
|
28 // Returns TRUE if c is a whitespace charater (called by learn.cc) |
|
29 { |
|
30 if ( c == '\n' || c == '\t' || c == ' ') return TRUE; |
|
31 return FALSE; |
|
32 |
|
33 } |
|
34 |
|
35 bool blank(char * string) |
|
36 { |
|
37 if (string == NULL) return TRUE; |
|
38 int len = strlen(string); |
|
39 for(int c=0; c< len; c++) |
|
40 { |
|
41 if (!(whitespace(string[c]))) |
|
42 return FALSE; |
|
43 } |
|
44 return TRUE; |
|
45 } |
|
46 |
|
47 |
|
48 void printLearnedGroups() |
|
49 { |
|
50 // Just print these guys out to make sure they are ok. |
|
51 for(unsigned int i = 0; i < NumCharGroups; i++) |
|
52 for(ListElement * ptr = LearnedGroups[i].first; |
|
53 ptr != NULL; ptr = ptr->next) |
|
54 { Component * item = (Component *) ptr->item; |
|
55 printf("learned char %s, group %d\n", item->fasciiId, |
|
56 item->charGroup); |
|
57 } |
|
58 |
|
59 } |
|
60 |
|
61 int lengthNextWord(char * buffer,int offset, int buflength) |
|
62 { |
|
63 // counts things in '< >' as one character |
|
64 int count; |
|
65 |
|
66 for(int c=offset; c < buflength && !(whitespace(buffer[c])); c++) |
|
67 { |
|
68 if(buffer[c] == '<') |
|
69 { |
|
70 while((buffer[c] != '>') && (c < buflength)) |
|
71 c++; |
|
72 count++; |
|
73 } |
|
74 else |
|
75 count++; |
|
76 } |
|
77 return count; |
|
78 } |
|
79 |
|
80 |
|
81 int learn(Component * comp, char * id, Confidence threshold) |
|
82 /*-------------------------------------------------------------- |
|
83 Primary Purpose: Make a copy of this component and add it to |
|
84 LearnedGroups. id is ascii identification. |
|
85 Component will only be learned if confidence |
|
86 is below threshold or if id and asciiid dont match |
|
87 Arguments: comp - component to learn |
|
88 id - ascii identification |
|
89 threshold - confidence threshold for learning |
|
90 Return Value: 1 if component was learned, 0 otherwise |
|
91 Rev: 4/25/96 |
|
92 ---------------------------------------------------------------*/ |
|
93 { |
|
94 Component * newcomp; |
|
95 |
|
96 if (comp->confid() < threshold || !(strcmp(comp->fasciiId, id))) |
|
97 { |
|
98 newcomp = comp->copy(); |
|
99 delete newcomp->fasciiId; |
|
100 newcomp->fasciiId = new char[strlen(id)+1]; |
|
101 strcpy(newcomp->fasciiId , id); |
|
102 |
|
103 LearnedGroups[newcomp->charGroup].Append(newcomp); |
|
104 return 1; |
|
105 } |
|
106 return 0; |
|
107 } |
|
108 |
|
109 void learn(char * tifFile, char * asciiFile, bool synchwords) |
|
110 /*-------------------------------------------------------------- |
|
111 Primary Purpose: Learns from TIFF and ascii file. Groups learned |
|
112 characters by baseline into LearnedGroups and |
|
113 sets properties. |
|
114 Arguments: tiffFile name of a tiff file to learn from |
|
115 asciiFile name of an ascii translation file |
|
116 Effects: Assumes a one to one correspondence between each connected |
|
117 component on a line of the tif file and each character on the corresponding |
|
118 line of the ascii file. |
|
119 |
|
120 Rev: 4/26/96 |
|
121 ---------------------------------------------------------------------*/ |
|
122 { |
|
123 |
|
124 Page * learnPage = new Page; |
|
125 initCharBitsSet(); |
|
126 if(learnPage->readMap(tifFile) != VALID) |
|
127 { |
|
128 printf("Problem opening the learn image file (file doesn't exist?)\n"); |
|
129 return; |
|
130 } |
|
131 learnPage->setLines(); |
|
132 learnPage->extractComponents(MinHorizSeparation); |
|
133 learnPage->extractWords(); |
|
134 learn(learnPage, asciiFile, synchwords); |
|
135 |
|
136 // delete learnPage; |
|
137 |
|
138 } |
|
139 |
|
140 |
|
141 void learn(Page * learnPage, char * asciiFile, bool synchWords) |
|
142 /*-------------------------------------------------------------- |
|
143 Primary Purpose: Learns from a Page and an ascii file. Used from |
|
144 tcl user interface under File/Learn opation |
|
145 Groups learned |
|
146 characters by baseline into LearnedGroups and |
|
147 sets properties. |
|
148 Arguments: tiffFile name of a tiff file to learn from |
|
149 asciiFile name of an ascii translation file |
|
150 Effects: Assumes a one to one correspondence between each connected |
|
151 component on a line of the tif file and each character on the corresponding |
|
152 line of the ascii file. |
|
153 |
|
154 Rev: 4/26/96 |
|
155 ---------------------------------------------------------------*/ |
|
156 { |
|
157 FILE * transFile; |
|
158 |
|
159 transFile = fopen(asciiFile,"r"); |
|
160 if(!transFile) |
|
161 { |
|
162 printf("Could not open the ascii learn file"); |
|
163 return; |
|
164 } |
|
165 if (LearnedGroups == NULL) |
|
166 LearnedGroups = new Components[NumCharGroups]; |
|
167 |
|
168 int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize; |
|
169 char buffer[maxCharsPerLine]; |
|
170 int i = -1; |
|
171 int buflength=0; |
|
172 bool instring= FALSE; |
|
173 bool emptyLine; |
|
174 Components * components = NULL; |
|
175 Words * words; |
|
176 Component * item; |
|
177 |
|
178 double width, height = 0.0; |
|
179 int h; |
|
180 |
|
181 |
|
182 words = learnPage->words(); |
|
183 int c = 0; |
|
184 Word * word; |
|
185 |
|
186 for (ListElement * ptr = words->first; ptr != NULL && |
|
187 (i < learnPage->numLines()) ; ptr = ptr->next) |
|
188 { |
|
189 |
|
190 word = (Word *) ptr->item; |
|
191 // if new line get new text line |
|
192 if (word->characters[0] == '\n' || buflength == 0) |
|
193 { |
|
194 char * ok; |
|
195 do { |
|
196 ok =fgets(buffer, maxCharsPerLine, transFile); |
|
197 } while (ok && blank(buffer)); // skip blank lines. |
|
198 buflength= strlen(buffer); |
|
199 components = learnPage->line(++i); |
|
200 c =0; |
|
201 if (word->characters[0] == '\n') continue; |
|
202 } |
|
203 |
|
204 |
|
205 // skip over white space |
|
206 while(whitespace(buffer[c]) && c < buflength)c++; |
|
207 |
|
208 // Make sure we have an equal # of components characters |
|
209 if (synchWords && |
|
210 (word->charCount == lengthNextWord(buffer,c,buflength))) |
|
211 { |
|
212 // skip over this word |
|
213 while(!(whitespace(buffer[c])) && c < buflength) |
|
214 c++; |
|
215 continue; // move on to the next word |
|
216 } |
|
217 |
|
218 for (int ch = 0; ch < word->charCount; ch++) |
|
219 { |
|
220 while(whitespace(buffer[c]) && c < buflength)c++; |
|
221 item = word->character[ch]; |
|
222 if (c >= buflength) break; |
|
223 |
|
224 // Link string translation to component. Characters between |
|
225 // brackets are for one component. |
|
226 if(buffer[c] == '<' && !instring) |
|
227 { |
|
228 instring = TRUE; |
|
229 int startString = c; |
|
230 while(c++ < buflength && buffer[c] != '>'); |
|
231 int endString = c+1; |
|
232 |
|
233 int stringSize = endString - startString; |
|
234 char newstring[stringSize+1]; |
|
235 strncpy(newstring, &buffer[startString],stringSize); |
|
236 newstring[stringSize] = '\0'; |
|
237 // learn if id's don't match or below threshold |
|
238 learn(item, newstring, ConfidenceThreshold); |
|
239 c++; |
|
240 instring = FALSE; |
|
241 } |
|
242 else |
|
243 { |
|
244 char newstring[2]; |
|
245 newstring[0] = buffer[c++]; |
|
246 newstring[1]= '\0'; |
|
247 learn(item, newstring, ConfidenceThreshold); |
|
248 } |
|
249 |
|
250 LearnedGroups[item->charGroup].Append(item); |
|
251 //ptr->item = NULL; // Set to Null in page so it wont get |
|
252 // clobbered on delete |
|
253 h = item->lr().y() - item->ul().y(); |
|
254 if (h > height) height = h; |
|
255 width = item->lr().x() - item->ul().x(); |
|
256 if (height/width > MaxHWRatio) |
|
257 MaxHWRatio = height/width; |
|
258 |
|
259 if (h/width < MinHWRatio) |
|
260 MinHWRatio = h/width; |
|
261 |
|
262 if (width < MinWidth) |
|
263 MinWidth = (int) width; |
|
264 |
|
265 |
|
266 } |
|
267 } |
|
268 |
|
269 |
|
270 |
|
271 |
|
272 if (fgets(buffer, maxCharsPerLine, transFile)) |
|
273 printf("Uh, oh. There are more characters to learn!\n"); |
|
274 /* printf("Maximum height/width ratio = %f\n", MaxHWRatio); */ |
|
275 /* printf("Minimum height/width ratio = %f\n", MinHWRatio); */ |
|
276 |
|
277 |
|
278 // printLearnedGroups(); |
|
279 |
|
280 } |
|
281 |
|
282 |
|
283 int writeLearnedGroups(char * filename) |
|
284 /*-------------------------------------------------------------- |
|
285 Primary Purpose: Write Learned groups out to file for reading |
|
286 in by readLearnedGroups |
|
287 Arguments: filename to write learned chars to |
|
288 Return Value: 1 if successful 0 if not |
|
289 Effects: Writes contents of LearnedGroups array out to filename |
|
290 LearnedGroups is an array of lists of components that is decleared |
|
291 in system.cc and initialized by the learn() function. |
|
292 For each group writes the number of Components the group contains |
|
293 followed by the group data. |
|
294 Other learned values such as MinWidth MinHWRatio etc are written to |
|
295 the file as well. |
|
296 Constraints: LearnedGroups must be initialized and filled with learned |
|
297 chars before this function is invoked. |
|
298 Rev: 11/27 KM |
|
299 ---------------------------------------------------------------*/ |
|
300 { |
|
301 int status; |
|
302 FILE * outfile; |
|
303 assert(LearnedGroups != NULL); |
|
304 |
|
305 outfile = fopen(filename, "w"); |
|
306 if (outfile == NULL) |
|
307 { |
|
308 printf("error openning %s \n", filename); |
|
309 return 0; |
|
310 } |
|
311 |
|
312 // Write global information about learned characters |
|
313 |
|
314 fwrite(&NumCharGroups, sizeof(NumCharGroups),1, outfile); |
|
315 fwrite(&MaxHWRatio, sizeof(MaxHWRatio),1, outfile); |
|
316 fwrite(&MinWidth, sizeof(MinWidth),1,outfile); |
|
317 fwrite(&MinHWRatio, sizeof(MinHWRatio),1,outfile); |
|
318 for(unsigned int i = 0; i < NumCharGroups; i++) |
|
319 { |
|
320 unsigned int numChars = LearnedGroups[i].length; |
|
321 // Write group number and number of characters |
|
322 fwrite(&i, sizeof(i), 1, outfile); |
|
323 status = fwrite(&numChars, sizeof(numChars),1,outfile); |
|
324 if (status == 0) return 0; |
|
325 for(ListElement * ptr = LearnedGroups[i].first; |
|
326 ptr != NULL; ptr = ptr->next) |
|
327 { |
|
328 |
|
329 Component * comp = (Component *) ptr->item; |
|
330 |
|
331 status = fwrite(comp, sizeof(Component),1,outfile); |
|
332 // printf("\tChar:%c status:%d \n", comp->asciiId(), status); |
|
333 int stringSize = strlen(comp->fasciiId) +1; |
|
334 status = fwrite(&stringSize, sizeof(stringSize),1,outfile); |
|
335 status = fwrite(comp->fasciiId, stringSize,1,outfile); |
|
336 for(int p = 0; p < numProperties; p++) |
|
337 { |
|
338 status = fwrite(&(comp->fproperty[p]), |
|
339 sizeof(Property), |
|
340 1, outfile); |
|
341 if (status == 0) |
|
342 { |
|
343 printf("Error writing properties of comp %c", |
|
344 comp->asciiId()); |
|
345 return 0; |
|
346 } |
|
347 } |
|
348 } |
|
349 } |
|
350 status = fclose(outfile); |
|
351 if (status == -1) return 0; |
|
352 else return 1; |
|
353 |
|
354 } |
|
355 |
|
356 int readLearnedGroups(char * filename) |
|
357 /*-------------------------------------------------------------- |
|
358 Primary Purpose: Read Learned groups from file that has been |
|
359 created by writeLearnedGroups |
|
360 Arguments: filename to read learned chars from |
|
361 Return Value: 1 if successful 0 if not |
|
362 Effects: Reads contents of filename into LearnedGroups array |
|
363 LearnedGroups is an array of lists of components that is decleared |
|
364 in system.cc and initialized here or in the learn() function. |
|
365 Constraints: LearnedGroups must not yet be initialized |
|
366 Rev: 11/27 KM |
|
367 ---------------------------------------------------------------*/ |
|
368 { |
|
369 int status; |
|
370 FILE * infile; |
|
371 unsigned int numGroups; // # of groups stored in file. |
|
372 |
|
373 initCharBitsSet(); |
|
374 if(LearnedGroups == NULL) |
|
375 LearnedGroups = new Components[NumCharGroups]; |
|
376 |
|
377 |
|
378 infile = fopen(filename, "r"); |
|
379 if (infile == NULL) |
|
380 { |
|
381 printf("error openning %s \n", filename); |
|
382 return 0; |
|
383 } |
|
384 |
|
385 // Read Globals |
|
386 fread(&numGroups, sizeof(numGroups),1, infile); |
|
387 assert(numGroups == NumCharGroups); |
|
388 fread(&MaxHWRatio, sizeof(MaxHWRatio),1, infile); |
|
389 fread(&MinWidth, sizeof(MinWidth),1,infile); |
|
390 fread(&MinHWRatio, sizeof(MinHWRatio),1,infile); |
|
391 for(unsigned int i = 0; i < NumCharGroups; i++) |
|
392 { |
|
393 unsigned int groupnum; |
|
394 unsigned int numChars; |
|
395 fread(&groupnum, sizeof(groupnum), 1, infile); |
|
396 assert(groupnum == i); |
|
397 fread(&numChars, sizeof(numChars),1,infile); |
|
398 |
|
399 printf("\nReading group %d - %d characters\n",i,numChars); |
|
400 for(unsigned int c = 0; c< numChars; c++) |
|
401 { |
|
402 Component * comp = new Component; |
|
403 short int * savepropptr = comp->fproperty; |
|
404 |
|
405 status = fread(comp, sizeof(Component),1,infile); |
|
406 int stringSize; |
|
407 status = fread(&stringSize, sizeof(stringSize),1,infile); |
|
408 comp->fasciiId = new char[stringSize]; |
|
409 status = fread(comp->fasciiId, stringSize,1,infile); |
|
410 |
|
411 comp->fproperty = savepropptr; |
|
412 |
|
413 for(int p = 0; p < numProperties; p++) |
|
414 { |
|
415 status = fread(&(comp->fproperty[p]), sizeof(Property), |
|
416 1, infile); |
|
417 if (status == 0) |
|
418 { |
|
419 printf("Error reading properties"); |
|
420 return 0; |
|
421 } |
|
422 } |
|
423 // printf("\tChar:%c status:%d ", comp->asciiId(), status); |
|
424 // printVector(comp->properties(), numProperties); |
|
425 LearnedGroups[i].Append(comp); |
|
426 |
|
427 } |
|
428 |
|
429 } |
|
430 status = fclose(infile); |
|
431 if (status == -1) return 0; |
|
432 else return 1; |
|
433 } |
|
434 |
|
435 void testLearn() |
|
436 { |
|
437 |
|
438 learn("/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.tif", |
|
439 "/amd/nfs/cochise/home/ee/cs169/fa95class/cs169-ab/train.txt"); |
|
440 } |
|
441 |
|
442 /***************************************************************** |
|
443 FUNCTIONS BEYOND THIS POINT ARE FOR AVERAGING LEARNED CHARACTERS |
|
444 AND ARE NOT CURRENTLY USED. |
|
445 *******************************************************************/ |
|
446 |
|
447 void initLearnedChars() |
|
448 /*-------------------------------------------------------------- |
|
449 Primary Purpose: Initializes learned character array. Sets asciiId |
|
450 to array offset. |
|
451 Rev: KM 11/6/95 |
|
452 ---------------------------------------------------------------*/ |
|
453 { |
|
454 LearnedChars = new Component[256]; |
|
455 |
|
456 for (int i=0; i < 256; i++) |
|
457 { |
|
458 LearnedChars[i].asciiId() = (char)i; |
|
459 } |
|
460 |
|
461 } |
|
462 |
|
463 void oldlearn(char * tifFile, char * asciiFile) |
|
464 /*-------------------------------------------------------------- |
|
465 Primary Purpose: builds property vectors for LearnedChars array |
|
466 Arguments: tiffFile name of a tiff file to learn from |
|
467 asciiFile name of an ascii translation file |
|
468 Effects: Assumes a one to one correspondence between each connected |
|
469 component on a line of the tif file and each character on the corresponding |
|
470 line of the ascii file. For learned characters confidence is set |
|
471 to the number of examples. |
|
472 |
|
473 Rev: 11/6/95 |
|
474 ---------------------------------------------------------------*/ |
|
475 { |
|
476 FILE * transFile; |
|
477 transFile = fopen(asciiFile,"r"); |
|
478 Page * learnPage = new Page; |
|
479 initCharBitsSet(); |
|
480 learnPage->readMap(tifFile); |
|
481 learnPage->setLines(); |
|
482 learnPage->extractComponents(MinHorizSeparation); /* why minlinesize? */ |
|
483 int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize; |
|
484 char buffer[maxCharsPerLine]; |
|
485 int i = 0; |
|
486 int buflength; |
|
487 bool emptyLine; |
|
488 Components * components; |
|
489 Component * item; |
|
490 int count[256]; // a count of how many of each char have been encountered |
|
491 int prop[256][numProperties]; // Character property sums. Need ints so that |
|
492 // property sum does |
|
493 // not exceed char boundaries |
|
494 char id; |
|
495 |
|
496 initLearnedChars(); |
|
497 for (i = 0; i < 256; i++) |
|
498 { |
|
499 count[i] = 0; |
|
500 for (int p = 0; p < numProperties; p++) |
|
501 prop[i][p] = 0; |
|
502 } |
|
503 i=0; |
|
504 |
|
505 int offset; |
|
506 while (i < learnPage->numLines() && |
|
507 fgets(buffer, maxCharsPerLine, transFile)) |
|
508 { |
|
509 buflength = strlen(buffer); |
|
510 components = learnPage->line(i++); |
|
511 int c = 0; |
|
512 for (ListElement* ptr = components->first; ptr != NULL; |
|
513 ptr = ptr->next) |
|
514 { |
|
515 item = (Component *)(ptr->item); |
|
516 // skip over white space |
|
517 while(whitespace(buffer[c]) && c < buflength)c++; |
|
518 if (c >= buflength)break; |
|
519 id = buffer[c++]; |
|
520 count[id]++; // increment character count |
|
521 for (offset=0; offset < numProperties; offset++) |
|
522 prop[id][offset] += (item->properties())[offset]; |
|
523 LearnedChars[i].numBits() += item->numBits(); |
|
524 } |
|
525 } |
|
526 // now divide by count and put in Learned character |
|
527 for(int j = 0; j < 256; j++) |
|
528 { |
|
529 if(count[j] > 0) |
|
530 { |
|
531 for (int offset=0; offset < numProperties; offset++) |
|
532 prop[j][offset] /= count[j]; |
|
533 LearnedChars[j].numBits() /= count[j]; |
|
534 LearnedChars[j].confid() = count[j]; |
|
535 for (offset=0; offset < numProperties; offset++) |
|
536 (LearnedChars[j].properties())[offset] = prop[j][offset]; |
|
537 // printf("%d occurrences of %c\n", count[j], (char)j); |
|
538 printVector(LearnedChars[j].properties(), numProperties); |
|
539 |
|
540 } |
|
541 |
|
542 } |
|
543 } |
|
544 |
|
545 void oldtestLearn() |
|
546 { |
|
547 |
|
548 |
|
549 learn("train.tif", "train.txt"); |
|
550 if (ENABLE_USER_INTERFACE) |
|
551 docommand(".main_window.display.work_space delete IMAGE_TAG"); |
|
552 } |
|
553 |
|
554 |
|
555 |
|
556 |
|
557 |
|
558 |