|
/*--------------------------------------------------------------
  Learn.cc

  readLearnFiles()  Sources the tcl file used to change learn files.

  learn(char * tifFile, char * asciiFile)
      Performs character learning by reading a tiff image and its
      ascii translation.  Characters are partitioned into character
      groups as described in system.cc.  See the learn function for
      more details.

  writeLearnedGroups(char * filename)  Writes learned characters to a file.
  readLearnedGroups(char * filename)   Reads saved learned characters
                                       from a file.
  ---------------------------------------------------------------*/
|
13 #include "tcl_interface.h" |
|
14 #include "system.h" |
|
15 #include "learn.h" |
|
16 #include "Page.h" |
|
17 #include "list.h" |
|
18 |
|
19 void readLearnFiles() |
|
20 /*-------------------------------------------------------------- |
|
21 Primary Purpose: Sources learnfile.tcl where new learn files can be |
|
22 specified without recompiling **/ |
|
23 { |
|
24 docommand("source learnfile.tcl"); |
|
25 } |
|
26 |
|
27 bool whitespace(char c) |
|
28 // Returns TRUE if c is a whitespace charater (called by learn.cc) |
|
29 { |
|
30 if ( c == '\n' || c == '\t' || c == ' ') return TRUE; |
|
31 return FALSE; |
|
32 |
|
33 } |
|
34 |
|
35 |
|
36 void learn(char * tifFile, char * asciiFile) |
|
37 /*-------------------------------------------------------------- |
|
38 Primary Purpose: Learns from TIFF and ascii file. Groups learned |
|
39 characters by baseline into LearnedGroups and |
|
40 sets properties. |
|
41 Arguments: tiffFile name of a tiff file to learn from |
|
42 asciiFile name of an ascii translation file |
|
43 Effects: Assumes a one to one correspondence between each connected |
|
44 component on a line of the tif file and each character on the corresponding |
|
45 line of the ascii file. |
|
46 |
|
47 Rev: 11/20/95 |
|
48 ---------------------------------------------------------------*/ |
|
49 { |
|
50 FILE * transFile; |
|
51 |
|
52 transFile = fopen(asciiFile,"r"); |
|
53 if(!transFile) |
|
54 { |
|
55 printf("Could not open the ascii learn file"); |
|
56 return; |
|
57 } |
|
58 if (LearnedGroups == NULL) |
|
59 LearnedGroups = new Components[NumCharGroups]; |
|
60 |
|
61 Page * learnPage = new Page; |
|
62 initCharBitsSet(); |
|
63 if(learnPage->readMap(tifFile) != VALID) |
|
64 { |
|
65 printf("Problem opening the learn image file (file doesn't exist?)\n"); |
|
66 return; |
|
67 } |
|
68 learnPage->setLines(); |
|
69 learnPage->extractComponents(); |
|
70 int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize; |
|
71 char buffer[maxCharsPerLine]; |
|
72 int i = 0; |
|
73 int buflength; |
|
74 bool emptyLine; |
|
75 Components * components; |
|
76 Component * item; |
|
77 |
|
78 double width, height = 0.0; |
|
79 int h; |
|
80 |
|
81 while (i < learnPage->numLines() && |
|
82 fgets(buffer, maxCharsPerLine, transFile)) |
|
83 { |
|
84 buflength = strlen(buffer); |
|
85 components = learnPage->line(i++); |
|
86 int c = 0; |
|
87 for (ListElement* ptr = components->first; ptr != NULL; |
|
88 ptr = ptr->next) |
|
89 { |
|
90 item = (Component *)(ptr->item); |
|
91 |
|
92 // skip over white space |
|
93 while(whitespace(buffer[c]) && c < buflength)c++; |
|
94 |
|
95 if (c >= buflength) |
|
96 break; |
|
97 |
|
98 item->asciiId() = buffer[c++]; |
|
99 |
|
100 LearnedGroups[item->charGroup].Append((void*) item); |
|
101 ptr->item = NULL; // Set to Null in page so it wont get |
|
102 // clobbered on delete |
|
103 h = item->lr().y() - item->ul().y(); |
|
104 if (h > height) height = h; |
|
105 width = item->lr().x() - item->ul().x(); |
|
106 if (height/width > MaxHWRatio) |
|
107 MaxHWRatio = height/width; |
|
108 |
|
109 if (h/width < MinHWRatio) |
|
110 MinHWRatio = h/width; |
|
111 |
|
112 if (width < MinWidth) |
|
113 MinWidth = (int) width; |
|
114 |
|
115 /* printf("learned char %c, group %d\n", item->asciiId(), |
|
116 item->charGroup); |
|
117 */ |
|
118 |
|
119 } |
|
120 |
|
121 } |
|
122 |
|
123 |
|
124 if (fgets(buffer, maxCharsPerLine, transFile)) |
|
125 printf("Uh, oh. There are more characters to learn!\n"); |
|
126 /* printf("Maximum height/width ratio = %f\n", MaxHWRatio); */ |
|
127 /* printf("Minimum height/width ratio = %f\n", MinHWRatio); */ |
|
128 delete learnPage; |
|
129 } |
|
130 |
|
131 |
|
132 int writeLearnedGroups(char * filename) |
|
133 /*-------------------------------------------------------------- |
|
134 Primary Purpose: Write Learned groups out to file for reading |
|
135 in by readLearnedGroups |
|
136 Arguments: filename to write learned chars to |
|
137 Return Value: 1 if successful 0 if not |
|
138 Effects: Writes contents of LearnedGroups array out to filename |
|
139 LearnedGroups is an array of lists of components that is decleared |
|
140 in system.cc and initialized by the learn() function. |
|
141 For each group writes the number of Components the group contains |
|
142 followed by the group data. |
|
143 Other learned values such as MinWidth MinHWRatio etc are written to |
|
144 the file as well. |
|
145 Constraints: LearnedGroups must be initialized and filled with learned |
|
146 chars before this function is invoked. |
|
147 Rev: 11/27 KM |
|
148 ---------------------------------------------------------------*/ |
|
149 { |
|
150 int status; |
|
151 FILE * outfile; |
|
152 assert(LearnedGroups != NULL); |
|
153 |
|
154 outfile = fopen(filename, "w"); |
|
155 if (outfile == NULL) |
|
156 { |
|
157 printf("error openning %s \n", filename); |
|
158 return 0; |
|
159 } |
|
160 |
|
161 // Write global information about learned characters |
|
162 |
|
163 fwrite(&NumCharGroups, sizeof(NumCharGroups),1, outfile); |
|
164 fwrite(&MaxHWRatio, sizeof(MaxHWRatio),1, outfile); |
|
165 fwrite(&MinWidth, sizeof(MinWidth),1,outfile); |
|
166 fwrite(&MinHWRatio, sizeof(MinHWRatio),1,outfile); |
|
167 for(unsigned int i = 0; i < NumCharGroups; i++) |
|
168 { |
|
169 unsigned int numChars = LearnedGroups[i].length; |
|
170 // Write group number and number of characters |
|
171 fwrite(&i, sizeof(i), 1, outfile); |
|
172 status = fwrite(&numChars, sizeof(numChars),1,outfile); |
|
173 if (status == 0) return 0; |
|
174 for(ListElement * ptr = LearnedGroups[i].first; |
|
175 ptr != NULL; ptr = ptr->next) |
|
176 { |
|
177 |
|
178 Component * comp = (Component *) ptr->item; |
|
179 |
|
180 status = fwrite(comp, sizeof(Component),1,outfile); |
|
181 // printf("\tChar:%c status:%d \n", comp->asciiId(), status); |
|
182 |
|
183 for(int p = 0; p < numProperties; p++) |
|
184 { |
|
185 status = fwrite(&(comp->fproperty[p]), |
|
186 sizeof(Property), |
|
187 1, outfile); |
|
188 if (status == 0) |
|
189 { |
|
190 printf("Error writing properties of comp %c", |
|
191 comp->asciiId()); |
|
192 return 0; |
|
193 } |
|
194 } |
|
195 } |
|
196 } |
|
197 status = fclose(outfile); |
|
198 if (status == -1) return 0; |
|
199 else return 1; |
|
200 |
|
201 } |
|
202 |
|
203 int readLearnedGroups(char * filename) |
|
204 /*-------------------------------------------------------------- |
|
205 Primary Purpose: Read Learned groups from file that has been |
|
206 created by writeLearnedGroups |
|
207 Arguments: filename to read learned chars from |
|
208 Return Value: 1 if successful 0 if not |
|
209 Effects: Reads contents of filename into LearnedGroups array |
|
210 LearnedGroups is an array of lists of components that is decleared |
|
211 in system.cc and initialized here or in the learn() function. |
|
212 Constraints: LearnedGroups must not yet be initialized |
|
213 Rev: 11/27 KM |
|
214 ---------------------------------------------------------------*/ |
|
215 { |
|
216 int status; |
|
217 FILE * infile; |
|
218 unsigned int numGroups; // # of groups stored in file. |
|
219 |
|
220 initCharBitsSet(); |
|
221 if(LearnedGroups == NULL) |
|
222 LearnedGroups = new Components[NumCharGroups]; |
|
223 |
|
224 |
|
225 infile = fopen(filename, "r"); |
|
226 if (infile == NULL) |
|
227 { |
|
228 printf("error openning %s \n", filename); |
|
229 return 0; |
|
230 } |
|
231 |
|
232 // Read Globals |
|
233 fread(&numGroups, sizeof(numGroups),1, infile); |
|
234 assert(numGroups == NumCharGroups); |
|
235 fread(&MaxHWRatio, sizeof(MaxHWRatio),1, infile); |
|
236 fread(&MinWidth, sizeof(MinWidth),1,infile); |
|
237 fread(&MinHWRatio, sizeof(MinHWRatio),1,infile); |
|
238 for(unsigned int i = 0; i < NumCharGroups; i++) |
|
239 { |
|
240 unsigned int groupnum; |
|
241 unsigned int numChars; |
|
242 fread(&groupnum, sizeof(groupnum), 1, infile); |
|
243 assert(groupnum == i); |
|
244 fread(&numChars, sizeof(numChars),1,infile); |
|
245 |
|
246 printf("\nReading group %d - %d characters\n",i,numChars); |
|
247 for(unsigned int c = 0; c< numChars; c++) |
|
248 { |
|
249 Component * comp = new Component; |
|
250 short int * savepropptr = comp->fproperty; |
|
251 status = fread(comp, sizeof(Component),1,infile); |
|
252 comp->fproperty = savepropptr; |
|
253 for(int p = 0; p < numProperties; p++) |
|
254 { |
|
255 status = fread(&(comp->fproperty[p]), sizeof(Property), |
|
256 1, infile); |
|
257 if (status == 0) |
|
258 { |
|
259 printf("Error reading properties"); |
|
260 return 0; |
|
261 } |
|
262 } |
|
263 // printf("\tChar:%c status:%d ", comp->asciiId(), status); |
|
264 // printVector(comp->properties(), numProperties); |
|
265 LearnedGroups[i].Append(comp); |
|
266 |
|
267 } |
|
268 |
|
269 } |
|
270 status = fclose(infile); |
|
271 if (status == -1) return 0; |
|
272 else return 1; |
|
273 } |
|
274 |
|
275 void testLearn() |
|
276 { |
|
277 |
|
278 learn("/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.tif", |
|
279 "/amd/nfs/cochise/home/ee/cs169/fa95/class/cs169-ab/train.txt"); |
|
280 } |
|
281 |
|
282 /***************************************************************** |
|
283 FUNCTIONS BEYOND THIS POINT ARE FOR AVERAGING LEARNED CHARACTERS |
|
284 AND ARE NOT CURRENTLY USED. |
|
285 *******************************************************************/ |
|
286 |
|
287 void initLearnedChars() |
|
288 /*-------------------------------------------------------------- |
|
289 Primary Purpose: Initializes learned character array. Sets asciiId |
|
290 to array offset. |
|
291 Rev: KM 11/6/95 |
|
292 ---------------------------------------------------------------*/ |
|
293 { |
|
294 LearnedChars = new Component[256]; |
|
295 |
|
296 for (int i=0; i < 256; i++) |
|
297 { |
|
298 LearnedChars[i].asciiId() = (char)i; |
|
299 } |
|
300 |
|
301 } |
|
302 |
|
303 void oldlearn(char * tifFile, char * asciiFile) |
|
304 /*-------------------------------------------------------------- |
|
305 Primary Purpose: builds property vectors for LearnedChars array |
|
306 Arguments: tiffFile name of a tiff file to learn from |
|
307 asciiFile name of an ascii translation file |
|
308 Effects: Assumes a one to one correspondence between each connected |
|
309 component on a line of the tif file and each character on the corresponding |
|
310 line of the ascii file. For learned characters confidence is set |
|
311 to the number of examples. |
|
312 |
|
313 Rev: 11/6/95 |
|
314 ---------------------------------------------------------------*/ |
|
315 { |
|
316 FILE * transFile; |
|
317 transFile = fopen(asciiFile,"r"); |
|
318 Page * learnPage = new Page; |
|
319 initCharBitsSet(); |
|
320 learnPage->readMap(tifFile); |
|
321 learnPage->setLines(); |
|
322 learnPage->extractComponents(); /* why minlinesize? */ |
|
323 int maxCharsPerLine = learnPage->bmap()->imageWidth() / MinLineSize; |
|
324 char buffer[maxCharsPerLine]; |
|
325 int i = 0; |
|
326 int buflength; |
|
327 bool emptyLine; |
|
328 Components * components; |
|
329 Component * item; |
|
330 int count[256]; // a count of how many of each char have been encountered |
|
331 int prop[256][numProperties]; // Character property sums. Need ints so that |
|
332 // property sum does |
|
333 // not exceed char boundaries |
|
334 char id; |
|
335 |
|
336 initLearnedChars(); |
|
337 for (i = 0; i < 256; i++) |
|
338 { |
|
339 count[i] = 0; |
|
340 for (int p = 0; p < numProperties; p++) |
|
341 prop[i][p] = 0; |
|
342 } |
|
343 i=0; |
|
344 |
|
345 int offset; |
|
346 while (i < learnPage->numLines() && |
|
347 fgets(buffer, maxCharsPerLine, transFile)) |
|
348 { |
|
349 buflength = strlen(buffer); |
|
350 components = learnPage->line(i++); |
|
351 int c = 0; |
|
352 for (ListElement* ptr = components->first; ptr != NULL; |
|
353 ptr = ptr->next) |
|
354 { |
|
355 item = (Component *)(ptr->item); |
|
356 // skip over white space |
|
357 while(whitespace(buffer[c]) && c < buflength)c++; |
|
358 if (c >= buflength)break; |
|
359 id = buffer[c++]; |
|
360 count[id]++; // increment character count |
|
361 for (offset=0; offset < numProperties; offset++) |
|
362 prop[id][offset] += (item->properties())[offset]; |
|
363 LearnedChars[i].numBits() += item->numBits(); |
|
364 } |
|
365 } |
|
366 // now divide by count and put in Learned character |
|
367 for(int j = 0; j < 256; j++) |
|
368 { |
|
369 if(count[j] > 0) |
|
370 { |
|
371 for (int offset=0; offset < numProperties; offset++) |
|
372 prop[j][offset] /= count[j]; |
|
373 LearnedChars[j].numBits() /= count[j]; |
|
374 LearnedChars[j].confid() = count[j]; |
|
375 for (offset=0; offset < numProperties; offset++) |
|
376 (LearnedChars[j].properties())[offset] = prop[j][offset]; |
|
377 // printf("%d occurrences of %c\n", count[j], (char)j); |
|
378 printVector(LearnedChars[j].properties(), numProperties); |
|
379 |
|
380 } |
|
381 |
|
382 } |
|
383 } |
|
384 |
|
385 void oldtestLearn() |
|
386 { |
|
387 |
|
388 |
|
389 learn("train.tif", "train.txt"); |
|
390 if (ENABLE_USER_INTERFACE) |
|
391 docommand(".main_window.display.work_space delete IMAGE_TAG"); //6/16/00 |
|
392 //docommand("button .b -text \"hello\" -command exit \n pack .b\n"); |
|
393 |
|
394 } |
|
395 |