|
1 #include "system.h" |
|
2 #include "Point.h" |
|
3 |
|
4 |
|
5 // Global Variables |
|
6 Point NOPNT(-1,-1); // Used for default entries |
|
/* OCR fine-tuning globals.  Each of these can be adjusted without
   recompiling by setting it in link_vars.tcl. */
int NoiseTolerance      = 1;    // Minimum number of pixels in a line of text
int MinLineSize         = 5;    // Minimum number of rows in a line of text
int MinVertSeparation   = 0;    // Minimum number of rows between lines of text
int MinHorizSeparation  = 1;    // Minimum number of columns between characters
int ConfidenceThreshold = 150;  // Minimum confidence for some operations
int JoinTolerance       = 6;    // Maximum number of pixels joining fused characters
|
15 |
|
16 |
|
17 |
|
/* Length of the property vector kept for Components */
int numProperties = 30;

/* Gray scale analysis grid: number of horizontal and vertical divisions */
int NumHorizDiv = 5;
int NumVertDiv  = 5;
|
24 |
|
// The next four are used in character grouping, set in Page::extractComponents
/* Character groups:
     Group 0 - amo
     Group 1 - Descenders                      yjp
     Group 2 - Ascenders                       JPK
     Group 3 - Both descenders and ascenders   ()
     Group 4 - Floaters                        * - `
*/
unsigned int NumCharGroups = 5;
int MaxVertSize       = 50;  // Max vertical pixels in a char (used for baseline)
int BaseLineTolerance = 10;  // How far in 1/x of line size from base is okay
int TopLineTolerance  = 10;  // How far in 1/x of line size from top is okay
                             // 20 = 5%, 10 = 10%
int MinComponentSize  = 16;  // Minimum number of pixels in smallest character
|
38 |
|
39 uchar CharBitsSet[256]; // Table of number of bits set in each num 0-256 |
|
40 // Used for determining gray scale and pixel counts |
|
41 |
|
42 /** Some globals set in learn() or readLearnedChars(). These are just starting |
|
43 values **/ |
|
44 |
|
45 double MaxHWRatio = 0.0; |
|
46 double MinHWRatio = 1000; |
|
47 int MinWidth = 1000; // Min component width in learned set |
|
48 |
|
49 |
|
50 |
|
51 |
|
52 Component * LearnedChars; // Learned character averages /** NOT USED **/ |
|
53 Components * LearnedGroups=NULL; //Learned character list array by group type |
|
54 |
|
55 |
|
56 /*** Some values for TCL/TK interface. These variables can be |
|
57 set in the file link_vars.tcl without recompiling ***/ |
|
58 |
|
59 int ENABLE_USER_INTERFACE = 0; |
|
60 int VERY_LOW_CONFIDENCE = 150; |
|
61 int LOW_CONFIDENCE = 200; |
|
62 int DISPLAY_LINE_BOUNDARIES = 0; |
|
63 int DISPLAY_BOUNDING_BOXES = 0; // boxes around components |
|
64 int SPELLCHECK = 0; |
|
65 int DISPLAY_IMAGE = 1; |
|
66 int DESKEW_METHOD = BITMAP_DESKEW; |
|
67 double ZONING_SCALE_FACTOR = .50; |
|
68 double SCALE_FACTOR = 0.5; |
|
69 |
|
70 TclMode mode = REGULAR; |
|
71 |
|
72 void initCharBitsSet() |
|
73 // Initializes lookup table for the number of bits set in a uchar |
|
74 { |
|
75 int pixCount; |
|
76 for (int c = 0; c<256;c++) |
|
77 { |
|
78 pixCount = 0; |
|
79 for (int i = 7; i >=0; i--) |
|
80 pixCount +=((c>>i)&1); // if this is a black pixel |
|
81 CharBitsSet[c]=pixCount; |
|
82 } |
|
83 } |
|
84 |
|
char* backslashify(char* w)
/* Returns a newly malloc'ed copy of w in which every occurrence of
   $ [ ] \ { } ( ) ;  is preceded by a backslash.  All other characters
   are copied unchanged.  The result is NUL terminated; the caller owns
   it and releases it with free().  Returns NULL if the allocation fails.

   NOTE(review): the previous header comment listed " among the escaped
   characters, but the code has never escaped it -- confirm whether any
   caller needs that before changing the set below. */
{
  size_t length = strlen(w);

  // Worst case every character is escaped (two bytes each), plus one
  // byte for the terminating NUL.
  char* new_word = (char*)malloc(length*2 + 1);
  if (new_word == NULL)
    return NULL;

  size_t new_word_pos = 0;
  for (size_t i = 0; i < length; i++)
    {
      switch (w[i])
        {
        case '$':
        case '[':
        case ']':
        case '\\':
        case '{':
        case '}':
        case '(':
        case ')':
        case ';':
          new_word[new_word_pos++] = '\\';
          break;
        default:
          break;
        }
      new_word[new_word_pos++] = w[i];
    }
  new_word[new_word_pos] = '\0';
  return new_word;
}
|
116 |
|
117 void invertBitsInBuffer(uchar * buf, int size) |
|
118 { |
|
119 for(int i = 0; i < size; i++) |
|
120 buf[i] = ~buf[i] ; |
|
121 |
|
122 } |
|
123 |
|
124 void clearBitsInBuffer(uchar * buf, int size) |
|
125 { |
|
126 for(int i = 0; i < size; i++) |
|
127 buf[i]=0; |
|
128 |
|
129 |
|
130 } |
|
131 |
|
132 short int countBitsSet(uchar c) |
|
133 { |
|
134 int pixCount = 0; |
|
135 /* |
|
136 for (int i = 7; i >=0; i--) |
|
137 pixCount +=((c>>i)&1); // if this is a black pixel |
|
138 */ |
|
139 if (!(CharBitsSet['f'])) |
|
140 initCharBitsSet(); |
|
141 return CharBitsSet[c]; |
|
142 } |
|
143 |
|
144 int pixelsBetween(uchar * ar, int start, int end) |
|
145 { |
|
146 // Counts the number of black pixels between start and end |
|
147 int startCharNum = start / 8; |
|
148 int endCharNum = end / 8 ; |
|
149 int pixCount=0, startOffset, endOffset; |
|
150 uchar nextChar; |
|
151 |
|
152 startOffset = start - startCharNum*8; // first bit of range in first char |
|
153 endOffset = end- endCharNum*8 + 1 ; // first bit after end in last char |
|
154 |
|
155 // count the whole characters |
|
156 for (int i = startCharNum + 1; i < endCharNum; i++) |
|
157 { |
|
158 nextChar = ar[i]; |
|
159 pixCount += countBitsSet(nextChar); |
|
160 } |
|
161 // Now add in end peices |
|
162 // Get our part of the starting character |
|
163 // Add in just the last part of the char (get rid of hi bits) |
|
164 nextChar = ar[startCharNum] << startOffset; |
|
165 if (startCharNum != endCharNum ) |
|
166 { |
|
167 pixCount += countBitsSet(nextChar); |
|
168 // Get our part of the ending character, |
|
169 // Add in just the first endOffset bits (get rid of lo bits) |
|
170 nextChar = ar[endCharNum] >> (8 - endOffset); |
|
171 pixCount += countBitsSet(nextChar); |
|
172 } |
|
173 else |
|
174 { |
|
175 // just shift the adjusted starting char |
|
176 int shift = (8-endOffset)+startOffset; |
|
177 pixCount += countBitsSet(nextChar >> shift); |
|
178 } |
|
179 return pixCount; |
|
180 |
|
181 } |
|
182 |
|
183 |
|
184 void setRange(uchar ar[], int start, int end) |
|
185 // Sets bits from position start to position end |
|
186 { |
|
187 int startCharNum = start / 8; |
|
188 int endCharNum = end / 8 ; |
|
189 int startOffset, endOffset; |
|
190 |
|
191 startOffset = start - startCharNum*8; // first bit of range in first char |
|
192 endOffset = end- endCharNum*8 + 1 ; // first bit after end in last char |
|
193 |
|
194 // set the whole characters |
|
195 for (int i = startCharNum + 1; i < endCharNum; i++) |
|
196 { |
|
197 ar[i] = 255; |
|
198 } |
|
199 // Now set end peices |
|
200 if (startCharNum != endCharNum ) |
|
201 { |
|
202 ar[startCharNum] |= (255 >> startOffset); |
|
203 ar[endCharNum] |= (255 << (8 - endOffset)); |
|
204 } |
|
205 else // start and end char are the same |
|
206 { |
|
207 char mask = 255 >> startOffset; |
|
208 mask &= 255 << (8-endOffset); |
|
209 ar[endCharNum] |= mask; |
|
210 } |
|
211 |
|
212 }; |
|
213 |
|
214 |
|
215 |
|
216 |
|
217 |
|
218 |
|
219 |
|
220 |
|
221 |
|
222 |
|
223 |
|
224 |
|
225 |
|
226 |
|
227 |
|
228 |
|
229 |
|
230 |
|
231 |
|
232 |
|
233 |
|
234 |
|
235 |
|
236 |
|
237 |
|
238 |
|
239 |
|
240 |
|
241 |
|
242 |