|
1 #include "system.h" |
|
2 #include "Point.h" |
|
3 |
|
4 |
|
5 // Global Variables |
|
6 Point NOPNT(-1,-1); // Used for default entries |
|
7 /* Global variables used to fine-tune OCR. These can be adjusted |
|
8 without recompiling by setting them in link_vars.tcl */ |
|
9 int NoiseTolerance = 1; // Minimum number of pixels in a line of text |
|
10 int MinLineSize = 5; // Minimum number of rows in a line of text |
|
11 int MinVertSeparation = 0; // Minimum number of rows between lines of text |
|
12 int MinHorizSeparation = 1; // Minimum number of cols between characters |
|
13 int ConfidenceThreshold = 150; // Minimum confidence for some operations |
|
14 int JoinTolerance = 6; // Max number of pixels joining fused chars. |
|
15 |
|
16 |
|
17 |
|
18 /* Number of properties in the property vector for Components */ |
|
19 int numProperties = 30; |
|
20 |
|
21 /* Grid size for gray scale analysis (presumably the number of horizontal and vertical divisions - TODO confirm) */ |
|
22 int NumHorizDiv = 5; |
|
23 int NumVertDiv = 5; |
|
24 |
|
25 // The next four are used in character grouping set in Page::extractComponents |
|
26 /* Group 0 - amo |
|
27 Group 1 - Descenders yjp |
|
28 Group 2 - Ascenders JPK |
|
29 Group 3 - Both descenders and ascenders () |
|
30 Group 4 - Floaters * - ` |
|
31 */ |
|
32 unsigned int NumCharGroups=5; |
|
33 int MaxVertSize = 50; // Max vert pixels in char (used for baseline) |
|
34 int BaseLineTolerance = 10; // How far in 1/x of line size from base is okay |
|
35 int TopLineTolerance = 10; // How far in 1/x of line size from top is okay |
|
36 // 20 = 5%, 10 = 10% |
|
37 int MinComponentSize = 16; // Minimum number of pixels in smallest character |
|
38 |
|
39 uchar CharBitsSet[256]; // Table of the number of bits set in each byte value 0-255 |
|
40 // Used for determining gray scale and pixel counts |
|
41 |
|
42 /** Some globals set in learn() or readLearnedChars(). The initializers below |
|
43 are just starting values **/ |
|
44 |
|
45 double MaxHWRatio = 0.0; |
|
46 double MinHWRatio = 1000; |
|
47 int MinWidth = 1000; // Min component width in learned set |
|
48 |
|
49 |
|
50 |
|
51 |
|
52 Component * LearnedChars; // Learned character averages /** NOT USED **/ |
|
53 Components * LearnedGroups=NULL; // Learned character list array, by group type |
|
54 |
|
55 |
|
56 /*** Some values for the Tcl/Tk interface. These variables can be |
|
57 set in the file link_vars.tcl without recompiling ***/ |
|
58 |
|
59 int ENABLE_USER_INTERFACE = 0; |
|
60 int VERY_LOW_CONFIDENCE = 150; |
|
61 int LOW_CONFIDENCE = 200; |
|
62 int DISPLAY_LINE_BOUNDARIES = 0; |
|
63 int DISPLAY_BOUNDING_BOXES = 0; // Bounding boxes are the boxes around components |
|
64 int SPELLCHECK = 0; |
|
65 int DISPLAY_IMAGE = 1; |
|
66 int DESKEW_METHOD = BITMAP_DESKEW; |
|
67 double SCALE_FACTOR = 0.5; |
|
68 |
|
69 void initCharBitsSet() |
|
70 // Initializes lookup table for the number of bits set in a uchar |
|
71 { |
|
72 int pixCount; |
|
73 for (int c = 0; c<256;c++) |
|
74 { |
|
75 pixCount = 0; |
|
76 for (int i = 7; i >=0; i--) |
|
77 pixCount +=((c>>i)&1); // if this is a black pixel |
|
78 CharBitsSet[c]=pixCount; |
|
79 } |
|
80 } |
|
81 |
|
char* backslashify(char* w)
/* Returns a newly malloc'ed copy of w in which every occurrence of one of
   the characters  $ [ ] \ { } ( ) ;  is preceded by a backslash.  All other
   characters are copied unchanged.

   The caller owns the returned buffer and must free() it.
   Returns NULL if w is NULL or if the allocation fails.

   NOTE(review): an earlier description of this function also listed the
   double quote, but the code has never escaped it; that behavior is kept.
   Confirm against the callers whether '"' should be escaped as well. */
{
  if (w == NULL)
    return NULL;

  size_t length = strlen(w);

  // Worst case every character needs a backslash (two bytes each), and
  // one more byte is required for the terminating '\0'.
  char* new_word = (char*)malloc(length * 2 + 1);
  if (new_word == NULL)
    return NULL;

  size_t new_word_pos = 0;
  for (size_t i = 0; i < length; i++)
    {
      switch (w[i])
        {
        case '$':
        case '[':
        case ']':
        case '\\':
        case '{':
        case '}':
        case '(':
        case ')':
        case ';':
          // Special character: emit the escaping backslash first.
          new_word[new_word_pos++] = '\\';
          break;
        default:
          break;
        }
      new_word[new_word_pos++] = w[i];
    }
  new_word[new_word_pos] = '\0';
  return new_word;
}
|
113 |
|
114 void invertBitsInBuffer(uchar * buf, int size) |
|
115 { |
|
116 for(int i = 0; i < size; i++) |
|
117 buf[i] = ~buf[i] ; |
|
118 |
|
119 } |
|
120 |
|
121 |
|
122 short int countBitsSet(uchar c) |
|
123 { |
|
124 int pixCount = 0; |
|
125 /* |
|
126 for (int i = 7; i >=0; i--) |
|
127 pixCount +=((c>>i)&1); // if this is a black pixel |
|
128 */ |
|
129 return CharBitsSet[c]; |
|
130 } |
|
131 |
|
132 int pixelsBetween(uchar * ar, int start, int end) |
|
133 { |
|
134 // Counts the number of black pixels between start and end |
|
135 int startCharNum = start / 8; |
|
136 int endCharNum = end / 8 ; |
|
137 int pixCount=0, startOffset, endOffset; |
|
138 uchar nextChar; |
|
139 |
|
140 startOffset = start - startCharNum*8; // first bit of range in first char |
|
141 endOffset = end- endCharNum*8 + 1 ; // first bit after end in last char |
|
142 |
|
143 // count the whole characters |
|
144 for (int i = startCharNum + 1; i < endCharNum; i++) |
|
145 { |
|
146 nextChar = ar[i]; |
|
147 pixCount += countBitsSet(nextChar); |
|
148 } |
|
149 // Now add in end peices |
|
150 // Get our part of the starting character |
|
151 // Add in just the last part of the char (get rid of hi bits) |
|
152 nextChar = ar[startCharNum] << startOffset; |
|
153 if (startCharNum != endCharNum ) |
|
154 { |
|
155 pixCount += countBitsSet(nextChar); |
|
156 // Get our part of the ending character, |
|
157 // Add in just the first endOffset bits (get rid of lo bits) |
|
158 nextChar = ar[endCharNum] >> (8 - endOffset); |
|
159 pixCount += countBitsSet(nextChar); |
|
160 } |
|
161 else |
|
162 { |
|
163 // just shift the adjusted starting char |
|
164 int shift = (8-endOffset)+startOffset; |
|
165 pixCount += countBitsSet(nextChar >> shift); |
|
166 } |
|
167 return pixCount; |
|
168 |
|
169 } |
|
170 |
|
171 |
|
172 void setRange(uchar ar[], int start, int end) |
|
173 // Sets bits from position start to position end |
|
174 { |
|
175 int startCharNum = start / 8; |
|
176 int endCharNum = end / 8 ; |
|
177 int startOffset, endOffset; |
|
178 |
|
179 startOffset = start - startCharNum*8; // first bit of range in first char |
|
180 endOffset = end- endCharNum*8 + 1 ; // first bit after end in last char |
|
181 |
|
182 // set the whole characters |
|
183 for (int i = startCharNum + 1; i < endCharNum; i++) |
|
184 { |
|
185 ar[i] = 255; |
|
186 } |
|
187 // Now set end peices |
|
188 if (startCharNum != endCharNum ) |
|
189 { |
|
190 ar[startCharNum] |= (255 >> startOffset); |
|
191 ar[endCharNum] |= (255 << (8 - endOffset)); |
|
192 } |
|
193 else // start and end char are the same |
|
194 { |
|
195 char mask = 255 >> startOffset; |
|
196 mask &= 255 << (8-endOffset); |
|
197 ar[endCharNum] |= mask; |
|
198 } |
|
199 |
|
200 }; |
|
201 |
|
202 |
|
203 |
|
204 |
|
205 |
|
206 |
|
207 |
|
208 |
|
209 |
|
210 |
|
211 |
|
212 |
|
213 |
|
214 |
|
215 |
|
216 |
|
217 |
|
218 |
|
219 |
|
220 |
|
221 |
|
222 |
|
223 |
|
224 |
|
225 |
|
226 |
|
227 |
|
228 |
|
229 |
|
230 |
|
231 |