0
|
1 |
#include "system.h"
|
|
2 |
#include "Point.h"
|
|
3 |
|
|
4 |
|
|
5 |
// Global Variables
|
|
6 |
Point NOPNT(-1,-1); // Point constructed with (-1,-1); used for default entries
|
|
7 |
/* Global variables used to fine-tune OCR.  They can be adjusted without
   recompiling by setting them in link_vars.tcl. */
int NoiseTolerance      = 1;   // Minimum number of pixels in a line of text
int MinLineSize         = 5;   // Minimum number of rows in a line of text
int MinVertSeparation   = 0;   // Minimum number of rows between lines of text
int MinHorizSeparation  = 1;   // Minimum number of columns between characters
int ConfidenceThreshold = 150; // Minimum confidence for some operations
int JoinTolerance       = 6;   // Maximum number of pixels joining fused characters
|
|
15 |
|
|
16 |
|
|
17 |
|
|
18 |
/* Number of properties in the property vector for Components */
int numProperties = 30;

/* Grid size for gray scale analysis: horizontal and vertical divisions */
int NumHorizDiv = 5;
int NumVertDiv  = 5;
|
|
24 |
|
|
25 |
// Values used in character grouping, which is set in Page::extractComponents.
/* Character groups:
     Group 0 - amo
     Group 1 - Descenders yjp
     Group 2 - Ascenders JPK
     Group 3 - Both descenders and ascenders ()
     Group 4 - Floaters * - `
*/
unsigned int NumCharGroups = 5;
int MaxVertSize = 50;       // Max vertical pixels in a char (used for baseline)
// The two tolerances below are expressed as 1/x of the line size:
// 20 = 5%, 10 = 10%.
int BaseLineTolerance = 10; // How far from the base line is okay
int TopLineTolerance = 10;  // How far from the top line is okay
int MinComponentSize = 16;  // Minimum number of pixels in the smallest character
|
|
38 |
|
|
39 |
uchar CharBitsSet[256]; // Table of the number of bits set in each value 0-255
|
|
40 |
// Used for determining gray scale and pixel counts
|
|
41 |
|
|
42 |
/** Globals set in learn() or readLearnedChars().  The initializers below
    are only starting values. **/
double MaxHWRatio = 0.0;
double MinHWRatio = 1000.0;
int MinWidth = 1000;        // Minimum component width in the learned set
|
|
48 |
|
|
49 |
|
|
50 |
|
|
51 |
|
|
52 |
Component * LearnedChars; // Learned character averages /** NOT USED **/
|
|
53 |
Components * LearnedGroups=NULL; //Learned character list array by group type
|
|
54 |
|
|
55 |
|
|
56 |
/*** Some values for TCL/TK interface. These variables can be
|
|
57 |
set in the file link_vars.tcl without recompiling ***/
|
|
58 |
|
|
59 |
int ENABLE_USER_INTERFACE = 0;
|
|
60 |
int VERY_LOW_CONFIDENCE = 150;
|
|
61 |
int LOW_CONFIDENCE = 200;
|
|
62 |
int DISPLAY_LINE_BOUNDARIES = 0;
|
|
63 |
int DISPLAY_BOUNDING_BOXES = 0; // boxes around components
|
|
64 |
int SPELLCHECK = 0;
|
|
65 |
int DISPLAY_IMAGE = 1;
|
|
66 |
int DESKEW_METHOD = BITMAP_DESKEW;
|
|
67 |
double ZONING_SCALE_FACTOR = .50;
|
|
68 |
double SCALE_FACTOR = 0.5;
|
|
69 |
|
|
70 |
TclMode mode = REGULAR;
|
|
71 |
|
|
72 |
void initCharBitsSet()
|
|
73 |
// Initializes lookup table for the number of bits set in a uchar
|
|
74 |
{
|
|
75 |
int pixCount;
|
|
76 |
for (int c = 0; c<256;c++)
|
|
77 |
{
|
|
78 |
pixCount = 0;
|
|
79 |
for (int i = 7; i >=0; i--)
|
|
80 |
pixCount +=((c>>i)&1); // if this is a black pixel
|
|
81 |
CharBitsSet[c]=pixCount;
|
|
82 |
}
|
|
83 |
}
|
|
84 |
|
|
85 |
char* backslashify(char* w)
/* Returns a newly allocated copy of w in which each of the characters
     $ [ ] \ { } ( ) ;
   is preceded by a backslash.  All other characters (including the
   double quote) are copied unchanged.

   The result is allocated with malloc(); the caller is responsible for
   releasing it with free().  Returns NULL if w is NULL or if the
   allocation fails. */
{
  if (w == NULL)
    return NULL;

  size_t length = strlen(w);

  /* Worst case every character is escaped (two bytes each), plus one
     byte for the terminating NUL. */
  char* new_word = (char*)malloc(length * 2 + 1);
  if (new_word == NULL)
    return NULL;

  size_t new_word_pos = 0;
  for (size_t i = 0; i < length; i++)
    {
      switch (w[i])
        {
        case '$':
        case '[':
        case ']':
        case '\\':
        case '{':
        case '}':
        case '(':
        case ')':
        case ';':
          new_word[new_word_pos++] = '\\';
          break;
        default:
          break;
        }
      new_word[new_word_pos++] = w[i];
    }
  new_word[new_word_pos] = '\0';
  return new_word;
}
|
|
116 |
|
|
117 |
void invertBitsInBuffer(uchar * buf, int size)
// Replaces each of the first size elements of buf with its bitwise
// complement.  Does nothing when size is zero or negative.
{
  uchar * cursor = buf;
  for (int left = size; left > 0; --left, ++cursor)
    *cursor = ~*cursor;
}
|
|
123 |
|
|
124 |
void clearBitsInBuffer(uchar * buf, int size)
// Sets each of the first size elements of buf to zero.  Does nothing
// when size is zero or negative.
{
  int idx = 0;
  while (idx < size)
    {
      buf[idx] = 0;
      ++idx;
    }
}
|
|
131 |
|
|
132 |
short int countBitsSet(uchar c)
// Returns the number of bits set in c, looked up in the CharBitsSet table.
{
  // The entry for 'f' (a value with bits set) doubles as an "is the table
  // filled in yet?" flag: while it is still zero, build the table first.
  if (CharBitsSet['f'] == 0)
    initCharBitsSet();

  return CharBitsSet[c];
}
|
|
143 |
|
|
144 |
int pixelsBetween(uchar * ar, int start, int end)
// Returns the number of set bits (black pixels) between bit positions
// start and end, both inclusive, in the packed bit array ar.
// Within each element, lower bit positions are the higher-order bits.
// NOTE(review): the shifts assume uchar is an 8-bit unsigned type -- confirm.
{
  const int firstByte = start / 8;
  const int lastByte = end / 8;
  const int skipBits = start % 8;     // bits of the first byte that precede start
  const int keepBits = end % 8 + 1;   // bits of the last byte up to and including end

  // Shifting left and storing back into a uchar drops the bits before start.
  const uchar head = ar[firstByte] << skipBits;

  // Range lies inside one byte: additionally shift out the bits after end.
  if (firstByte == lastByte)
    return countBitsSet(head >> ((8 - keepBits) + skipBits));

  // Partial first byte.
  int total = countBitsSet(head);

  // Whole bytes strictly between the first and the last one.
  for (int idx = firstByte + 1; idx < lastByte; ++idx)
    total += countBitsSet(ar[idx]);

  // Partial last byte: keep only its high-order keepBits bits.
  total += countBitsSet(ar[lastByte] >> (8 - keepBits));

  return total;
}
|
|
182 |
|
|
183 |
|
|
184 |
void setRange(uchar ar[], int start, int end)
// Sets every bit from position start through position end (inclusive)
// in the packed bit array ar.  Within each element, lower bit positions
// are the higher-order bits.  Bits outside the range are left unchanged.
// NOTE(review): the masks assume uchar is an 8-bit unsigned type -- confirm.
{
  int startCharNum = start / 8;
  int endCharNum = end / 8;
  int startOffset = start % 8;      // first bit of range in first char
  int endOffset = end % 8 + 1;      // first bit after end in last char

  // Set the whole characters strictly between the two end characters.
  for (int i = startCharNum + 1; i < endCharNum; i++)
    ar[i] = 255;

  // Masks are kept in an unsigned type so that a value such as 255 is
  // never squeezed into a (possibly signed) plain char.
  uchar headMask = (uchar)(255 >> startOffset);        // bits from start onward
  uchar tailMask = (uchar)(255 << (8 - endOffset));    // bits up to and including end

  // Now set the end pieces.
  if (startCharNum != endCharNum)
    {
      ar[startCharNum] |= headMask;
      ar[endCharNum] |= tailMask;
    }
  else // start and end char are the same: only the overlap is set
    {
      ar[endCharNum] |= (uchar)(headMask & tailMask);
    }
}
|
|
213 |
|
|
214 |
|
|
215 |
|
|
216 |
|
|
217 |
|
|
218 |
|
|
219 |
|
|
220 |
|
|
221 |
|
|
222 |
|
|
223 |
|
|
224 |
|
|
225 |
|
|
226 |
|
|
227 |
|
|
228 |
|
|
229 |
|
|
230 |
|
|
231 |
|
|
232 |
|
|
233 |
|
|
234 |
|
|
235 |
|
|
236 |
|
|
237 |
|
|
238 |
|
|
239 |
|
|
240 |
|
|
241 |
|
|
242 |
|