reference/ocr-new/system.cc
changeset 0 6b8091ca909a
equal deleted inserted replaced
-1:000000000000 0:6b8091ca909a
       
     1 #include "system.h"
       
     2 #include "Point.h"
       
     3 
       
     4 
       
     5 // Global Variables
       
     6 Point NOPNT(-1,-1);      // Used for default entries
       
     7 /* Global variables used to fine tune OCR.  These can be adjusted
       
     8    without recompiling by setting them in link_vars.tcl */
       
     9 int NoiseTolerance = 1;      // Minumum number of pixels in a line of text
       
    10 int MinLineSize = 5;        // Minimum number of rows in a line of text
       
    11 int MinVertSeparation = 0;  // Minimum number of rows between lines of text
       
    12 int MinHorizSeparation = 1; // Minimum number of cols between characters
       
    13 int ConfidenceThreshold = 150; //Minimum confidence for some operations
       
    14 int JoinTolerance = 6;        // Max number of pixels joining fused chars.
       
    15 
       
    16 
       
    17 
       
    18 /* Number of properties in property vector for Components **/
       
    19 int numProperties = 30;
       
    20 
       
    21 /* Grid size for gray scale analysis */
       
    22 int NumHorizDiv = 5;
       
    23 int NumVertDiv = 5;
       
    24 
       
    25 // The next four are used in character grouping set in Page::extractComponents
       
    26 /* Group 0 - amo
       
    27    Group 1 - Descenders yjp
       
    28    Group 2 - Ascenders JPK
       
    29    Group 3 - Both descenders and Ascenders ()
       
    30    Group 4 - floaters * - `
       
    31 */
       
    32 unsigned int NumCharGroups=5; 
       
    33 int MaxVertSize = 50;        // Max vert pixels in char (used for baseline)
       
    34 int BaseLineTolerance = 10;  // How far in 1/x of line size from base is okay
       
    35 int TopLineTolerance  = 10; // How far in 1/x of line size from top is okay
       
    36                                 // 20 = 5%, 10 = 10%
       
    37 int MinComponentSize = 16;  // Minimum number of pixels in smallest character
       
    38 
       
    39 uchar CharBitsSet[256];    // Table of number of bits set in each num 0-256
       
    40                            // Used for determining gray scale and pixel counts
       
    41 
       
    42 /** Some globals set in learn() or readLearnedChars(). These are just starting
       
    43      values  **/
       
    44 
       
    45 double MaxHWRatio = 0.0;
       
    46 double MinHWRatio = 1000; 
       
    47 int MinWidth = 1000;        // Min component width in learned set
       
    48 
       
    49 
       
    50 
       
    51 
       
    52 Component * LearnedChars;   // Learned character averages  /** NOT USED **/
       
    53 Components * LearnedGroups=NULL; //Learned character list array by group type
       
    54 
       
    55 
       
    56 /*** Some values for TCL/TK interface.  These variables can be 
       
    57    set in the file link_vars.tcl without recompiling ***/
       
    58 
       
    59 int ENABLE_USER_INTERFACE = 0;
       
    60 int VERY_LOW_CONFIDENCE = 150;
       
    61 int LOW_CONFIDENCE = 200;
       
    62 int DISPLAY_LINE_BOUNDARIES = 0;
       
    63 int DISPLAY_BOUNDING_BOXES = 0;  // boxes around components 
       
    64 int SPELLCHECK = 0;
       
    65 int DISPLAY_IMAGE = 1;
       
    66 int DESKEW_METHOD = BITMAP_DESKEW;
       
    67 double ZONING_SCALE_FACTOR = .50;
       
    68 double SCALE_FACTOR = 0.5;
       
    69 
       
    70 TclMode mode = REGULAR;
       
    71 
       
    72 void initCharBitsSet()
       
    73 // Initializes lookup table for the number of bits set in a uchar
       
    74 {
       
    75   int pixCount;
       
    76   for (int c = 0; c<256;c++)
       
    77       {
       
    78 	pixCount = 0;
       
    79 	for (int i = 7; i >=0; i--)
       
    80 	  pixCount +=((c>>i)&1);      // if this is a black pixel
       
    81 	CharBitsSet[c]=pixCount;
       
    82       }
       
    83 }
       
    84 
       
    85 char* backslashify(char* w)
       
    86 /* backslashes all $ " [] {} () */
       
    87 {
       
    88   int length = strlen(w);
       
    89   char* new_word = (char*)malloc(length*2);
       
    90   int new_word_pos = 0;
       
    91   for(int i = 0; i < length; i++)
       
    92     {
       
    93       if((w[i] == '$') ||
       
    94 	 (w[i] == '[') ||
       
    95 	 (w[i] == ']') ||
       
    96 	 (w[i] == '\\') ||
       
    97 	 (w[i] == '{') ||
       
    98 	 (w[i] == '}') ||
       
    99 	 (w[i] == '(') ||
       
   100 	 (w[i] == ')') ||
       
   101 	 (w[i] == ';'))
       
   102 	{
       
   103 	  new_word[new_word_pos] = '\\';
       
   104 	  new_word[new_word_pos+1] = w[i];
       
   105 	  new_word_pos += 2;
       
   106 	}
       
   107       else
       
   108 	{
       
   109 	  new_word[new_word_pos] = w[i];
       
   110 	  new_word_pos += 1;
       
   111 	}
       
   112     }
       
   113   new_word[new_word_pos] = '\0';
       
   114   return new_word;
       
   115 }
       
   116 
       
   117 void invertBitsInBuffer(uchar * buf, int size)
       
   118 {
       
   119   for(int i = 0; i < size; i++)
       
   120     buf[i] = ~buf[i] ;
       
   121 
       
   122 }
       
   123 
       
   124 void clearBitsInBuffer(uchar * buf, int size)
       
   125 {
       
   126   for(int i = 0; i < size; i++)
       
   127       buf[i]=0;
       
   128 
       
   129 
       
   130 }
       
   131 
       
   132 short int countBitsSet(uchar c)
       
   133 {
       
   134   int pixCount = 0;
       
   135 /*
       
   136   for (int i = 7; i >=0; i--)
       
   137     pixCount +=((c>>i)&1);      // if this is a black pixel
       
   138 */
       
   139   if (!(CharBitsSet['f']))
       
   140       initCharBitsSet();
       
   141   return CharBitsSet[c];
       
   142 }
       
   143 
       
   144 int pixelsBetween(uchar * ar, int start, int end)
       
   145 {
       
   146  // Counts the number of black pixels between start and end
       
   147   int startCharNum = start / 8;
       
   148   int endCharNum = end / 8 ; 
       
   149   int pixCount=0, startOffset, endOffset;
       
   150   uchar nextChar;
       
   151 
       
   152   startOffset = start - startCharNum*8;  // first bit of range in first char
       
   153   endOffset = end- endCharNum*8 + 1 ; // first bit after end in last char
       
   154  
       
   155      // count the whole characters
       
   156       for (int i = startCharNum + 1; i < endCharNum; i++)
       
   157 	{
       
   158 	  nextChar = ar[i];
       
   159 	  pixCount += countBitsSet(nextChar);
       
   160 	}
       
   161 	  // Now add in end peices
       
   162 	  // Get our part of the starting character
       
   163 	  // Add in just the last part of the  char (get rid of hi bits)
       
   164 	  nextChar = ar[startCharNum] << startOffset;
       
   165 	  if (startCharNum != endCharNum )
       
   166              {
       
   167 	       pixCount += countBitsSet(nextChar);
       
   168 	       // Get our part of the ending character, 
       
   169 	       // Add in just the first endOffset bits (get rid of lo bits)
       
   170 	       nextChar = ar[endCharNum] >> (8 - endOffset);
       
   171 	       pixCount += countBitsSet(nextChar);
       
   172 	     }
       
   173          else
       
   174 	   {
       
   175 	     // just shift the adjusted starting char
       
   176 	     int shift = (8-endOffset)+startOffset;
       
   177 	     pixCount += countBitsSet(nextChar >> shift);
       
   178 	   }
       
   179   return pixCount;
       
   180 
       
   181 }
       
   182 
       
   183 
       
   184 void setRange(uchar ar[], int start, int end)
       
   185 // Sets bits from position start to position end 
       
   186 {
       
   187   int startCharNum = start / 8;
       
   188   int endCharNum = end / 8 ; 
       
   189   int startOffset, endOffset;
       
   190 
       
   191   startOffset = start - startCharNum*8;  // first bit of range in first char
       
   192   endOffset = end- endCharNum*8 + 1 ; // first bit after end in last char
       
   193  
       
   194      // set the whole characters
       
   195       for (int i = startCharNum + 1; i < endCharNum; i++)
       
   196 	{
       
   197 	  ar[i] = 255;
       
   198 	}
       
   199 	  // Now set end peices
       
   200 	  if (startCharNum != endCharNum )
       
   201              {
       
   202 	       ar[startCharNum] |=  (255 >> startOffset);
       
   203 	       ar[endCharNum] |= (255 << (8 - endOffset));
       
   204 	     }
       
   205          else  // start and end char are the same
       
   206 	   {
       
   207 	     char mask = 255 >> startOffset;
       
   208 	     mask  &= 255 << (8-endOffset);
       
   209 	     ar[endCharNum] |= mask;
       
   210 	   }
       
   211 
       
   212 };
       
   213 
       
   214 
       
   215 
       
   216 
       
   217 
       
   218 
       
   219 
       
   220 
       
   221 
       
   222 
       
   223 
       
   224 
       
   225 
       
   226 
       
   227 
       
   228 
       
   229 
       
   230 
       
   231 
       
   232 
       
   233 
       
   234 
       
   235 
       
   236 
       
   237 
       
   238 
       
   239 
       
   240 
       
   241 
       
   242