reference/ocr-simple/system.cc
changeset 0 6b8091ca909a
equal deleted inserted replaced
-1:000000000000 0:6b8091ca909a
       
     1 #include "system.h"
       
     2 #include "Point.h"
       
     3 
       
     4 
       
     5 // Global Variables
       
     6 Point NOPNT(-1,-1);      // Used for default entries
       
     7 /* Global variables used to fine tune OCR.  These can be adjusted
       
     8    without recompiling by setting them in link_vars.tcl */
       
     9 int NoiseTolerance = 1;      // Minimum number of pixels in a line of text
       
    10 int MinLineSize = 5;        // Minimum number of rows in a line of text
       
    11 int MinVertSeparation = 0;  // Minimum number of rows between lines of text
       
    12 int MinHorizSeparation = 1; // Minimum number of cols between characters
       
    13 int ConfidenceThreshold = 150; // Minimum confidence for some operations
       
    14 int JoinTolerance = 6;        // Max number of pixels joining fused chars.
       
    15 
       
    16 
       
    17 
       
    18 /* Number of properties in the property vector for Components */
       
    19 int numProperties = 30;
       
    20 
       
    21 /* Grid size for gray scale analysis */
       
    22 int NumHorizDiv = 5;
       
    23 int NumVertDiv = 5;
       
    24 
       
    25 // The next four are used in character grouping set in Page::extractComponents
       
    26 /* Group 0 - amo
       
    27    Group 1 - Descenders yjp
       
    28    Group 2 - Ascenders JPK
       
    29    Group 3 - Both descenders and Ascenders ()
       
    30    Group 4 - floaters * - `
       
    31 */
       
    32 unsigned int NumCharGroups=5; 
       
    33 int MaxVertSize = 50;        // Max vert pixels in char (used for baseline)
       
    34 int BaseLineTolerance = 10;  // How far in 1/x of line size from base is okay
       
    35 int TopLineTolerance  = 10; // How far in 1/x of line size from top is okay
       
    36                                 // 20 = 5%, 10 = 10%
       
    37 int MinComponentSize = 16;  // Minimum number of pixels in smallest character
       
    38 
       
    39 uchar CharBitsSet[256];    // Number of bits set in each byte value 0-255; filled by initCharBitsSet()
       
    40                            // Used for determining gray scale and pixel counts
       
    41 
       
    42 /** Some globals set in learn() or readLearnedChars(). These are just starting
       
    43      values  **/
       
    44 
       
    45 double MaxHWRatio = 0.0;
       
    46 double MinHWRatio = 1000; 
       
    47 int MinWidth = 1000;        // Min component width in learned set
       
    48 
       
    49 
       
    50 
       
    51 
       
    52 Component * LearnedChars;   // Learned character averages  /** NOT USED **/
       
    53 Components * LearnedGroups=NULL; // Learned character list array by group type
       
    54 
       
    55 
       
    56 /*** Some values for the TCL/TK interface.  These variables can be
       
    57    set in the file link_vars.tcl without recompiling ***/
       
    58 
       
    59 int ENABLE_USER_INTERFACE = 0;
       
    60 int VERY_LOW_CONFIDENCE = 150;
       
    61 int LOW_CONFIDENCE = 200;
       
    62 int DISPLAY_LINE_BOUNDARIES = 0;
       
    63 int DISPLAY_BOUNDING_BOXES = 0;  // boxes around components
       
    64 int SPELLCHECK = 0;
       
    65 int DISPLAY_IMAGE = 1;
       
    66 int DESKEW_METHOD = BITMAP_DESKEW;
       
    67 double SCALE_FACTOR = 0.5;
       
    68 
       
    69 void initCharBitsSet()
       
    70 // Initializes lookup table for the number of bits set in a uchar
       
    71 {
       
    72   int pixCount;
       
    73   for (int c = 0; c<256;c++)
       
    74       {
       
    75 	pixCount = 0;
       
    76 	for (int i = 7; i >=0; i--)
       
    77 	  pixCount +=((c>>i)&1);      // if this is a black pixel
       
    78 	CharBitsSet[c]=pixCount;
       
    79       }
       
    80 }
       
    81 
       
    /*
     * Returns a newly malloc'ed copy of w in which every occurrence of
     *     $  [  ]  \  {  }  (  )  ;
     * is preceded by a backslash.  All other characters are copied unchanged.
     *
     * The caller owns the returned buffer and must free() it.
     * Returns NULL if w is NULL or if the allocation fails.
     *
     * NOTE(review): the double quote character is NOT escaped, although an
     * earlier comment on this function said it was -- confirm what callers need.
     */
    char* backslashify(char* w)
    {
      if (w == NULL)
        return NULL;

      size_t length = strlen(w);

      // Worst case every character needs a backslash (2 bytes each), plus
      // one byte for the terminating NUL.
      char* new_word = (char*)malloc(length * 2 + 1);
      if (new_word == NULL)
        return NULL;

      size_t new_word_pos = 0;
      for (size_t i = 0; i < length; i++)
        {
          // w[i] is never '\0' here, so strchr cannot match the terminator
          if (strchr("$[]\\{}();", w[i]) != NULL)
            new_word[new_word_pos++] = '\\';
          new_word[new_word_pos++] = w[i];
        }
      new_word[new_word_pos] = '\0';
      return new_word;
    }
       
   113 
       
   114 void invertBitsInBuffer(uchar * buf, int size)
       
   115 {
       
   116   for(int i = 0; i < size; i++)
       
   117     buf[i] = ~buf[i] ;
       
   118 
       
   119 }
       
   120 
       
   121 
       
   122 short int countBitsSet(uchar c)
       
   123 {
       
   124   int pixCount = 0;
       
   125 /*
       
   126   for (int i = 7; i >=0; i--)
       
   127     pixCount +=((c>>i)&1);      // if this is a black pixel
       
   128 */
       
   129   return CharBitsSet[c];
       
   130 }
       
   131 
       
   132 int pixelsBetween(uchar * ar, int start, int end)
       
   133 {
       
   134  // Counts the number of black pixels between start and end
       
   135   int startCharNum = start / 8;
       
   136   int endCharNum = end / 8 ; 
       
   137   int pixCount=0, startOffset, endOffset;
       
   138   uchar nextChar;
       
   139 
       
   140   startOffset = start - startCharNum*8;  // first bit of range in first char
       
   141   endOffset = end- endCharNum*8 + 1 ; // first bit after end in last char
       
   142  
       
   143      // count the whole characters
       
   144       for (int i = startCharNum + 1; i < endCharNum; i++)
       
   145 	{
       
   146 	  nextChar = ar[i];
       
   147 	  pixCount += countBitsSet(nextChar);
       
   148 	}
       
   149 	  // Now add in end peices
       
   150 	  // Get our part of the starting character
       
   151 	  // Add in just the last part of the  char (get rid of hi bits)
       
   152 	  nextChar = ar[startCharNum] << startOffset;
       
   153 	  if (startCharNum != endCharNum )
       
   154              {
       
   155 	       pixCount += countBitsSet(nextChar);
       
   156 	       // Get our part of the ending character, 
       
   157 	       // Add in just the first endOffset bits (get rid of lo bits)
       
   158 	       nextChar = ar[endCharNum] >> (8 - endOffset);
       
   159 	       pixCount += countBitsSet(nextChar);
       
   160 	     }
       
   161          else
       
   162 	   {
       
   163 	     // just shift the adjusted starting char
       
   164 	     int shift = (8-endOffset)+startOffset;
       
   165 	     pixCount += countBitsSet(nextChar >> shift);
       
   166 	   }
       
   167   return pixCount;
       
   168 
       
   169 }
       
   170 
       
   171 
       
   172 void setRange(uchar ar[], int start, int end)
       
   173 // Sets bits from position start to position end 
       
   174 {
       
   175   int startCharNum = start / 8;
       
   176   int endCharNum = end / 8 ; 
       
   177   int startOffset, endOffset;
       
   178 
       
   179   startOffset = start - startCharNum*8;  // first bit of range in first char
       
   180   endOffset = end- endCharNum*8 + 1 ; // first bit after end in last char
       
   181  
       
   182      // set the whole characters
       
   183       for (int i = startCharNum + 1; i < endCharNum; i++)
       
   184 	{
       
   185 	  ar[i] = 255;
       
   186 	}
       
   187 	  // Now set end peices
       
   188 	  if (startCharNum != endCharNum )
       
   189              {
       
   190 	       ar[startCharNum] |=  (255 >> startOffset);
       
   191 	       ar[endCharNum] |= (255 << (8 - endOffset));
       
   192 	     }
       
   193          else  // start and end char are the same
       
   194 	   {
       
   195 	     char mask = 255 >> startOffset;
       
   196 	     mask  &= 255 << (8-endOffset);
       
   197 	     ar[endCharNum] |= mask;
       
   198 	   }
       
   199 
       
   200 };
       
   201 
       
   202 
       
   203 
       
   204 
       
   205 
       
   206 
       
   207 
       
   208 
       
   209 
       
   210 
       
   211 
       
   212 
       
   213 
       
   214 
       
   215 
       
   216 
       
   217 
       
   218 
       
   219 
       
   220 
       
   221 
       
   222 
       
   223 
       
   224 
       
   225 
       
   226 
       
   227 
       
   228 
       
   229 
       
   230 
       
   231