reference/ocr-new/system.cc
changeset 0 6b8091ca909a
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/reference/ocr-new/system.cc	Thu May 18 23:12:51 2006 +0200
@@ -0,0 +1,242 @@
+#include "system.h"
+#include "Point.h"
+
+
+// Global Variables
+Point NOPNT(-1,-1);      // Used for default entries
+/* Global variables used to fine tune OCR.  These can be adjusted
+   without recompiling by setting them in link_vars.tcl */
+int NoiseTolerance = 1;      // Minimum number of pixels in a line of text
+int MinLineSize = 5;        // Minimum number of rows in a line of text
+int MinVertSeparation = 0;  // Minimum number of rows between lines of text
+int MinHorizSeparation = 1; // Minimum number of cols between characters
+int ConfidenceThreshold = 150; // Minimum confidence for some operations
+int JoinTolerance = 6;        // Max number of pixels joining fused chars.
+
+
+
+/* Number of properties in the property vector for Components */
+int numProperties = 30;
+
+/* Grid size for gray scale analysis */
+int NumHorizDiv = 5;
+int NumVertDiv = 5;
+
+// The next four are used in character grouping set in Page::extractComponents
+/* Group 0 - amo
+   Group 1 - Descenders yjp
+   Group 2 - Ascenders JPK
+   Group 3 - Both descenders and Ascenders ()
+   Group 4 - floaters * - `
+*/
+unsigned int NumCharGroups=5; 
+int MaxVertSize = 50;        // Max vert pixels in char (used for baseline)
+int BaseLineTolerance = 10;  // How far in 1/x of line size from base is okay
+int TopLineTolerance  = 10; // How far in 1/x of line size from top is okay
+                                // 20 = 5%, 10 = 10%
+int MinComponentSize = 16;  // Minimum number of pixels in smallest character
+
+uchar CharBitsSet[256];    // Table of number of bits set in each value 0-255
+                           // Used for determining gray scale and pixel counts
+                           // Filled by initCharBitsSet(); all zero until then
+
+/** Some globals set in learn() or readLearnedChars(). These are just starting
+     values  **/
+
+double MaxHWRatio = 0.0;    // NOTE(review): presumably height/width ratio - confirm
+double MinHWRatio = 1000; 
+int MinWidth = 1000;        // Min component width in learned set
+
+
+
+
+Component * LearnedChars;   // Learned character averages  /** NOT USED **/
+Components * LearnedGroups=NULL; //Learned character list array by group type
+
+
+/*** Some values for the TCL/TK interface.  These variables can be
+   set in the file link_vars.tcl without recompiling ***/
+
+int ENABLE_USER_INTERFACE = 0;
+int VERY_LOW_CONFIDENCE = 150;
+int LOW_CONFIDENCE = 200;
+int DISPLAY_LINE_BOUNDARIES = 0;
+int DISPLAY_BOUNDING_BOXES = 0;  // boxes around components 
+int SPELLCHECK = 0;
+int DISPLAY_IMAGE = 1;
+int DESKEW_METHOD = BITMAP_DESKEW;  // BITMAP_DESKEW is declared outside this file
+double ZONING_SCALE_FACTOR = .50;
+double SCALE_FACTOR = 0.5;
+
+TclMode mode = REGULAR;  // TclMode and REGULAR are declared outside this file
+
+void initCharBitsSet()
+// Fills CharBitsSet so that entry v holds the number of 1 bits in byte v.
+{
+  for (int value = 0; value < 256; value++)
+    {
+      int ones = 0;
+      // Peel off the lowest bit until nothing is left; each 1 bit counts once.
+      for (int rest = value; rest != 0; rest >>= 1)
+        ones += rest & 1;
+      CharBitsSet[value] = ones;
+    }
+}
+
+char* backslashify(char* w)
+/* Returns a newly malloc'ed copy of w in which each of the characters
+   $ [ ] \ { } ( ) ;  is preceded by a backslash.  All other characters
+   (including the double quote) are copied unchanged.
+   The caller owns the returned string and must free() it.
+   Returns NULL if w is NULL or if memory cannot be allocated. */
+{
+  static const char special[] = "$[]\\{}();";
+
+  if (w == NULL)
+    return NULL;
+
+  size_t length = strlen(w);
+  // Worst case every character is escaped (two bytes each), plus one byte
+  // for the terminating NUL.  Allocating only length*2 overflowed the buffer
+  // in that worst case and for the empty string.
+  char* new_word = (char*)malloc(length*2 + 1);
+  if (new_word == NULL)
+    return NULL;
+
+  size_t new_word_pos = 0;
+  for (size_t i = 0; i < length; i++)
+    {
+      // w[i] is never '\0' here, so strchr cannot match the set's terminator.
+      if (strchr(special, w[i]) != NULL)
+        new_word[new_word_pos++] = '\\';
+      new_word[new_word_pos++] = w[i];
+    }
+  new_word[new_word_pos] = '\0';
+  return new_word;
+}
+
+void invertBitsInBuffer(uchar * buf, int size)
+// Flips every bit in the first `size` bytes of buf, in place.
+// Does nothing when size is zero or negative.
+{
+  int idx = 0;
+  while (idx < size)
+    {
+      buf[idx] ^= 0xFF;   // XOR with all ones is a bitwise NOT of the byte
+      idx++;
+    }
+}
+
+void clearBitsInBuffer(uchar * buf, int size)
+// Sets the first `size` bytes of buf to zero.
+// Does nothing when size is zero or negative.
+{
+  for (uchar * cursor = buf; size > 0; size--)
+    *cursor++ = 0;
+}
+
+short int countBitsSet(uchar c)
+// Returns the number of 1 bits in c, looked up in the CharBitsSet table.
+{
+  // The table is built on first use.  CharBitsSet is a zero-initialized
+  // global and entry 0xFF becomes 8 once initCharBitsSet() has run, so a
+  // zero in that slot means the table has not been filled yet.
+  if (CharBitsSet[0xFF] == 0)
+    initCharBitsSet();
+  return CharBitsSet[c];
+}
+
+int pixelsBetween(uchar * ar, int start, int end)
+// Returns how many bits are set (black pixels) in ar from bit position
+// `start` through bit position `end`, both inclusive.  Bit positions run
+// from the most significant bit of ar[0] downwards.
+// NOTE(review): the arithmetic assumes 0 <= start <= end - confirm callers.
+{
+  const int firstByte = start / 8;
+  const int lastByte  = end / 8;
+  const int skipFront = start % 8;     // unwanted high bits of the first byte
+  const int skipBack  = 7 - end % 8;   // unwanted low bits of the last byte
+
+  // Shifting left and storing back into a uchar drops the skipped high bits.
+  const uchar head = ar[firstByte] << skipFront;
+
+  if (firstByte == lastByte)
+    {
+      // Whole range lies in one byte: also push the unwanted low bits (and
+      // the zeros shifted in above) off the right-hand side.
+      return countBitsSet(head >> (skipBack + skipFront));
+    }
+
+  // Partial first byte, then the whole bytes in between, then the partial
+  // last byte with its low bits shifted away.
+  int total = countBitsSet(head);
+  for (int idx = firstByte + 1; idx < lastByte; idx++)
+    total += countBitsSet(ar[idx]);
+  total += countBitsSet(ar[lastByte] >> skipBack);
+  return total;
+}
+
+
+void setRange(uchar ar[], int start, int end)
+// Sets (to 1) every bit of ar from bit position `start` through bit position
+// `end`, both inclusive.  Bit positions run from the most significant bit of
+// ar[0] downwards.  Bits outside the range are left unchanged.
+// NOTE(review): the arithmetic assumes 0 <= start <= end - confirm callers.
+{
+  const int startCharNum = start / 8;
+  const int endCharNum = end / 8;
+  const int startOffset = start % 8;   // first bit of range in first char
+  const int bitsAfterEnd = 7 - end % 8; // bits after the range in last char
+
+  // Both masks are held in uchar, so the result does not depend on whether
+  // plain char is signed on this platform.
+  const uchar fromStart = 255 >> startOffset;          // start bit to low end
+  const uchar uptoEnd = (uchar)(255 << bitsAfterEnd);  // high end to end bit
+
+  if (startCharNum == endCharNum)
+    {
+      // start and end char are the same: set only the overlap of both masks
+      ar[startCharNum] |= (fromStart & uptoEnd);
+      return;
+    }
+
+  ar[startCharNum] |= fromStart;
+  // set the whole characters in between
+  for (int i = startCharNum + 1; i < endCharNum; i++)
+    ar[i] = 255;
+  ar[endCharNum] |= uptoEnd;
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+