reference/ocr-new/system.cc
author viric@llimona
Thu, 18 May 2006 23:12:51 +0200
changeset 0 6b8091ca909a
permissions -rw-r--r--
Init from working directory of svn repository.

#include "system.h"
#include "Point.h"


// Global Variables
Point NOPNT(-1,-1);      // Used for default entries
/* Global variables used to fine tune OCR.  These can be adjusted
   without recompiling by setting them in link_vars.tcl */
int NoiseTolerance = 1;      // Minimum number of pixels in a line of text
int MinLineSize = 5;        // Minimum number of rows in a line of text
int MinVertSeparation = 0;  // Minimum number of rows between lines of text
int MinHorizSeparation = 1; // Minimum number of cols between characters
int ConfidenceThreshold = 150; // Minimum confidence for some operations
int JoinTolerance = 6;        // Max number of pixels joining fused chars.



/* Number of properties in the property vector for Components */
int numProperties = 30;

/* Grid size for gray scale analysis */
int NumHorizDiv = 5;
int NumVertDiv = 5;

// The next four are used in character grouping set in Page::extractComponents
/* Group 0 - amo
   Group 1 - Descenders yjp
   Group 2 - Ascenders JPK
   Group 3 - Both descenders and Ascenders ()
   Group 4 - floaters * - `
*/
unsigned int NumCharGroups=5; 
int MaxVertSize = 50;        // Max vert pixels in char (used for baseline)
int BaseLineTolerance = 10;  // How far in 1/x of line size from base is okay
int TopLineTolerance  = 10; // How far in 1/x of line size from top is okay
                                // 20 = 5%, 10 = 10%
int MinComponentSize = 16;  // Minimum number of pixels in smallest character

uchar CharBitsSet[256];    // Table of the number of bits set in each byte
                           // value 0-255 (filled by initCharBitsSet()).
                           // Used for determining gray scale and pixel counts

/** Some globals set in learn() or readLearnedChars(). These are just starting
     values  **/

double MaxHWRatio = 0.0;
double MinHWRatio = 1000; 
int MinWidth = 1000;        // Min component width in learned set




Component * LearnedChars;   // Learned character averages  /** NOT USED **/
Components * LearnedGroups=NULL; // Learned character list array by group type


/*** Some values for the TCL/TK interface.  These variables can be
   set in the file link_vars.tcl without recompiling ***/

int ENABLE_USER_INTERFACE = 0;
int VERY_LOW_CONFIDENCE = 150;
int LOW_CONFIDENCE = 200;
int DISPLAY_LINE_BOUNDARIES = 0;
int DISPLAY_BOUNDING_BOXES = 0;  // boxes around components 
int SPELLCHECK = 0;
int DISPLAY_IMAGE = 1;
int DESKEW_METHOD = BITMAP_DESKEW;
double ZONING_SCALE_FACTOR = .50;
double SCALE_FACTOR = 0.5;

TclMode mode = REGULAR;

void initCharBitsSet()
// Fills the CharBitsSet lookup table: after this call CharBitsSet[v]
// holds the number of 1 bits in the 8-bit value v, for v = 0..255.
{
  for (int value = 0; value < 256; value++)
    {
      int ones = 0;
      // Test each of the eight bit positions in turn.
      for (int bit = 0; bit < 8; bit++)
        {
          if (value & (1 << bit))
            ones++;
        }
      CharBitsSet[value] = ones;
    }
}

char* backslashify(char* w)
/* Returns a newly malloc'ed copy of w in which every occurrence of the
   characters  $ [ ] \ { } ( ) ;  is preceded by a backslash.  All other
   characters (including the double quote) are copied unchanged.
   Returns NULL if w is NULL or if the allocation fails.
   The caller owns the returned buffer and must free() it. */
{
  if (w == NULL)
    return NULL;

  size_t length = strlen(w);

  // Worst case: every character gets a backslash (2 bytes each), plus
  // one byte for the terminating '\0'.
  char* new_word = (char*)malloc(length*2 + 1);
  if (new_word == NULL)
    return NULL;

  size_t new_word_pos = 0;
  for (size_t i = 0; i < length; i++)
    {
      switch (w[i])
        {
        case '$':
        case '[':
        case ']':
        case '\\':
        case '{':
        case '}':
        case '(':
        case ')':
        case ';':
          // Special character: emit the escaping backslash first.
          new_word[new_word_pos++] = '\\';
          break;
        default:
          break;
        }
      new_word[new_word_pos++] = w[i];
    }
  new_word[new_word_pos] = '\0';
  return new_word;
}

void invertBitsInBuffer(uchar * buf, int size)
// Replaces each of the first `size` elements of buf with its bitwise
// complement.  Does nothing when size <= 0.
{
  uchar * cursor = buf;
  for (int remaining = size; remaining > 0; remaining--)
    {
      *cursor = ~(*cursor);
      cursor++;
    }
}

void clearBitsInBuffer(uchar * buf, int size)
// Sets each of the first `size` elements of buf to zero.
// Does nothing when size <= 0.
{
  int idx = 0;
  while (idx < size)
    {
      buf[idx] = 0;
      idx++;
    }
}

short int countBitsSet(uchar c)
// Returns the number of 1 bits in c, looked up in the CharBitsSet table.
{
  // The table is filled lazily on first use.  'f' is 0x66 in ASCII, which
  // has four bits set, so a zero entry at that index means
  // initCharBitsSet() has not been run yet.
  if (!(CharBitsSet['f']))
      initCharBitsSet();
  return CharBitsSet[c];
}

int pixelsBetween(uchar * ar, int start, int end)
// Counts the bits that are set (black pixels) in bit positions start..end,
// inclusive, of the packed array ar.  Bit positions are numbered from the
// most significant bit of ar[0]: position p lives in ar[p / 8].
{
  int firstByte = start / 8;
  int lastByte = end / 8;
  int firstBit = start % 8;        // offset of `start` inside firstByte
  int bitsInLast = end % 8 + 1;    // how many leading bits of lastByte count

  // Shifting left (and truncating to uchar) drops the bits before `start`.
  uchar head = ar[firstByte] << firstBit;

  if (firstByte == lastByte)
    {
      // The range sits inside one byte: shift back right so that only the
      // bits up to and including `end` survive.
      int drop = (8 - bitsInLast) + firstBit;
      return countBitsSet(head >> drop);
    }

  // Partial first byte.
  int total = countBitsSet(head);

  // Whole bytes strictly between the first and last byte.
  for (int idx = firstByte + 1; idx < lastByte; idx++)
    total += countBitsSet(ar[idx]);

  // Partial last byte: keep only its leading bitsInLast bits.
  uchar tail = ar[lastByte] >> (8 - bitsInLast);
  total += countBitsSet(tail);

  return total;
}


void setRange(uchar ar[], int start, int end)
// Sets the bits in positions start..end (inclusive) of the packed array ar.
// Bit positions are numbered from the most significant bit of ar[0]:
// position p lives in ar[p / 8].  Bits outside the range are left unchanged.
{
  int startCharNum = start / 8;
  int endCharNum = end / 8;
  int startOffset = start - startCharNum*8;  // first bit of range in first char
  int endOffset = end - endCharNum*8 + 1;    // first bit after end in last char

  // Masks are kept in uchar (not plain char) so that values above 127 are
  // stored without an implementation-defined signed conversion.
  uchar headMask = 255 >> startOffset;        // bits from startOffset to end of byte
  uchar tailMask = 255 << (8 - endOffset);    // the leading endOffset bits of a byte

  if (startCharNum == endCharNum)
    {
      // start and end fall in the same char: only the overlap is set
      ar[endCharNum] |= (uchar)(headMask & tailMask);
      return;
    }

  // Partial first char, whole chars in between, partial last char.
  ar[startCharNum] |= headMask;
  for (int i = startCharNum + 1; i < endCharNum; i++)
    {
      ar[i] = 255;
    }
  ar[endCharNum] |= tailMask;
}