src/jdict/DictIndex.java
author viric@llimona
Sat, 18 Aug 2007 23:18:41 +0200
changeset 43 988a367ae8d2
parent 38 45c0a27c902f
permissions -rw-r--r--
Fixing a bug on the threshold between Binary Search and Prefix Sequence search

package jdict;

import java.io.*;
import java.util.Vector;

public class DictIndex
{
    BlockFile in;

    String index_basename;
    String defs_basename;
    int index_id;

    int top_index; /* File size of the Index file */

    Vector first_words;

    public DictIndex(String dictname)
    {
        index_basename = "/dicts/x-" + dictname + ".index";
        in = new BlockFile(index_basename);
        get_top_index();
        /*
        System.gc();
        Main.showMem();
        */
        prepare_first_words();
        in.enableMarks();
    }

    public void prepare_first_words()
    {
        int bsize = in.get_block_size();
        int max_blocks = top_index / bsize;
        int i;
        String str;

        if (max_blocks * bsize < top_index)
            max_blocks++;

        first_words = new Vector();

        /* We assume offset 0 */
        
        /* First word appart */
        str = getName();
        first_words.addElement(new FirstWord(str, 0));

        /* Next words */
        for(i=1; i<max_blocks; ++i)
        {
            int offset = bsize * i;
            int pos;

            try {
                in.seekFromStart(offset);
            } catch (IOException e)
            {
                System.out.println("Cannot go back to start in First Words.");
                break;
            }
            readUntilNewLine();
            pos = in.getAbsPos();
            str = getName();
            first_words.addElement(new FirstWord(str, pos));
            System.out.println("New first word: " +
                    str + " - " + new Integer(pos).toString());
        }

        try {
            in.seekFromStart(0);
        } catch (IOException e)
        {
            System.out.println("Cannot go back to start in First Words.");
            return;
        }
    }

    public int [] getBlockBounds(String word)
    {
        int i;
        int res[] = new int[2];

        res[0] = 0;
        res[1] = top_index;

        for(i=0; i < first_words.size(); ++i)
        {
            FirstWord tmp;
            int cmp;
            int pos;
            tmp = (FirstWord) first_words.elementAt(i);
            pos = tmp.getPos();
            cmp = word.compareTo(tmp.getString());
            if (cmp <= 0)
            {
                res[1] = pos;
                break;
            }

            res[0] = pos;
        }

        return res;
    }

    private void get_top_index()
    {
        InputStream bfile = getClass().getResourceAsStream("/dicts/TOPINDEX");
        /* 10 bytes is enough for a simple int */
        byte array[] = new byte[10];
        int total;
        try {
            total = bfile.read(array, 0, 10);
        } catch (IOException e)
        {
            System.out.println("Cannot open TOPINDEX.");
            top_index = 0;
            return;
        }
        /* This will have '\n' */
        String str = new String(array, 0, total - 1 /* - \n */);
        top_index = Integer.parseInt(str);
        System.out.println("Top Index: " +
                new Integer(top_index).toString());
    }

    public String  getName()
    {
        byte tmp[] = new byte[100];
        int i;

        i = 0;
        do
        {
            try {
                int c = in.read();
                if (c == -1)
                {
                    System.out.println("EOF in getName");
                    break;
                }
                tmp[i] = (byte) c;
            } catch (IOException e)
            {
                System.out.println("IO Exception: " +  e.getMessage());
                break;
            }
            i += 1;
        } while (tmp[i-1] != '\t' /* tab */ && i < 100);

        if (i == 51 || i == 0)
            return null;

        String result;
        try {
            result = new String(tmp, 0, i-1, "UTF-8");
        } catch (UnsupportedEncodingException e)
        {
            System.out.println("Unsupported encoding.");
            return null;
        }

        return result;
    }

    public byte[]  getNameBytes()
    {
        byte tmp[] = new byte[100];
        int i;

        i = 0;
        do
        {
            try {
                int c = in.read();
                if (c == -1)
                {
                    System.out.println("EOF in getName");
                    break;
                }
                tmp[i] = (byte) c;
            } catch (IOException e)
            {
                System.out.println("IO Exception: " +  e.getMessage());
                break;
            }
            i += 1;
        } while (tmp[i-1] != '\t' /* tab */ && i < 100);

        if (i == 51 || i == 0)
            return null;

        return tmp;
    }

    public int getLength()
    {
        /* We reuse getOffset, as it breaks on \n too */
        return getOffset();
    }

    public int getOffset()
    {
        byte tmp[] = new byte[100];
        int i;

        i = 0;
        do
        {
            try {
                int c = in.read();
                if (c == -1)
                {
                    System.out.println("EOF in getOffest");
                    return -1;
                }
                tmp[i] = (byte) c;
            } catch (IOException e)
            {
                System.out.println("IO Exception: " +  e.getMessage());
                return -1;
            }
            i += 1;
        } while (tmp[i-1] != '\t' /* tab */ && tmp[i-1] != '\n' && i < 100);

        if (i == 51)
            return -1;

        return IA5toNumber(tmp, i-1);
    }

    public void readUntilNewLine()
    {
        byte tmp;

        do
        {
            try {
                int c = in.read();
                if (c == -1)
                {
                    System.out.println("EOF in readUntilNewLine");
                    return;
                }
                tmp = (byte) c;
            } catch (IOException e)
            {
                System.out.println("IO Exception: " +  e.getMessage());
                return;
            }
        } while (tmp != '\n');

    }

    public int IA5value(byte letter)
    {
        switch(letter)
        {
            case 'A': return 0;
            case 'B': return 1;
            case 'C': return 2;
            case 'D': return 3;
            case 'E': return 4;
            case 'F': return 5;
            case 'G': return 6;
            case 'H': return 7;
            case 'I': return 8;
            case 'J': return 9;
            case 'K': return 10;
            case 'L': return 11;
            case 'M': return 12;
            case 'N': return 13;
            case 'O': return 14;
            case 'P': return 15;
            case 'Q': return 16;
            case 'R': return 17;
            case 'S': return 18;
            case 'T': return 19;
            case 'U': return 20;
            case 'V': return 21;
            case 'W': return 22;
            case 'X': return 23;
            case 'Y': return 24;
            case 'Z': return 25;
            case 'a': return 26;
            case 'b': return 27;
            case 'c': return 28;
            case 'd': return 29;
            case 'e': return 30;
            case 'f': return 31;
            case 'g': return 32;
            case 'h': return 33;
            case 'i': return 34;
            case 'j': return 35;
            case 'k': return 36;
            case 'l': return 37;
            case 'm': return 38;
            case 'n': return 39;
            case 'o': return 40;
            case 'p': return 41;
            case 'q': return 42;
            case 'r': return 43;
            case 's': return 44;
            case 't': return 45;
            case 'u': return 46;
            case 'v': return 47;
            case 'w': return 48;
            case 'x': return 49;
            case 'y': return 50;
            case 'z': return 51;
            case '0': return 52;
            case '1': return 53;
            case '2': return 54;
            case '3': return 55;
            case '4': return 56;
            case '5': return 57;
            case '6': return 58;
            case '7': return 59;
            case '8': return 60;
            case '9': return 61;
            case '+': return 62;
            case '/': return 63;
            default:
                      return 0;
        }
    }

    public int IA5toNumber(byte array[], int length)
    {
        int i = 0;
        int value = 0;

        while (i < length)
        {
            /* DEBUG
            System.out.println("Value1: " + new Integer(value).toString());
            System.out.println("Array[i]: " + (char) array[i]);
            System.out.println("IA5Value: " +
                    new Integer(IA5value(array[i])).toString());
                    */

            value = IA5value(array[i]) + value * 64;

            /*System.out.println("Value2: " + new Integer(value).toString());
             * */
            i += 1;
        }
        return value;
    }

    public String EntryToString()
    {
        String name = getName();
        int offset = getOffset();
        int length = getLength();

        return new String(name + " " + new Integer(offset).toString() +
                " " + new Integer(length).toString());
    }

    public boolean WordMatches(String w1, String w2)
    {
        /* System.out.println("Comparing " + w1 + " to " + w2);*/
        if (w1.equals(w2))
            return true;
        return false;
    }

    public Vector SearchDefinition(String word)
    {
        return SearchDefinition(word, -1);
    }

    /* if max >0, limit the search. */
    public Vector SearchDefinition(String word, int max)
    {
        try {
            in.seekFromStart(0);
        } catch (IOException e)
        {
            System.out.println("Cannot go back to start in search def.");
        }

        Vector results = new Vector();

        return SearchNextDefinition(results, word, max);
    }

    /* if max >0, limit the search. */
    public Vector SearchNextDefinition(Vector results, String word, int max)
    {
        int count = 0;
        System.out.println("Searching " + word);

        String test;
        do
        {
            if (max > 0 && count > max)
            {
                /* Void results */
                break;
            }
            test = getName();

            if (test == null)
                break;
            if (WordMatches(word,test))
            {
                int offset = getOffset();
                int length = getLength();
                System.out.println("Definition for " + word + " at " +
                        new Integer(offset).toString() +  " length " +
                        new Integer(length).toString());
                /*String definition = defs.getDefinition(offset, length);*/
                Vorto vorto = new Vorto(test, offset, length);

                results.addElement(vorto);
            } else
            {
                readUntilNewLine();
            }
            ++count;
        } while (test != null);

        return results;
    }

    /* if max >0, limit the search. */
    public Vector SearchNextPrefixes(Vector results, String word, int max)
    {
        int count = 0;
        System.out.println("Searching " + word);

        String test;
        do
        {
            if (max > 0 && count > max)
            {
                /* Void results */
                break;
            }
            test = getName();

            if (test == null)
                break;
            boolean does_start_with;
            does_start_with = test.startsWith(word);
            System.out.println("Prefix linear comparing to " + test +
                    ": " + does_start_with);
            if (does_start_with)
            {
                int offset = getOffset();
                int length = getLength();
                System.out.println("Definition for " + word + "("
                        + test + ") at " +
                        new Integer(offset).toString() +  " length " +
                        new Integer(length).toString());
                /*String definition = defs.getDefinition(offset, length);*/
                Vorto vorto = new Vorto(test, offset, length);

                results.addElement(vorto);
                /*
                System.out.println("Until now: " +
                        new Integer(results.size()).toString()); */
            } else
            {
                break;
            }
            ++count;
        } while (test != null);

        return results;
    }

    private boolean byteStartsWith(byte a1[], byte a2[])
    {
        int i;

        i = 0;
        do
        {
            if (a1[i] != a2[i])
                return false;
            ++i;
        }
        while (i < a1.length);

        return true;
    }

    private int byteCompare(byte a1[], byte a2[])
    {
        int i;
        int min;

        min = a1.length;
        if (a2.length < min)
            min = a2.length;

        i = 0;
        do
        {
            if (a1[i] < a2[i])
                return -1;
            else if (a1[i] > a2[i])
                return 1;
            ++i;
        }
        while (i < min);

        if (i < a1.length)
            return 1;
        else if (i < a2.length)
            return -1;

        return 0;
    }

    public String bytesToString(byte a[])
    {
        String result;
        try {
            result = new String(a, 0, a.length, "UTF-8");
        } catch (UnsupportedEncodingException e)
        {
            System.out.println("Unsupported encoding.");
            result = null;
        }

        return result;
    }

    public byte[] stringToBytes(String s)
    {
        byte res[] = {};

        try {
            res = s.getBytes("UTF-8");
        } catch (UnsupportedEncodingException e)
        {
            System.out.println("Unsupported encoding.");
        }

        return res;
    }

    public Vector BinarySearchDefinition(String word)
    {
        Vector results = new Vector();
        int bounds[];
        int pivot = top_index / 2;
        int step = top_index / 2;
        boolean found = false;
        boolean found_laste = false;
        String test;

        /*
        System.gc();
        */
        Main.showMem();

        bounds = getBlockBounds(word);
        final int min = bounds[0];
        final int max = bounds[1];
        pivot = min + ((max - min) / 2);
        step = (max - min) / 2;

        System.out.println("Block bounds: " +
                new Integer(min).toString() + " to " +
                new Integer(max).toString());

        /*
        targetword = stringToBytes(word);
        if (targetword.length == 0)
            return null;
            */

        do
        {
            /*
             * System.out.println("Pivoting to " +
                    new Integer(pivot).toString());
                    */
            try {
                in.seekFromStart(pivot);
            } catch (IOException e)
            {
                System.out.println("Seek from start error");
                return results;
            }
            /*System.out.println("Searching new line");*/
            readUntilNewLine();
            /*System.out.println("Getting name bytes");*/
            test = getName();
            if (test == null) /* EOF probably */
            {
                /* if EOF, go back */
                step = step / 2;
                pivot = pivot - step;
                continue;
            }
            int comparision = word.compareTo(test);
            System.out.println("Binary comparing to " + test +
                    ": " + new Integer(comparision).toString());
            if (comparision <= 0)
            {
                /* If == 0, then we don't know that it is the
                 * FIRST match possible in the dictionary.
                 * There may be more than one entry for the same word, and
                 * we may not have found still the first. */
                step = step / 2;
                pivot = pivot - step;
            } else if (comparision > 0)
            {
                step = step / 2;
                pivot = pivot + step;
            }
        } while (step > 0);

        /* If we found the exact word in a non-last comparision,
         * it's possible that the final binary search points us
         * to the word PREVIOUS to the good match. */
        if (!test.startsWith(word))
        {
            System.out.println("The word " + test +
                    " doesn't start with " + word);
            readUntilNewLine();
        }
        else
        {
            try {
                in.seekFromStart(pivot);
            } catch (IOException e)
            {
                System.out.println("Seek from start error");
                return results;
            }
            readUntilNewLine();
        }


        /* Add the prefixes */
        SearchNextPrefixes(results, word, 49); /* Max 50 prefixes. Match+nexts*/

        return results;
    }
}