LUCENE-2792: add FST impl
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1044834 13f79535-47bb-0310-9956-ffa450edef68
mikemccand committed Dec 12, 2010
1 parent c45253d commit 994aaec
Showing 22 changed files with 4,288 additions and 137 deletions.
2 changes: 2 additions & 0 deletions .hgignore
@@ -1,2 +1,4 @@
syntax: glob
*/build/*
*.class

@@ -31,15 +31,16 @@
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.fst.Builder;
import org.apache.lucene.util.automaton.fst.BytesRefFSTEnum;
import org.apache.lucene.util.automaton.fst.FST;
import org.apache.lucene.util.automaton.fst.PositiveIntOutputs;
import org.apache.lucene.util.automaton.fst.PairOutputs;

import java.io.IOException;
import java.util.Comparator;
import java.util.Map;
import java.util.Set;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.SortedMap;
import java.util.Iterator;

class SimpleTextFieldsReader extends FieldsProducer {

@@ -116,73 +117,39 @@ public TermsEnum terms() throws IOException {
private class SimpleTextTermsEnum extends TermsEnum {
private final IndexInput in;
private final boolean omitTF;
private BytesRef current;
private int docFreq;
private long docsStart;
private boolean ended;
private final TreeMap<BytesRef,TermData> allTerms;
private Iterator<Map.Entry<BytesRef,TermData>> iter;
private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum;

public SimpleTextTermsEnum(TreeMap<BytesRef,TermData> allTerms, boolean omitTF) throws IOException {
public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException {
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
this.allTerms = allTerms;
this.omitTF = omitTF;
iter = allTerms.entrySet().iterator();
fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
}

public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {

final SortedMap<BytesRef,TermData> tailMap = allTerms.tailMap(text);

if (tailMap.isEmpty()) {
current = null;
fstEnum.reset();
//System.out.println("seek to text=" + text.utf8ToString());
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.advance(text);
if (result == null) {
//System.out.println(" end");
return SeekStatus.END;
} else {
current = tailMap.firstKey();
final TermData td = tailMap.get(current);
docsStart = td.docsStart;
docFreq = td.docFreq;
iter = tailMap.entrySet().iterator();
assert iter.hasNext();
iter.next();
if (current.equals(text)) {
return SeekStatus.FOUND;
} else {
return SeekStatus.NOT_FOUND;
}
}

/*
if (current != null) {
final int cmp = current.compareTo(text);
if (cmp == 0) {
return SeekStatus.FOUND;
} else if (cmp > 0) {
ended = false;
in.seek(fieldStart);
}
} else {
ended = false;
in.seek(fieldStart);
}
//System.out.println(" got text=" + term.utf8ToString());
PairOutputs.Pair<Long,Long> pair = result.output;
docsStart = pair.output1;
docFreq = pair.output2.intValue();

// Naive!! This just scans... would be better to do
// up-front scan to build in-RAM index
BytesRef b;
while((b = next()) != null) {
final int cmp = b.compareTo(text);
if (cmp == 0) {
ended = false;
if (result.input.equals(text)) {
//System.out.println(" match docsStart=" + docsStart);
return SeekStatus.FOUND;
} else if (cmp > 0) {
ended = false;
} else {
//System.out.println(" not match docsStart=" + docsStart);
return SeekStatus.NOT_FOUND;
}
}
current = null;
ended = true;
return SeekStatus.END;
*/
}

@Override
@@ -192,56 +159,20 @@ public void cacheCurrentTerm() {
@Override
public BytesRef next() throws IOException {
assert !ended;

if (iter.hasNext()) {
Map.Entry<BytesRef,TermData> ent = iter.next();
current = ent.getKey();
TermData td = ent.getValue();
docFreq = td.docFreq;
docsStart = td.docsStart;
return current;
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next();
if (result != null) {
final PairOutputs.Pair<Long,Long> pair = result.output;
docsStart = pair.output1;
docFreq = pair.output2.intValue();
return result.input;
} else {
current = null;
return null;
}

/*
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
ended = true;
current = null;
return null;
} else {
assert scratch.startsWith(TERM): "got " + scratch.utf8ToString();
docsStart = in.getFilePointer();
final int len = scratch.length - TERM.length;
if (len > scratch2.length) {
scratch2.grow(len);
}
System.arraycopy(scratch.bytes, TERM.length, scratch2.bytes, 0, len);
scratch2.length = len;
current = scratch2;
docFreq = 0;
long lineStart = 0;
while(true) {
lineStart = in.getFilePointer();
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD) || scratch.startsWith(TERM)) {
break;
}
if (scratch.startsWith(DOC)) {
docFreq++;
}
}
in.seek(lineStart);
return current;
}
*/
}

@Override
public BytesRef term() {
return current;
return fstEnum.current().input;
}

@Override
@@ -512,10 +443,7 @@ private class SimpleTextTerms extends Terms {
private final String field;
private final long termsStart;
private final boolean omitTF;

// NOTE: horribly, horribly RAM consuming, but then
// SimpleText should never be used in production
private final TreeMap<BytesRef,TermData> allTerms = new TreeMap<BytesRef,TermData>();
private FST<PairOutputs.Pair<Long,Long>> fst;

private final BytesRef scratch = new BytesRef(10);

@@ -527,6 +455,8 @@ public SimpleTextTerms(String field, long termsStart) throws IOException {
}

private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs));
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRef lastTerm = new BytesRef(10);
@@ -536,16 +466,14 @@ private void loadTerms() throws IOException {
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
if (lastDocsStart != -1) {
allTerms.put(new BytesRef(lastTerm),
new TermData(lastDocsStart, docFreq));
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
}
break;
} else if (scratch.startsWith(DOC)) {
docFreq++;
} else if (scratch.startsWith(TERM)) {
if (lastDocsStart != -1) {
allTerms.put(new BytesRef(lastTerm),
new TermData(lastDocsStart, docFreq));
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
}
lastDocsStart = in.getFilePointer();
final int len = scratch.length - TERM.length;
@@ -557,11 +485,23 @@ docFreq = 0;
docFreq = 0;
}
}
fst = b.finish();
/*
PrintStream ps = new PrintStream("out.dot");
fst.toDot(ps);
ps.close();
System.out.println("SAVED out.dot");
*/
//System.out.println("FST " + fst.sizeInBytes());
}

@Override
public TermsEnum iterator() throws IOException {
return new SimpleTextTermsEnum(allTerms, omitTF);
if (fst != null) {
return new SimpleTextTermsEnum(fst, omitTF);
} else {
return TermsEnum.EMPTY;
}
}

@Override
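A note on the SimpleTextFieldsReader change above: the TreeMap<BytesRef,TermData> terms index is replaced by an FST whose output per term is a (docsStart, docFreq) pair, walked with a BytesRefFSTEnum. Below is a minimal, hedged sketch of that wiring, using only calls that appear in this diff; the example terms and output values are made up, and the BytesRef(String) constructor is assumed to be available in this version of the API.

// Sketch (not part of the commit): build an FST mapping sorted terms to
// (docsStart, docFreq) pairs and look terms up with BytesRefFSTEnum,
// mirroring loadTerms() and SimpleTextTermsEnum.seek() above.
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.fst.Builder;
import org.apache.lucene.util.automaton.fst.BytesRefFSTEnum;
import org.apache.lucene.util.automaton.fst.FST;
import org.apache.lucene.util.automaton.fst.PairOutputs;
import org.apache.lucene.util.automaton.fst.PositiveIntOutputs;

public class FstTermsSketch {
  public static void main(String[] args) throws Exception {
    // Outputs are pairs of non-negative longs: file pointer + doc freq.
    final PositiveIntOutputs longs = PositiveIntOutputs.getSingleton(false);
    final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(longs, longs);
    final Builder<PairOutputs.Pair<Long,Long>> b =
      new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);

    // Terms must be added in sorted order, as loadTerms() does while scanning the file.
    b.add(new BytesRef("apple"),  new PairOutputs.Pair<Long,Long>(100L, 3L));
    b.add(new BytesRef("banana"), new PairOutputs.Pair<Long,Long>(250L, 1L));
    final FST<PairOutputs.Pair<Long,Long>> fst = b.finish();

    // advance() positions on the first term >= the target, which is how
    // SimpleTextTermsEnum.seek() decides between FOUND, NOT_FOUND and END.
    final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum =
      new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
    fstEnum.reset();
    final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> hit =
      fstEnum.advance(new BytesRef("b"));
    if (hit != null) {
      System.out.println(hit.input.utf8ToString()
        + " docsStart=" + hit.output.output1 + " docFreq=" + hit.output.output2);
    }
  }
}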
28 changes: 27 additions & 1 deletion lucene/src/java/org/apache/lucene/util/ArrayUtil.java
@@ -19,6 +19,7 @@

import java.util.Collection;
import java.util.Comparator;
import java.lang.reflect.Array;

/**
* Methods for manipulating arrays.
@@ -392,7 +393,7 @@ public static int hashCode(char[] array, int start, int end) {
}

/**
* Returns hash of chars in range start (inclusive) to
* Returns hash of bytes in range start (inclusive) to
* end (inclusive)
*/
public static int hashCode(byte[] array, int start, int end) {
@@ -429,6 +430,31 @@ public static boolean equals(char[] left, int offsetLeft, char[] right, int offs
return false;
}

public static <T> T[] grow(T[] array, int minSize) {
if (array.length < minSize) {
@SuppressWarnings("unchecked") final T[] newArray =
(T[]) Array.newInstance(array.getClass().getComponentType(), oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJ_REF));
System.arraycopy(array, 0, newArray, 0, array.length);
return newArray;
} else
return array;
}

public static <T> T[] grow(T[] array) {
return grow(array, 1 + array.length);
}

public static <T> T[] shrink(T[] array, int targetSize) {
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_OBJ_REF);
if (newSize != array.length) {
@SuppressWarnings("unchecked") final T[] newArray =
(T[]) Array.newInstance(array.getClass().getComponentType(), newSize);
System.arraycopy(array, 0, newArray, 0, newSize);
return newArray;
} else
return array;
}

// Since Arrays.equals doesn't implement offsets for equals
/**
* See if two array slices are the same.
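For reference, a hedged usage sketch of the generic ArrayUtil.grow/shrink overloads added above; the array contents and sizes are illustrative, not from the commit.

// Sketch: the new generic overloads preserve existing elements and return the
// same array instance when no resize is needed.
import org.apache.lucene.util.ArrayUtil;

public class ArrayUtilGrowSketch {
  public static void main(String[] args) {
    String[] names = new String[2];
    names[0] = "a";
    names[1] = "b";

    // grow(T[], minSize): returns an oversized copy only if length < minSize.
    names = ArrayUtil.grow(names, 10);

    // grow(T[]): grows by at least one slot.
    names = ArrayUtil.grow(names);

    // shrink(T[], targetSize): may hand back a smaller copy once targetSize is
    // far enough below the current length (see getShrinkSize).
    names = ArrayUtil.shrink(names, 3);

    System.out.println("capacity now " + names.length);
  }
}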
41 changes: 40 additions & 1 deletion lucene/src/java/org/apache/lucene/util/IntsRef.java
@@ -21,7 +21,7 @@
* existing int[].
*
* @lucene.internal */
public final class IntsRef {
public final class IntsRef implements Comparable<IntsRef> {

public int[] ints;
public int offset;
@@ -81,6 +81,31 @@ public boolean intsEquals(IntsRef other) {
}
}

/** Signed int order comparison */
public int compareTo(IntsRef other) {
if (this == other) return 0;

final int[] aInts = this.ints;
int aUpto = this.offset;
final int[] bInts = other.ints;
int bUpto = other.offset;

final int aStop = aUpto + Math.min(this.length, other.length);

while(aUpto < aStop) {
int aInt = aInts[aUpto++];
int bInt = bInts[bUpto++];
if (aInt > bInt) {
return 1;
} else if (aInt < bInt) {
return -1;
}
}

// One is a prefix of the other, or, they are equal:
return this.length - other.length;
}

public void copy(IntsRef other) {
if (ints == null) {
ints = new int[other.length];
@@ -97,4 +122,18 @@ public void grow(int newLength) {
ints = ArrayUtil.grow(ints, newLength);
}
}

public String toString() {
StringBuilder sb = new StringBuilder();
sb.append('[');
final int end = offset + length;
for(int i=offset;i<end;i++) {
if (i > offset) {
sb.append(' ');
}
sb.append(Integer.toHexString(ints[i]));
}
sb.append(']');
return sb.toString();
}
}
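A small, hedged sketch of the IntsRef.compareTo and toString additions above: signed per-int comparison over the valid slice, with a proper prefix sorting before the longer sequence. It assumes IntsRef has a no-argument constructor and that ints/offset/length are public fields, as the rest of the class suggests; the values are illustrative.

// Sketch (assumptions noted above): exercise the new compareTo and toString.
import org.apache.lucene.util.IntsRef;

public class IntsRefCompareSketch {
  public static void main(String[] args) {
    final IntsRef a = new IntsRef();
    a.ints = new int[] {1, 2};
    a.length = 2;
    final IntsRef b = new IntsRef();
    b.ints = new int[] {1, 2, 3};
    b.length = 3;
    final IntsRef c = new IntsRef();
    c.ints = new int[] {1, 5};
    c.length = 2;

    System.out.println(a.compareTo(b)); // negative: a is a prefix of b (length 2 vs 3)
    System.out.println(a.compareTo(c)); // negative: 2 < 5 at the first differing position
    System.out.println(a);              // "[1 2]" (each element printed in hex)
  }
}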
@@ -93,13 +93,7 @@ public synchronized byte[] getByteBlock() {
@Override
public synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) {
final int numBlocks = Math.min(maxBufferedBlocks - freeBlocks, end - start);
final int size = freeBlocks + numBlocks;
if (size >= freeByteBlocks.length) {
final byte[][] newBlocks = new byte[ArrayUtil.oversize(size,
RamUsageEstimator.NUM_BYTES_OBJ_REF)][];
System.arraycopy(freeByteBlocks, 0, newBlocks, 0, freeBlocks);
freeByteBlocks = newBlocks;
}
freeByteBlocks = ArrayUtil.grow(freeByteBlocks, freeBlocks + numBlocks);
final int stop = start + numBlocks;
for (int i = start; i < stop; i++) {
freeByteBlocks[freeBlocks++] = blocks[i];
@@ -40,7 +40,6 @@
import java.util.Set;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

/**
* Finite-state automaton with regular expression operations.
@@ -281,9 +280,7 @@ public State[] getNumberedStates() {
worklist.add(t.to);
t.to.number = upto;
if (upto == numberedStates.length) {
final State[] newArray = new State[ArrayUtil.oversize(1+upto, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
System.arraycopy(numberedStates, 0, newArray, 0, upto);
numberedStates = newArray;
numberedStates = ArrayUtil.grow(numberedStates);
}
numberedStates[upto] = t.to;
upto++;