LUCENE-2792: add FST impl
git-svn-id: https://svn.apache.org/repos/asf/lucene/dev/trunk@1044834 13f79535-47bb-0310-9956-ffa450edef68
mikemccand committed Dec 12, 2010
1 parent c45253d commit 994aaec
Showing 22 changed files with 4,288 additions and 137 deletions.
2 changes: 2 additions & 0 deletions .hgignore
@@ -1,2 +1,4 @@
syntax: glob
*/build/*
*.class

@@ -31,15 +31,16 @@
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.fst.Builder;
import org.apache.lucene.util.automaton.fst.BytesRefFSTEnum;
import org.apache.lucene.util.automaton.fst.FST;
import org.apache.lucene.util.automaton.fst.PositiveIntOutputs;
import org.apache.lucene.util.automaton.fst.PairOutputs;

import java.io.IOException;
import java.util.Comparator;
import java.util.Map;
import java.util.Set;
import java.util.HashMap;
import java.util.TreeMap;
import java.util.SortedMap;
import java.util.Iterator;

class SimpleTextFieldsReader extends FieldsProducer {

@@ -116,73 +117,39 @@ public TermsEnum terms() throws IOException {
private class SimpleTextTermsEnum extends TermsEnum {
private final IndexInput in;
private final boolean omitTF;
private BytesRef current;
private int docFreq;
private long docsStart;
private boolean ended;
private final TreeMap<BytesRef,TermData> allTerms;
private Iterator<Map.Entry<BytesRef,TermData>> iter;
private final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum;

public SimpleTextTermsEnum(TreeMap<BytesRef,TermData> allTerms, boolean omitTF) throws IOException {
public SimpleTextTermsEnum(FST<PairOutputs.Pair<Long,Long>> fst, boolean omitTF) throws IOException {
this.in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
this.allTerms = allTerms;
this.omitTF = omitTF;
iter = allTerms.entrySet().iterator();
fstEnum = new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
}

public SeekStatus seek(BytesRef text, boolean useCache /* ignored */) throws IOException {

final SortedMap<BytesRef,TermData> tailMap = allTerms.tailMap(text);

if (tailMap.isEmpty()) {
current = null;
fstEnum.reset();
//System.out.println("seek to text=" + text.utf8ToString());
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.advance(text);
if (result == null) {
//System.out.println(" end");
return SeekStatus.END;
} else {
current = tailMap.firstKey();
final TermData td = tailMap.get(current);
docsStart = td.docsStart;
docFreq = td.docFreq;
iter = tailMap.entrySet().iterator();
assert iter.hasNext();
iter.next();
if (current.equals(text)) {
return SeekStatus.FOUND;
} else {
return SeekStatus.NOT_FOUND;
}
}

/*
if (current != null) {
final int cmp = current.compareTo(text);
if (cmp == 0) {
return SeekStatus.FOUND;
} else if (cmp > 0) {
ended = false;
in.seek(fieldStart);
}
} else {
ended = false;
in.seek(fieldStart);
}
//System.out.println(" got text=" + term.utf8ToString());
PairOutputs.Pair<Long,Long> pair = result.output;
docsStart = pair.output1;
docFreq = pair.output2.intValue();

// Naive!! This just scans... would be better to do
// up-front scan to build in-RAM index
BytesRef b;
while((b = next()) != null) {
final int cmp = b.compareTo(text);
if (cmp == 0) {
ended = false;
if (result.input.equals(text)) {
//System.out.println(" match docsStart=" + docsStart);
return SeekStatus.FOUND;
} else if (cmp > 0) {
ended = false;
} else {
//System.out.println(" not match docsStart=" + docsStart);
return SeekStatus.NOT_FOUND;
}
}
current = null;
ended = true;
return SeekStatus.END;
*/
}

@Override
@@ -192,56 +159,20 @@ public void cacheCurrentTerm() {
@Override
public BytesRef next() throws IOException {
assert !ended;

if (iter.hasNext()) {
Map.Entry<BytesRef,TermData> ent = iter.next();
current = ent.getKey();
TermData td = ent.getValue();
docFreq = td.docFreq;
docsStart = td.docsStart;
return current;
final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> result = fstEnum.next();
if (result != null) {
final PairOutputs.Pair<Long,Long> pair = result.output;
docsStart = pair.output1;
docFreq = pair.output2.intValue();
return result.input;
} else {
current = null;
return null;
}

/*
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
ended = true;
current = null;
return null;
} else {
assert scratch.startsWith(TERM): "got " + scratch.utf8ToString();
docsStart = in.getFilePointer();
final int len = scratch.length - TERM.length;
if (len > scratch2.length) {
scratch2.grow(len);
}
System.arraycopy(scratch.bytes, TERM.length, scratch2.bytes, 0, len);
scratch2.length = len;
current = scratch2;
docFreq = 0;
long lineStart = 0;
while(true) {
lineStart = in.getFilePointer();
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD) || scratch.startsWith(TERM)) {
break;
}
if (scratch.startsWith(DOC)) {
docFreq++;
}
}
in.seek(lineStart);
return current;
}
*/
}

@Override
public BytesRef term() {
return current;
return fstEnum.current().input;
}

@Override
@@ -512,10 +443,7 @@ private class SimpleTextTerms extends Terms {
private final String field;
private final long termsStart;
private final boolean omitTF;

// NOTE: horribly, horribly RAM consuming, but then
// SimpleText should never be used in production
private final TreeMap<BytesRef,TermData> allTerms = new TreeMap<BytesRef,TermData>();
private FST<PairOutputs.Pair<Long,Long>> fst;

private final BytesRef scratch = new BytesRef(10);

@@ -527,6 +455,8 @@ public SimpleTextTerms(String field, long termsStart) throws IOException {
}

private void loadTerms() throws IOException {
PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
Builder<PairOutputs.Pair<Long,Long>> b = new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, new PairOutputs<Long,Long>(posIntOutputs, posIntOutputs));
IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
in.seek(termsStart);
final BytesRef lastTerm = new BytesRef(10);
@@ -536,16 +466,14 @@ private void loadTerms() throws IOException {
readLine(in, scratch);
if (scratch.equals(END) || scratch.startsWith(FIELD)) {
if (lastDocsStart != -1) {
allTerms.put(new BytesRef(lastTerm),
new TermData(lastDocsStart, docFreq));
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
}
break;
} else if (scratch.startsWith(DOC)) {
docFreq++;
} else if (scratch.startsWith(TERM)) {
if (lastDocsStart != -1) {
allTerms.put(new BytesRef(lastTerm),
new TermData(lastDocsStart, docFreq));
b.add(lastTerm, new PairOutputs.Pair<Long,Long>(lastDocsStart, Long.valueOf(docFreq)));
}
lastDocsStart = in.getFilePointer();
final int len = scratch.length - TERM.length;
@@ -557,11 +485,23 @@ docFreq = 0;
docFreq = 0;
}
}
fst = b.finish();
/*
PrintStream ps = new PrintStream("out.dot");
fst.toDot(ps);
ps.close();
System.out.println("SAVED out.dot");
*/
//System.out.println("FST " + fst.sizeInBytes());
}

@Override
public TermsEnum iterator() throws IOException {
return new SimpleTextTermsEnum(allTerms, omitTF);
if (fst != null) {
return new SimpleTextTermsEnum(fst, omitTF);
} else {
return TermsEnum.EMPTY;
}
}

@Override
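A note on the SimpleTextFieldsReader change above: the TreeMap<BytesRef,TermData> terms index is replaced by an FST whose output per term is a (docsStart, docFreq) pair, walked with a BytesRefFSTEnum. Below is a minimal, hedged sketch of that wiring, using only calls that appear in this diff; the example terms and output values are made up, and the BytesRef(String) constructor is assumed to be available in this version of the API.

// Sketch (not part of the commit): build an FST mapping sorted terms to
// (docsStart, docFreq) pairs and look terms up with BytesRefFSTEnum,
// mirroring loadTerms() and SimpleTextTermsEnum.seek() above.
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.automaton.fst.Builder;
import org.apache.lucene.util.automaton.fst.BytesRefFSTEnum;
import org.apache.lucene.util.automaton.fst.FST;
import org.apache.lucene.util.automaton.fst.PairOutputs;
import org.apache.lucene.util.automaton.fst.PositiveIntOutputs;

public class FstTermsSketch {
  public static void main(String[] args) throws Exception {
    // Outputs are pairs of non-negative longs: file pointer + doc freq.
    final PositiveIntOutputs longs = PositiveIntOutputs.getSingleton(false);
    final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(longs, longs);
    final Builder<PairOutputs.Pair<Long,Long>> b =
      new Builder<PairOutputs.Pair<Long,Long>>(FST.INPUT_TYPE.BYTE1, 0, 0, true, outputs);

    // Terms must be added in sorted order, as loadTerms() does while scanning the file.
    b.add(new BytesRef("apple"),  new PairOutputs.Pair<Long,Long>(100L, 3L));
    b.add(new BytesRef("banana"), new PairOutputs.Pair<Long,Long>(250L, 1L));
    final FST<PairOutputs.Pair<Long,Long>> fst = b.finish();

    // advance() positions on the first term >= the target, which is how
    // SimpleTextTermsEnum.seek() decides between FOUND, NOT_FOUND and END.
    final BytesRefFSTEnum<PairOutputs.Pair<Long,Long>> fstEnum =
      new BytesRefFSTEnum<PairOutputs.Pair<Long,Long>>(fst);
    fstEnum.reset();
    final BytesRefFSTEnum.InputOutput<PairOutputs.Pair<Long,Long>> hit =
      fstEnum.advance(new BytesRef("b"));
    if (hit != null) {
      System.out.println(hit.input.utf8ToString()
        + " docsStart=" + hit.output.output1 + " docFreq=" + hit.output.output2);
    }
  }
}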
28 changes: 27 additions & 1 deletion lucene/src/java/org/apache/lucene/util/ArrayUtil.java
@@ -19,6 +19,7 @@

import java.util.Collection;
import java.util.Comparator;
import java.lang.reflect.Array;

/**
* Methods for manipulating arrays.
@@ -392,7 +393,7 @@ public static int hashCode(char[] array, int start, int end) {
}

/**
* Returns hash of chars in range start (inclusive) to
* Returns hash of bytes in range start (inclusive) to
* end (inclusive)
*/
public static int hashCode(byte[] array, int start, int end) {
@@ -429,6 +430,31 @@ public static boolean equals(char[] left, int offsetLeft, char[] right, int offs
return false;
}

public static <T> T[] grow(T[] array, int minSize) {
if (array.length < minSize) {
@SuppressWarnings("unchecked") final T[] newArray =
(T[]) Array.newInstance(array.getClass().getComponentType(), oversize(minSize, RamUsageEstimator.NUM_BYTES_OBJ_REF));
System.arraycopy(array, 0, newArray, 0, array.length);
return newArray;
} else
return array;
}

public static <T> T[] grow(T[] array) {
return grow(array, 1 + array.length);
}

public static <T> T[] shrink(T[] array, int targetSize) {
final int newSize = getShrinkSize(array.length, targetSize, RamUsageEstimator.NUM_BYTES_OBJ_REF);
if (newSize != array.length) {
@SuppressWarnings("unchecked") final T[] newArray =
(T[]) Array.newInstance(array.getClass().getComponentType(), newSize);
System.arraycopy(array, 0, newArray, 0, newSize);
return newArray;
} else
return array;
}

// Since Arrays.equals doesn't implement offsets for equals
/**
* See if two array slices are the same.
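For reference, a hedged usage sketch of the generic ArrayUtil.grow/shrink overloads added above; the array contents and sizes are illustrative, not from the commit.

// Sketch: the new generic overloads preserve existing elements and return the
// same array instance when no resize is needed.
import org.apache.lucene.util.ArrayUtil;

public class ArrayUtilGrowSketch {
  public static void main(String[] args) {
    String[] names = new String[2];
    names[0] = "a";
    names[1] = "b";

    // grow(T[], minSize): returns an oversized copy only if length < minSize.
    names = ArrayUtil.grow(names, 10);

    // grow(T[]): grows by at least one slot.
    names = ArrayUtil.grow(names);

    // shrink(T[], targetSize): may hand back a smaller copy once targetSize is
    // far enough below the current length (see getShrinkSize).
    names = ArrayUtil.shrink(names, 3);

    System.out.println("capacity now " + names.length);
  }
}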
41 changes: 40 additions & 1 deletion lucene/src/java/org/apache/lucene/util/IntsRef.java
@@ -21,7 +21,7 @@
* existing int[].
*
* @lucene.internal */
public final class IntsRef {
public final class IntsRef implements Comparable<IntsRef> {

public int[] ints;
public int offset;
@@ -81,6 +81,31 @@ public boolean intsEquals(IntsRef other) {
}
}

/** Signed int order comparison */
public int compareTo(IntsRef other) {
if (this == other) return 0;

final int[] aInts = this.ints;
int aUpto = this.offset;
final int[] bInts = other.ints;
int bUpto = other.offset;

final int aStop = aUpto + Math.min(this.length, other.length);

while(aUpto < aStop) {
int aInt = aInts[aUpto++];
int bInt = bInts[bUpto++];
if (aInt > bInt) {
return 1;
} else if (aInt < bInt) {
return -1;
}
}

// One is a prefix of the other, or, they are equal:
return this.length - other.length;
}

public void copy(IntsRef other) {
if (ints == null) {
ints = new int[other.length];
@@ -97,4 +122,18 @@ public void grow(int newLength) {
ints = ArrayUtil.grow(ints, newLength);
}
}

public String toString() {
StringBuilder sb = new StringBuilder();
sb.append('[');
final int end = offset + length;
for(int i=offset;i<end;i++) {
if (i > offset) {
sb.append(' ');
}
sb.append(Integer.toHexString(ints[i]));
}
sb.append(']');
return sb.toString();
}
}
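A small, hedged sketch of the IntsRef.compareTo and toString additions above: signed per-int comparison over the valid slice, with a proper prefix sorting before the longer sequence. It assumes IntsRef has a no-argument constructor and that ints/offset/length are public fields, as the rest of the class suggests; the values are illustrative.

// Sketch (assumptions noted above): exercise the new compareTo and toString.
import org.apache.lucene.util.IntsRef;

public class IntsRefCompareSketch {
  public static void main(String[] args) {
    final IntsRef a = new IntsRef();
    a.ints = new int[] {1, 2};
    a.length = 2;
    final IntsRef b = new IntsRef();
    b.ints = new int[] {1, 2, 3};
    b.length = 3;
    final IntsRef c = new IntsRef();
    c.ints = new int[] {1, 5};
    c.length = 2;

    System.out.println(a.compareTo(b)); // negative: a is a prefix of b (length 2 vs 3)
    System.out.println(a.compareTo(c)); // negative: 2 < 5 at the first differing position
    System.out.println(a);              // "[1 2]" (each element printed in hex)
  }
}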
@@ -93,13 +93,7 @@ public synchronized byte[] getByteBlock() {
@Override
public synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) {
final int numBlocks = Math.min(maxBufferedBlocks - freeBlocks, end - start);
final int size = freeBlocks + numBlocks;
if (size >= freeByteBlocks.length) {
final byte[][] newBlocks = new byte[ArrayUtil.oversize(size,
RamUsageEstimator.NUM_BYTES_OBJ_REF)][];
System.arraycopy(freeByteBlocks, 0, newBlocks, 0, freeBlocks);
freeByteBlocks = newBlocks;
}
freeByteBlocks = ArrayUtil.grow(freeByteBlocks, freeBlocks + numBlocks);
final int stop = start + numBlocks;
for (int i = start; i < stop; i++) {
freeByteBlocks[freeBlocks++] = blocks[i];
@@ -40,7 +40,6 @@
import java.util.Set;

import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.RamUsageEstimator;

/**
* Finite-state automaton with regular expression operations.
@@ -281,9 +280,7 @@ public State[] getNumberedStates() {
worklist.add(t.to);
t.to.number = upto;
if (upto == numberedStates.length) {
final State[] newArray = new State[ArrayUtil.oversize(1+upto, RamUsageEstimator.NUM_BYTES_OBJ_REF)];
System.arraycopy(numberedStates, 0, newArray, 0, upto);
numberedStates = newArray;
numberedStates = ArrayUtil.grow(numberedStates);
}
numberedStates[upto] = t.to;
upto++;