Skip to content

Commit

Permalink
Backport: Move group-varint encoding/decoding logic to DataOutput/Dat…
Browse files Browse the repository at this point in the history
…aInput (#12841)
  • Loading branch information
uschindler committed Dec 29, 2023
1 parent 86573e5 commit 41e2993
Show file tree
Hide file tree
Showing 16 changed files with 371 additions and 180 deletions.

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ static void readVIntBlock(
boolean indexHasFreq,
boolean decodeFreq)
throws IOException {
GroupVIntReader.readValues(docIn, docBuffer, num);
docIn.readGroupVInts(docBuffer, num);
if (indexHasFreq && decodeFreq) {
for (int i = 0; i < num; ++i) {
freqBuffer[i] = docBuffer[i] & 0x01;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ public final class Lucene99PostingsWriter extends PushPostingsWriterBase {
private final PForUtil pforUtil;
private final ForDeltaUtil forDeltaUtil;
private final Lucene99SkipWriter skipWriter;
private final GroupVIntWriter docGroupVIntWriter;

private boolean fieldHasNorms;
private NumericDocValues norms;
Expand Down Expand Up @@ -173,7 +172,6 @@ public Lucene99PostingsWriter(SegmentWriteState state) throws IOException {
skipWriter =
new Lucene99SkipWriter(
MAX_SKIP_LEVELS, BLOCK_SIZE, state.segmentInfo.maxDoc(), docOut, posOut, payOut);
docGroupVIntWriter = new GroupVIntWriter();
}

@Override
Expand Down Expand Up @@ -378,7 +376,7 @@ public void finishTerm(BlockTermState _state) throws IOException {
docDeltaBuffer[i] = (docDeltaBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0);
}
}
docGroupVIntWriter.writeValues(docOut, docDeltaBuffer, docBufferUpto);
docOut.writeGroupVInts(docDeltaBuffer, docBufferUpto);
if (writeFreqs) {
for (int i = 0; i < docBufferUpto; i++) {
final int freq = (int) freqBuffer[i];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import org.apache.lucene.util.GroupVIntUtil;

/** Base implementation class for buffered {@link IndexInput}. */
public abstract class BufferedIndexInput extends IndexInput implements RandomAccessInput {
Expand Down Expand Up @@ -149,6 +150,16 @@ public final int readInt() throws IOException {
}
}

/**
 * Reads one group-varint group into {@code dst} at {@code offset}, handing {@link
 * GroupVIntUtil} the bytes remaining in {@code buffer}, the buffer's current position and an
 * absolute-index int reader over {@code buffer}.
 *
 * @param dst the array to decode into
 * @param offset the index in {@code dst} passed to the decoder
 */
@Override
protected void readGroupVInt(long[] dst, int offset) throws IOException {
  final int available = buffer.remaining();
  final int start = buffer.position();
  final int consumed =
      GroupVIntUtil.readGroupVInt(
          this, available, p -> buffer.getInt((int) p), start, dst, offset);
  if (consumed <= 0) {
    // NOTE(review): presumably a non-positive result means the helper already consumed the bytes
    // through this input's own read methods - confirm against GroupVIntUtil.
    return;
  }
  // The position is re-read on purpose rather than reusing `start`: the call above receives
  // `this` and may have moved the buffer.
  buffer.position(buffer.position() + consumed);
}

@Override
public final long readLong() throws IOException {
if (Long.BYTES <= buffer.remaining()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import java.util.Locale;
import java.util.stream.Collectors;
import org.apache.lucene.util.Accountable;
import org.apache.lucene.util.GroupVIntUtil;
import org.apache.lucene.util.RamUsageEstimator;

/**
Expand Down Expand Up @@ -212,6 +213,25 @@ public long readLong() throws IOException {
}
}

/**
 * Reads one group-varint group into {@code dst} at {@code offset}, decoding from the block that
 * contains the current {@code pos}, and then advances {@code pos} by the length the decoder
 * returns.
 *
 * @param dst the array to decode into
 * @param offset the index in {@code dst} passed to the decoder
 */
@Override
protected void readGroupVInt(long[] dst, int offset) throws IOException {
  final ByteBuffer currentBlock = blocks[blockIndex(pos)];
  final int startInBlock = blockOffset(pos);
  final int bytesLeftInBlock = currentBlock.limit() - startInBlock;
  // The returned length MUST be stored in a local first; do not write
  // `pos += GroupVIntUtil.readGroupVInt(...)`. Java evaluates the left-hand value of `pos +=`
  // before invoking the call, but the call may itself advance pos via readByte(). The compound
  // form would then add the returned length to a stale pos and lose a byte.
  final int consumed =
      GroupVIntUtil.readGroupVInt(
          this,
          bytesLeftInBlock,
          p -> currentBlock.getInt((int) p),
          startInBlock,
          dst,
          offset);
  pos += consumed;
}

@Override
public long length() {
return length;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,12 @@ public void readLongs(long[] dst, int offset, int length) throws IOException {
in.readLongs(dst, offset, length);
}

/**
 * Reads one group-varint group by forwarding the call unchanged to the wrapped input {@code in},
 * after first calling {@code ensureOpen()}.
 *
 * @param dst the array handed through to the delegate
 * @param offset the index in {@code dst} handed through to the delegate
 * @throws IOException propagated from the delegate
 */
@Override
protected void readGroupVInt(long[] dst, int offset) throws IOException {
// NOTE(review): ensureOpen() presumably rejects use after close - confirm against its definition.
ensureOpen();
in.readGroupVInt(dst, offset);
}

@Override
public IndexInput clone() {
ensureOpen();
Expand Down
27 changes: 27 additions & 0 deletions lucene/core/src/java/org/apache/lucene/store/DataInput.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.GroupVIntUtil;

/**
* Abstract base class for performing read operations of Lucene's low-level data types.
Expand Down Expand Up @@ -98,6 +99,32 @@ public int readInt() throws IOException {
return ((b4 & 0xFF) << 24) | ((b3 & 0xFF) << 16) | ((b2 & 0xFF) << 8) | (b1 & 0xFF);
}

/**
 * Reads {@code limit} values into {@code dst}, starting at index 0. Complete groups of four are
 * decoded with {@link #readGroupVInt(long[], int)}; the remaining tail (fewer than four values)
 * is read one value at a time with {@link #readVInt()}. A {@code long[]} is used because that is
 * what postings use; every value is actually required to be an integer.
 *
 * @param dst the array to read ints into.
 * @param limit the number of int values to read.
 * @lucene.experimental
 */
public final void readGroupVInts(long[] dst, int limit) throws IOException {
  int next = 0;

  // full groups of four values
  while (next <= limit - 4) {
    readGroupVInt(dst, next);
    next += 4;
  }

  // tail vints
  while (next < limit) {
    dst[next] = readVInt();
    next++;
  }
}

/**
 * Reads one group-varint group into {@code dst} at {@code offset}. This default implementation
 * simply delegates to {@code GroupVIntUtil.readGroupVInt(this, dst, offset)}.
 *
 * <p>Override if you have an efficient implementation. In general this is when the input supports
 * random access.
 *
 * @param dst the array to decode into
 * @param offset the index in {@code dst} passed to the decoder
 */
protected void readGroupVInt(long[] dst, int offset) throws IOException {
GroupVIntUtil.readGroupVInt(this, dst, offset);
}

/**
* Reads an int stored in variable-length format. Reads between one and five bytes. Smaller values
* take fewer bytes. Negative numbers are supported, but should be avoided.
Expand Down
41 changes: 41 additions & 0 deletions lucene/core/src/java/org/apache/lucene/store/DataOutput.java
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.Set;
import org.apache.lucene.util.BitUtil;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

/**
* Abstract base class for performing write operations of Lucene's low-level data types.
Expand All @@ -29,6 +30,7 @@
* internal state like file position).
*/
public abstract class DataOutput {
private final BytesRefBuilder groupVIntBytes = new BytesRefBuilder();

/**
* Writes a single byte.
Expand Down Expand Up @@ -322,4 +324,43 @@ public void writeSetOfStrings(Set<String> set) throws IOException {
writeString(value);
}
}

/**
 * Writes the first {@code limit} entries of {@code values} using group-varint encoding. Values
 * are taken four at a time; each group is emitted as one flag byte followed by the bytes of its
 * four values. The flag byte holds, for each value, its byte count minus one in two bits, with
 * the first value in the top two bits. Trailing values that do not fill a group are written with
 * {@link DataOutput#writeVInt VInt}. A {@code long[]} is accepted because that is what postings
 * use; every value is converted with {@link Math#toIntExact(long)} and so must be an integer.
 *
 * @param values the values to write
 * @param limit the number of values to write.
 * @lucene.experimental
 */
public void writeGroupVInts(long[] values, int limit) throws IOException {
  int next = 0;

  // full groups of four values
  while (limit - next >= 4) {
    // Index 0 of the scratch buffer is reserved for the flag byte, which is filled in last.
    groupVIntBytes.setLength(1);
    int flagBits = 0;
    for (int shift = 6; shift >= 0; shift -= 2) {
      final int byteCount = encodeGroupValue(Math.toIntExact(values[next++]));
      flagBits |= (byteCount - 1) << shift;
    }
    groupVIntBytes.setByteAt(0, (byte) flagBits);
    writeBytes(groupVIntBytes.bytes(), groupVIntBytes.length());
  }

  // tail vints
  while (next < limit) {
    writeVInt(Math.toIntExact(values[next]));
    next++;
  }
}

/**
 * Appends {@code v} to {@code groupVIntBytes}, lowest byte first, stopping once the remaining
 * high bits are all zero. At least one byte is always appended.
 *
 * @param v the value to append
 * @return the number of bytes appended
 */
private int encodeGroupValue(int v) {
  final int start = groupVIntBytes.length();
  // The lowest byte is written unconditionally, so zero still takes one byte.
  groupVIntBytes.append((byte) v);
  for (int rest = v >>> 8; rest != 0; rest >>>= 8) {
    groupVIntBytes.append((byte) rest);
  }
  return groupVIntBytes.length() - start;
}
}
Loading

0 comments on commit 41e2993

Please sign in to comment.