Skip to content

Commit

Permalink
Improve halfbyte transposition performance, marginally improving bbq …
Browse files Browse the repository at this point in the history
…performance (elastic#117350) (elastic#118293)

The transposition of the bits in half-byte queries for BBQ is pretty
convoluted and slow. This commit greatly simplifies & improves
performance for this small part of bbq queries and indexing.

Here are the results of a small JMH benchmark for this particular
function.

```
TransposeBinBenchmark.transposeBinNew     1024  thrpt    5  857.779 ± 44.031  ops/ms
TransposeBinBenchmark.transposeBinOrig    1024  thrpt    5   94.950 ±  2.898  ops/ms
```

While this is a huge improvement for this small function, the impact at
query and index time is only marginal. But, the code simplification
itself is enough to warrant this change in my opinion.

(cherry picked from commit e90eb7a)
  • Loading branch information
benwtrent authored Dec 9, 2024
1 parent afd8d84 commit 67332f8
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 49 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/117350.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 117350
summary: "Improve halfbyte transposition performance, marginally improving bbq performance"
area: Vector Search
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -23,56 +23,38 @@
public class BQSpaceUtils {

public static final short B_QUERY = 4;
// the first four bits masked
private static final int B_QUERY_MASK = 15;

/**
* Copied from Lucene, replace with Lucene's implementation sometime after Lucene 10
* Transpose the query vector into a byte array allowing for efficient bitwise operations with the
* index bit vectors. The idea here is to organize the query vector bits such that the first bit
* of every dimension is in the first set of dimensions bits, or (dimensions/8) bytes. The second,
* third, and fourth bits are in the second, third, and fourth sets of dimensions bits,
* respectively. This allows for direct bitwise comparisons with the stored index vectors through
* summing the bitwise results with the relative required bit shifts.
*
* @param q the query vector, assumed to be half-byte quantized with values between 0 and 15
* @param dimensions the number of dimensions in the query vector
* @param quantQueryByte the byte array to store the transposed query vector
*/
// Legacy bit-plane transposition: consumes the query in blocks of 32 values and writes four
// bit planes, each (dimensions / 8) bytes long, into quantQueryByte.
// Every block reads 32 entries of q starting at qOffset regardless of how many dimensions remain,
// so q must be at least `dimensions` rounded up to a multiple of 32 entries long.
public static void transposeBin(byte[] q, int dimensions, byte[] quantQueryByte) {
// TODO: rewrite this in Panama Vector API
// read position in q; advances by 32 in lock-step with i
int qOffset = 0;
// scratch: 4 packed bytes (32 bits) produced per bit plane of the current block
final byte[] v1 = new byte[4];
// scratch: working copy of the current 32 query values
final byte[] v = new byte[32];
for (int i = 0; i < dimensions; i += 32) {
// for every four bytes we shift left (with remainder across those bytes)
// i.e. each byte has its two nibbles swapped, which moves the 4-bit value (0..15) into the
// high nibble so that its most significant bit lands in bit 7 of the byte
for (int j = 0; j < v.length; j += 4) {
v[j] = (byte) (q[qOffset + j] << B_QUERY | ((q[qOffset + j] >>> B_QUERY) & B_QUERY_MASK));
v[j + 1] = (byte) (q[qOffset + j + 1] << B_QUERY | ((q[qOffset + j + 1] >>> B_QUERY) & B_QUERY_MASK));
v[j + 2] = (byte) (q[qOffset + j + 2] << B_QUERY | ((q[qOffset + j + 2] >>> B_QUERY) & B_QUERY_MASK));
v[j + 3] = (byte) (q[qOffset + j + 3] << B_QUERY | ((q[qOffset + j + 3] >>> B_QUERY) & B_QUERY_MASK));
}
// one pass per bit of the half-byte, starting with the most significant bit
for (int j = 0; j < B_QUERY; j++) {
// NOTE(review): the helper is only partially visible here; it appears to pack bit 7 of each
// of the 32 bytes in v into the 4 bytes of v1, OR-ing into v1 - confirm against its full body
moveMaskEpi8Byte(v, v1);
for (int k = 0; k < 4; k++) {
// pass j fills plane (B_QUERY - j - 1); i / 8 is the byte offset of this block within the plane
quantQueryByte[(B_QUERY - j - 1) * (dimensions / 8) + i / 8 + k] = v1[k];
// clear the scratch byte so the next pass starts from zero
v1[k] = 0;
}
// double every byte (a left shift by one) so the next lower bit moves into bit 7
for (int k = 0; k < v.length; k += 4) {
v[k] = (byte) (v[k] + v[k]);
v[k + 1] = (byte) (v[k + 1] + v[k + 1]);
v[k + 2] = (byte) (v[k + 2] + v[k + 2]);
v[k + 3] = (byte) (v[k + 3] + v[k + 3]);
}
}
qOffset += 32;
}
}

private static void moveMaskEpi8Byte(byte[] v, byte[] v1b) {
int m = 0;
for (int k = 0; k < v.length; k++) {
if ((v[k] & 0b10000000) == 0b10000000) {
v1b[m] |= 0b00000001;
}
if (k % 8 == 7) {
m++;
} else {
v1b[m] <<= 1;
/**
 * Transposes a half-byte quantized query vector into four bit planes stored back to back in
 * {@code quantQueryByte}.
 *
 * <p>The input is consumed in groups of up to eight values. Within a group, the first value
 * contributes the most significant bit of the packed byte and the eighth value the least
 * significant one; a trailing group with fewer than eight values leaves its low bits at zero.
 * For group number {@code g}, bit 0 of the values is written at {@code g}, bit 1 at
 * {@code g + length / 4}, bit 2 at {@code g + length / 2} and bit 3 at
 * {@code g + 3 * length / 4}, where {@code length} is {@code quantQueryByte.length}.
 *
 * <p>Every value of {@code q} is range-checked when assertions are enabled.
 *
 * @param q the query vector, each entry a half-byte value between 0 and 15 (inclusive)
 * @param quantQueryByte destination for the transposed bits; for the four planes not to overlap
 *     or overflow, its length should be a multiple of 4 and at least
 *     {@code 4 * ceil(q.length / 8.0)}
 */
public static void transposeHalfByte(byte[] q, byte[] quantQueryByte) {
    for (int i = 0; i < q.length;) {
        int lowerByte = 0;
        int lowerMiddleByte = 0;
        int upperMiddleByte = 0;
        int upperByte = 0;
        // Pack up to eight consecutive values; j is the target bit position, filled MSB first.
        for (int j = 7; j >= 0 && i < q.length; j--) {
            final int value = q[i];
            // Validate every element, not only the first one of each group of eight.
            assert value >= 0 && value <= 15 : "half-byte value out of range at index " + i + ": " + value;
            lowerByte |= (value & 1) << j;
            lowerMiddleByte |= ((value >> 1) & 1) << j;
            upperMiddleByte |= ((value >> 2) & 1) << j;
            upperByte |= ((value >> 3) & 1) << j;
            i++;
        }
        // i now points one past the group just consumed; round up to get the group's byte index.
        int index = ((i + 7) / 8) - 1;
        quantQueryByte[index] = (byte) lowerByte;
        quantQueryByte[index + quantQueryByte.length / 4] = (byte) lowerMiddleByte;
        quantQueryByte[index + quantQueryByte.length / 2] = (byte) upperMiddleByte;
        quantQueryByte[index + 3 * quantQueryByte.length / 4] = (byte) upperByte;
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -225,9 +225,7 @@ public QueryAndIndexResults quantizeQueryAndIndex(float[] vector, byte[] indexDe

// q¯ = Δ · q¯𝑢 + 𝑣𝑙 · 1𝐷
// q¯ is an approximation of q′ (scalar quantized approximation)
// FIXME: vectors need to be padded but that's expensive; update transponseBin to deal
byteQuery = BQVectorUtils.pad(byteQuery, discretizedDimensions);
BQSpaceUtils.transposeBin(byteQuery, discretizedDimensions, queryDestination);
BQSpaceUtils.transposeHalfByte(byteQuery, queryDestination);
QueryFactors factors = new QueryFactors(quantResult.quantizedSum, distToC, lower, width, normVmC, vDotC);
final float[] indexCorrections;
if (similarityFunction == EUCLIDEAN) {
Expand Down Expand Up @@ -368,9 +366,7 @@ public QueryFactors quantizeForQuery(float[] vector, byte[] destination, float[]

// q¯ = Δ · q¯𝑢 + 𝑣𝑙 · 1𝐷
// q¯ is an approximation of q′ (scalar quantized approximation)
// FIXME: vectors need to be padded but that's expensive; update transponseBin to deal
byteQuery = BQVectorUtils.pad(byteQuery, discretizedDimensions);
BQSpaceUtils.transposeBin(byteQuery, discretizedDimensions, destination);
BQSpaceUtils.transposeHalfByte(byteQuery, destination);

QueryFactors factors;
if (similarityFunction != EUCLIDEAN) {
Expand Down

0 comments on commit 67332f8

Please sign in to comment.