diff --git a/.dir-locals.el b/.dir-locals.el
new file mode 100644
index 000000000000..c51e1232603b
--- /dev/null
+++ b/.dir-locals.el
@@ -0,0 +1,3 @@
+;; set up Lucene style for emacs
+((java-mode . ((c-basic-offset . 2))))
+
diff --git a/.gitignore b/.gitignore
index 319761963836..2e61830567ff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,7 +7,8 @@ build
dist
lib
test-lib
-/*~
+*~
+.#*
/build.properties
/.idea
lucene/**/*.iml
diff --git a/gradle/documentation/render-javadoc.gradle b/gradle/documentation/render-javadoc.gradle
index 914fd8be58b2..d6acaa7497fa 100644
--- a/gradle/documentation/render-javadoc.gradle
+++ b/gradle/documentation/render-javadoc.gradle
@@ -157,7 +157,8 @@ configure(project(":lucene:backward-codecs")) {
"org.apache.lucene.codecs.lucene60",
"org.apache.lucene.codecs.lucene80",
"org.apache.lucene.codecs.lucene84",
- "org.apache.lucene.codecs.lucene86"
+ "org.apache.lucene.codecs.lucene86",
+ "org.apache.lucene.codecs.lucene87"
]
}
}
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene80/Lucene80Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene80/Lucene80Codec.java
index bc4e5f360d8e..bfb51df4c5cd 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene80/Lucene80Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene80/Lucene80Codec.java
@@ -20,6 +20,7 @@
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
+import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
@@ -128,4 +129,9 @@ public final DocValuesFormat docValuesFormat() {
public final NormsFormat normsFormat() {
return normsFormat;
}
+
+ @Override
+ public final VectorFormat vectorFormat() {
+ return VectorFormat.EMPTY;
+ }
}
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java
index 90918c163d2f..46c8372494a7 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java
@@ -23,6 +23,7 @@
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
+import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
@@ -136,6 +137,11 @@ public PointsFormat pointsFormat() {
return new Lucene60PointsFormat();
}
+ @Override
+ public VectorFormat vectorFormat() {
+ return VectorFormat.EMPTY;
+ }
+
/** Returns the postings format that should be used for writing
* new segments of field
.
*
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene86/Lucene86Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene86/Lucene86Codec.java
index e2974655e75d..8ca5bb65afbf 100644
--- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene86/Lucene86Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene86/Lucene86Codec.java
@@ -24,6 +24,7 @@
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
+import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
@@ -136,6 +137,11 @@ public final PointsFormat pointsFormat() {
return pointsFormat;
}
+ @Override
+ public final VectorFormat vectorFormat() {
+ return VectorFormat.EMPTY;
+ }
+
/** Returns the postings format that should be used for writing
* new segments of field
.
*
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene87/Lucene87Codec.java
similarity index 97%
rename from lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87Codec.java
rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene87/Lucene87Codec.java
index 5ff407384e22..813ae92c33d4 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87Codec.java
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene87/Lucene87Codec.java
@@ -31,6 +31,7 @@
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
@@ -137,6 +138,9 @@ public final PointsFormat pointsFormat() {
return pointsFormat;
}
+ @Override
+ public final VectorFormat vectorFormat() { return VectorFormat.EMPTY; }
+
/** Returns the postings format that should be used for writing
* new segments of field
.
*
diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene87/package.html b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene87/package.html
new file mode 100644
index 000000000000..3474ef9e52f7
--- /dev/null
+++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene87/package.html
@@ -0,0 +1,42 @@
+
+
+
+
+
+
+
+
+
+Lucene 8.7 file format.
+
+
diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
index d6732336efd5..21452ff60b49 100644
--- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -16,3 +16,4 @@
org.apache.lucene.codecs.lucene80.Lucene80Codec
org.apache.lucene.codecs.lucene84.Lucene84Codec
org.apache.lucene.codecs.lucene86.Lucene86Codec
+org.apache.lucene.codecs.lucene87.Lucene87Codec
\ No newline at end of file
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
index e44b046aa294..b973cf42aff9 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java
@@ -29,7 +29,7 @@
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene87.Lucene87Codec;
+import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexDeletionPolicy;
@@ -138,7 +138,7 @@ public static IndexWriterConfig createWriterConfig(Config config, PerfRunData ru
if (defaultCodec == null && postingsFormat != null) {
try {
final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat);
- iwConf.setCodec(new Lucene87Codec() {
+ iwConf.setCodec(new Lucene90Codec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return postingsFormatChosen;
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCodec.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCodec.java
index 109fec980ca9..266e0d296548 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCodec.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCodec.java
@@ -21,6 +21,7 @@
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
+import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
@@ -46,6 +47,7 @@ public final class SimpleTextCodec extends Codec {
private final DocValuesFormat dvFormat = new SimpleTextDocValuesFormat();
private final CompoundFormat compoundFormat = new SimpleTextCompoundFormat();
private final PointsFormat pointsFormat = new SimpleTextPointsFormat();
+ private final VectorFormat vectorFormat = new SimpleTextVectorFormat();
public SimpleTextCodec() {
super("SimpleText");
@@ -100,4 +102,9 @@ public CompoundFormat compoundFormat() {
public PointsFormat pointsFormat() {
return pointsFormat;
}
+
+ @Override
+ public VectorFormat vectorFormat() {
+ return vectorFormat;
+ }
}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java
index 8d178130a91c..e68a7e502540 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java
@@ -30,6 +30,7 @@
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
@@ -67,6 +68,8 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat {
static final BytesRef DATA_DIM_COUNT = new BytesRef(" data dimensional count ");
static final BytesRef INDEX_DIM_COUNT = new BytesRef(" index dimensional count ");
static final BytesRef DIM_NUM_BYTES = new BytesRef(" dimensional num bytes ");
+ static final BytesRef VECTOR_NUM_DIMS = new BytesRef(" vector number of dimensions ");
+ static final BytesRef VECTOR_SCORE_FUNC = new BytesRef(" vector score function ");
static final BytesRef SOFT_DELETES = new BytesRef(" soft-deletes ");
@Override
@@ -146,13 +149,23 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm
assert StringHelper.startsWith(scratch.get(), DIM_NUM_BYTES);
int dimensionalNumBytes = Integer.parseInt(readString(DIM_NUM_BYTES.length, scratch));
+ SimpleTextUtil.readLine(input, scratch);
+ assert StringHelper.startsWith(scratch.get(), VECTOR_NUM_DIMS);
+ int vectorNumDimensions = Integer.parseInt(readString(VECTOR_NUM_DIMS.length, scratch));
+
+ SimpleTextUtil.readLine(input, scratch);
+ assert StringHelper.startsWith(scratch.get(), VECTOR_SCORE_FUNC);
+ String scoreFunction = readString(VECTOR_SCORE_FUNC.length, scratch);
+ VectorValues.ScoreFunction vectorDistFunc = distanceFunction(scoreFunction);
+
SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SOFT_DELETES);
boolean isSoftDeletesField = Boolean.parseBoolean(readString(SOFT_DELETES.length, scratch));
infos[i] = new FieldInfo(name, fieldNumber, storeTermVector,
omitNorms, storePayloads, indexOptions, docValuesType, dvGen, Collections.unmodifiableMap(atts),
- dimensionalCount, indexDimensionalCount, dimensionalNumBytes, isSoftDeletesField);
+ dimensionalCount, indexDimensionalCount, dimensionalNumBytes,
+ vectorNumDimensions, vectorDistFunc, isSoftDeletesField);
}
SimpleTextUtil.checkFooter(input);
@@ -172,6 +185,10 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm
public DocValuesType docValuesType(String dvType) {
return DocValuesType.valueOf(dvType);
}
+
+ public VectorValues.ScoreFunction distanceFunction(String scoreFunction) {
+ return VectorValues.ScoreFunction.valueOf(scoreFunction);
+ }
private String readString(int offset, BytesRefBuilder scratch) {
return new String(scratch.bytes(), offset, scratch.length()-offset, StandardCharsets.UTF_8);
@@ -253,6 +270,14 @@ public void write(Directory directory, SegmentInfo segmentInfo, String segmentSu
SimpleTextUtil.write(out, Integer.toString(fi.getPointNumBytes()), scratch);
SimpleTextUtil.writeNewline(out);
+ SimpleTextUtil.write(out, VECTOR_NUM_DIMS);
+ SimpleTextUtil.write(out, Integer.toString(fi.getVectorDimension()), scratch);
+ SimpleTextUtil.writeNewline(out);
+
+ SimpleTextUtil.write(out, VECTOR_SCORE_FUNC);
+ SimpleTextUtil.write(out, fi.getVectorScoreFunction().name(), scratch);
+ SimpleTextUtil.writeNewline(out);
+
SimpleTextUtil.write(out, SOFT_DELETES);
SimpleTextUtil.write(out, Boolean.toString(fi.isSoftDeletesField()), scratch);
SimpleTextUtil.writeNewline(out);
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorFormat.java
new file mode 100644
index 000000000000..3d4b5fedc38c
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorFormat.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.simpletext;
+
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.VectorFormat;
+import org.apache.lucene.codecs.VectorReader;
+import org.apache.lucene.codecs.VectorWriter;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+
+/** For debugging, curiosity, transparency only!! Do not use this codec in production.
+ *
+ * This codec stores all data in a single human-readable text file (_N.vec). You can view this in
+ * any text editor, and even edit it to alter your index.
+ *
+ * @lucene.experimental */
+public final class SimpleTextVectorFormat extends VectorFormat {
+
+ @Override
+ public VectorWriter fieldsWriter(SegmentWriteState state) throws IOException {
+ return new SimpleTextVectorWriter(state);
+ }
+
+ @Override
+ public VectorReader fieldsReader(SegmentReadState state) throws IOException {
+ return new SimpleTextVectorReader(state);
+ }
+
+ /** Extension of the vector data file */
+ static final String VECTOR_EXTENSION = "vec";
+
+ /** Extension of the vector metadata file */
+ static final String META_EXTENSION = "gri";
+}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorReader.java
new file mode 100644
index 000000000000..11494c97ed97
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorReader.java
@@ -0,0 +1,304 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.simpletext;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.codecs.VectorReader;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.VectorValues;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.BufferedChecksumIndexInput;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.StringHelper;
+
+import static org.apache.lucene.codecs.simpletext.SimpleTextVectorWriter.*;
+
+/**
+ * Reads vector values from a simple text format. All vectors are read up front and cached in RAM in order to support
+ * random access.
+ * FOR RECREATIONAL USE ONLY
+ * @lucene.experimental
+ */
+public class SimpleTextVectorReader extends VectorReader {
+
+ private static final BytesRef EMPTY = new BytesRef("");
+
+ private final SegmentReadState readState;
+ private final IndexInput dataIn;
+ private final BytesRefBuilder scratch = new BytesRefBuilder();
+ private final Map fieldEntries = new HashMap<>();
+
+ SimpleTextVectorReader(SegmentReadState readState) throws IOException {
+ this.readState = readState;
+ String metaFileName = IndexFileNames.segmentFileName(readState.segmentInfo.name, readState.segmentSuffix, SimpleTextVectorFormat.META_EXTENSION);
+ try (ChecksumIndexInput in = readState.directory.openChecksumInput(metaFileName, IOContext.DEFAULT)) {
+ int fieldNumber = readInt(in, FIELD_NUMBER);
+ while (fieldNumber != -1) {
+ String fieldName = readString(in, FIELD_NAME);
+ String scoreFunctionName = readString(in, SCORE_FUNCTION);
+ VectorValues.ScoreFunction scoreFunction = VectorValues.ScoreFunction.valueOf(scoreFunctionName);
+ long vectorDataOffset = readLong(in, VECTOR_DATA_OFFSET);
+ long vectorDataLength = readLong(in, VECTOR_DATA_LENGTH);
+ int dimension = readInt(in, VECTOR_DIMENSION);
+ int size = readInt(in, SIZE);
+ int[] docIds = new int[size];
+ for (int i = 0; i < size; i++) {
+ docIds[i] = readInt(in, EMPTY);
+ }
+ assert fieldEntries.containsKey(fieldName) == false;
+ fieldEntries.put(fieldName, new FieldEntry(dimension, scoreFunction, vectorDataOffset, vectorDataLength, docIds));
+ fieldNumber = readInt(in, FIELD_NUMBER);
+ }
+ SimpleTextUtil.checkFooter(in);
+ }
+
+ String vectorFileName = IndexFileNames.segmentFileName(readState.segmentInfo.name, readState.segmentSuffix, SimpleTextVectorFormat.VECTOR_EXTENSION);
+ dataIn = readState.directory.openInput(vectorFileName, IOContext.DEFAULT);
+ }
+
+ @Override
+ public VectorValues getVectorValues(String field) throws IOException {
+ FieldInfo info = readState.fieldInfos.fieldInfo(field);
+ if (info == null) {
+ throw new IllegalStateException("No vectors indexed for field=\"" + field + "\"");
+ }
+ int dimension = info.getVectorDimension();
+ if (dimension == 0) {
+ return VectorValues.EMPTY;
+ }
+ FieldEntry fieldEntry = fieldEntries.get(field);
+ if (fieldEntry == null) {
+ throw new IllegalStateException("No entry found for vector field=\"" + field + "\"");
+ }
+ if (dimension != fieldEntry.dimension) {
+ throw new IllegalStateException("Inconsistent vector dimension for field=\"" + field + "\"; " + dimension + " != " + fieldEntry.dimension);
+ }
+ IndexInput bytesSlice = dataIn.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);
+ return new SimpleTextVectorValues(fieldEntry, bytesSlice);
+ }
+
+ @Override
+ public void checkIntegrity() throws IOException {
+ IndexInput clone = dataIn.clone();
+ clone.seek(0);
+
+ // checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included in SimpleTextUtil.CHECKSUM):
+ long footerStartPos = dataIn.length() - (SimpleTextUtil.CHECKSUM.length + 21);
+ ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
+ while (true) {
+ SimpleTextUtil.readLine(input, scratch);
+ if (input.getFilePointer() >= footerStartPos) {
+ // Make sure we landed at precisely the right location:
+ if (input.getFilePointer() != footerStartPos) {
+ throw new CorruptIndexException("SimpleText failure: footer does not start at expected position current=" + input.getFilePointer() + " vs expected=" + footerStartPos, input);
+ }
+ SimpleTextUtil.checkFooter(input);
+ break;
+ }
+ }
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return 0;
+ }
+
+ @Override
+ public void close() throws IOException {
+ dataIn.close();
+ }
+
+ private static class FieldEntry {
+
+ final int dimension;
+ final VectorValues.ScoreFunction scoreFunction;
+
+ final long vectorDataOffset;
+ final long vectorDataLength;
+ final int[] ordToDoc;
+
+ FieldEntry(int dimension, VectorValues.ScoreFunction scoreFunction,
+ long vectorDataOffset, long vectorDataLength, int[] ordToDoc) {
+ this.dimension = dimension;
+ this.scoreFunction = scoreFunction;
+ this.vectorDataOffset = vectorDataOffset;
+ this.vectorDataLength = vectorDataLength;
+ this.ordToDoc = ordToDoc;
+ }
+
+ int size() {
+ return ordToDoc.length;
+ }
+ }
+
+ private static class SimpleTextVectorValues extends VectorValues implements VectorValues.RandomAccess {
+
+ private final BytesRefBuilder scratch = new BytesRefBuilder();
+ private final FieldEntry entry;
+ private final IndexInput in;
+ private final BytesRef binaryValue;
+ private final float[][] values;
+
+ int curOrd;
+
+ SimpleTextVectorValues(FieldEntry entry, IndexInput in) throws IOException {
+ this.entry = entry;
+ this.in = in;
+ values = new float[entry.size()][entry.dimension];
+ binaryValue = new BytesRef(entry.dimension * Float.BYTES);
+ binaryValue.length = binaryValue.bytes.length;
+ curOrd = -1;
+ readAllVectors();
+ }
+
+ @Override
+ public int dimension() {
+ return entry.dimension;
+ }
+
+ @Override
+ public int size() {
+ return entry.size();
+ }
+
+ @Override
+ public ScoreFunction scoreFunction() {
+ return entry.scoreFunction;
+ }
+
+ @Override
+ public float[] vectorValue() {
+ return values[curOrd];
+ }
+
+ @Override
+ public BytesRef binaryValue() {
+ ByteBuffer.wrap(binaryValue.bytes).asFloatBuffer().get(values[curOrd]);
+ return binaryValue;
+ }
+
+ @Override
+ public RandomAccess randomAccess() {
+ return this;
+ }
+
+ @Override
+ public int docID() {
+ if (curOrd == -1) {
+ return -1;
+ }
+ return entry.ordToDoc[curOrd];
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ if (++curOrd < entry.size()) {
+ return docID();
+ }
+ return NO_MORE_DOCS;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ return slowAdvance(target);
+ }
+
+ @Override
+ public long cost() {
+ return size();
+ }
+
+ private void readAllVectors() throws IOException {
+ for (int i = 0; i < values.length; i++) {
+ readVector(values[i]);
+ }
+ }
+
+ private void readVector(float[] value) throws IOException {
+ SimpleTextUtil.readLine(in, scratch);
+ // skip leading " [" and strip trailing "]"
+ String s = new BytesRef(scratch.bytes(), 2, scratch.length() - 3).utf8ToString();
+ String[] floatStrings = s.split(",");
+ assert floatStrings.length == value.length : " read " + s + " when expecting " + value.length + " floats";
+ for (int i = 0; i < floatStrings.length; i++) {
+ value[i] = Float.parseFloat(floatStrings[i]);
+ }
+ }
+
+ @Override
+ public float[] vectorValue(int targetOrd) throws IOException {
+ return values[targetOrd];
+ }
+
+ @Override
+ public BytesRef binaryValue(int targetOrd) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public TopDocs search(float[] target, int k, int fanout) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ private int readInt(IndexInput in, BytesRef field) throws IOException {
+ SimpleTextUtil.readLine(in, scratch);
+ return parseInt(field);
+ }
+
+ private long readLong(IndexInput in, BytesRef field) throws IOException {
+ SimpleTextUtil.readLine(in, scratch);
+ return parseLong(field);
+ }
+
+ private String readString(IndexInput in, BytesRef field) throws IOException {
+ SimpleTextUtil.readLine(in, scratch);
+ return stripPrefix(field);
+ }
+
+ private boolean startsWith(BytesRef prefix) {
+ return StringHelper.startsWith(scratch.get(), prefix);
+ }
+
+ private int parseInt(BytesRef prefix) {
+ assert startsWith(prefix);
+ return Integer.parseInt(stripPrefix(prefix));
+ }
+
+ private long parseLong(BytesRef prefix) {
+ assert startsWith(prefix);
+ return Long.parseLong(stripPrefix(prefix));
+ }
+
+ private String stripPrefix(BytesRef prefix) {
+ int prefixLen = prefix.length;
+ return new String(scratch.bytes(), prefixLen, scratch.length() - prefixLen, StandardCharsets.UTF_8);
+ }
+}
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorWriter.java
new file mode 100644
index 000000000000..3f076cf4a0db
--- /dev/null
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorWriter.java
@@ -0,0 +1,148 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.simpletext;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.lucene.codecs.VectorWriter;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.VectorValues;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.IOUtils;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+/**
+ * Writes vector-valued fields in a plain text format
+ */
+public class SimpleTextVectorWriter extends VectorWriter {
+
+ static final BytesRef FIELD_NUMBER = new BytesRef("field-number ");
+ static final BytesRef FIELD_NAME = new BytesRef("field-name ");
+ static final BytesRef SCORE_FUNCTION = new BytesRef("score-function ");
+ static final BytesRef VECTOR_DATA_OFFSET = new BytesRef("vector-data-offset ");
+ static final BytesRef VECTOR_DATA_LENGTH = new BytesRef("vector-data-length ");
+ static final BytesRef VECTOR_DIMENSION = new BytesRef("vector-dimension ");
+ static final BytesRef SIZE = new BytesRef("size ");
+
+ private final IndexOutput meta, vectorData;
+ private final BytesRefBuilder scratch = new BytesRefBuilder();
+
+ SimpleTextVectorWriter(SegmentWriteState state) throws IOException {
+ assert state.fieldInfos.hasVectorValues();
+
+ String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, SimpleTextVectorFormat.META_EXTENSION);
+ meta = state.directory.createOutput(metaFileName, state.context);
+
+ String vectorDataFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, SimpleTextVectorFormat.VECTOR_EXTENSION);
+ vectorData = state.directory.createOutput(vectorDataFileName, state.context);
+ }
+
+ @Override
+ public void writeField(FieldInfo fieldInfo, VectorValues vectors) throws IOException {
+ long vectorDataOffset = vectorData.getFilePointer();
+ List docIds = new ArrayList<>();
+ int docV, ord = 0;
+ for (docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc(), ord++) {
+ writeVectorValue(vectors);
+ docIds.add(docV);
+ }
+ long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
+ writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds);
+ }
+
+ private void writeVectorValue(VectorValues vectors) throws IOException {
+ // write vector value
+ float[] value = vectors.vectorValue();
+ assert value.length == vectors.dimension();
+ write(vectorData, Arrays.toString(value));
+ newline(vectorData);
+ }
+
+ private void writeMeta(FieldInfo field, long vectorDataOffset, long vectorDataLength, List docIds) throws IOException {
+ writeField(meta, FIELD_NUMBER, field.number);
+ writeField(meta, FIELD_NAME, field.name);
+ writeField(meta, SCORE_FUNCTION, field.getVectorScoreFunction().name());
+ writeField(meta, VECTOR_DATA_OFFSET, vectorDataOffset);
+ writeField(meta, VECTOR_DATA_LENGTH, vectorDataLength);
+ writeField(meta, VECTOR_DIMENSION, field.getVectorDimension());
+ writeField(meta, SIZE, docIds.size());
+ for (Integer docId : docIds) {
+ writeInt(meta, docId);
+ newline(meta);
+ }
+ writeField(meta, FIELD_NUMBER, -1);
+ }
+
+ @Override
+ public void finish() throws IOException {
+ SimpleTextUtil.writeChecksum(meta, scratch);
+ SimpleTextUtil.writeChecksum(vectorData, scratch);
+ }
+
+ @Override
+ public void close() throws IOException {
+ IOUtils.close(vectorData, meta);
+ }
+
+ private void writeField(IndexOutput out, BytesRef fieldName, int value) throws IOException {
+ write(out, fieldName);
+ writeInt(out, value);
+ newline(out);
+ }
+
+ private void writeField(IndexOutput out, BytesRef fieldName, long value) throws IOException {
+ write(out, fieldName);
+ writeLong(out, value);
+ newline(out);
+ }
+
+ private void writeField(IndexOutput out, BytesRef fieldName, String value) throws IOException {
+ write(out, fieldName);
+ write(out, value);
+ newline(out);
+ }
+
+ private void write(IndexOutput out, String s) throws IOException {
+ SimpleTextUtil.write(out, s, scratch);
+ }
+
+ private void writeInt(IndexOutput out, int x) throws IOException {
+ SimpleTextUtil.write(out, Integer.toString(x), scratch);
+ }
+
+ private void writeLong(IndexOutput out, long x) throws IOException {
+ SimpleTextUtil.write(out, Long.toString(x), scratch);
+ }
+
+ private void write(IndexOutput out, BytesRef b) throws IOException {
+ SimpleTextUtil.write(out, b);
+ }
+
+ private void newline(IndexOutput out) throws IOException {
+ SimpleTextUtil.writeNewline(out);
+ }
+
+}
diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java
index e3e1d06f7f6f..f2117785522b 100644
--- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java
@@ -24,6 +24,7 @@
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.store.ByteBuffersIndexOutput;
import org.apache.lucene.util.BytesRef;
@@ -120,6 +121,8 @@ private static FieldInfo getMockFieldInfo(String fieldName, int number) {
0,
0,
0,
+ 0,
+ VectorValues.ScoreFunction.NONE,
true
);
}
diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java
index 5707fb4f6a03..c08bb5506935 100644
--- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java
@@ -42,6 +42,7 @@
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.Directory;
@@ -203,6 +204,8 @@ private static FieldInfo mockFieldInfo(String fieldName, int number) {
0,
0,
0,
+ 0,
+ VectorValues.ScoreFunction.NONE,
false
);
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
index 14fa7935f9f9..3a2bc3ff593f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java
@@ -56,8 +56,7 @@ static NamedSPILoader getLoader() {
return LOADER;
}
- // TODO: should we use this, or maybe a system property is better?
- static Codec defaultCodec = LOADER.lookup("Lucene87");
+ static Codec defaultCodec = LOADER.lookup("Lucene90");
}
private final String name;
@@ -110,6 +109,9 @@ public final String getName() {
/** Encodes/decodes points index */
public abstract PointsFormat pointsFormat();
+
+ /** Encodes/decodes numeric vector fields */
+ public abstract VectorFormat vectorFormat();
/** looks up a codec by name */
public static Codec forName(String name) {
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java b/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java
index 9abd8d4f3313..4a5e934f7247 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java
@@ -108,4 +108,9 @@ public CompoundFormat compoundFormat() {
public PointsFormat pointsFormat() {
return delegate.pointsFormat();
}
+
+ @Override
+ public VectorFormat vectorFormat() {
+ return delegate.vectorFormat();
+ }
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/VectorFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/VectorFormat.java
new file mode 100644
index 000000000000..a7a64e1f06c2
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/VectorFormat.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs;
+
+import java.io.IOException;
+
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.VectorValues;
+
+/**
+ * Encodes/decodes per-document vector and any associated indexing structures required to support nearest-neighbor search
+ */
+public abstract class VectorFormat {
+
+ /** Sole constructor */
+ protected VectorFormat() {}
+
+ /**
+ * Returns a {@link VectorWriter} to write the vectors to the index.
+ */
+ public abstract VectorWriter fieldsWriter(SegmentWriteState state) throws IOException;
+
+ /**
+ * Returns a {@link VectorReader} to read the vectors from the index.
+ */
+ public abstract VectorReader fieldsReader(SegmentReadState state) throws IOException;
+
+ /**
+ * EMPTY throws an exception when written. It acts as a sentinel indicating a Codec that does not support vectors.
+ */
+ public static final VectorFormat EMPTY = new VectorFormat() {
+ @Override
+ public VectorWriter fieldsWriter(SegmentWriteState state) {
+ throw new UnsupportedOperationException("Attempt to write EMPTY VectorValues: maybe you forgot to use codec=Lucene90");
+ }
+
+ @Override
+ public VectorReader fieldsReader(SegmentReadState state) {
+ return new VectorReader() {
+ @Override
+ public void checkIntegrity() {
+ }
+
+ @Override
+ public VectorValues getVectorValues(String field) {
+ return VectorValues.EMPTY;
+ }
+
+ @Override
+ public void close() throws IOException {
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return 0;
+ }
+ };
+ }
+ };
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/VectorReader.java b/lucene/core/src/java/org/apache/lucene/codecs/VectorReader.java
new file mode 100644
index 000000000000..15a3d4659929
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/VectorReader.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs;
+
+import java.io.Closeable;
+import java.io.IOException;
+
+import org.apache.lucene.index.VectorValues;
+import org.apache.lucene.util.Accountable;
+
+/**
+ * Reads vectors from an index.
+ */
+public abstract class VectorReader implements Closeable, Accountable {
+
+ /** Sole constructor */
+ protected VectorReader() {}
+
+ /**
+ * Checks consistency of this reader.
+ *
+ * Note that this may be costly in terms of I/O, e.g.
+ * may involve computing a checksum value against large data files.
+ * @lucene.internal
+ */
+ public abstract void checkIntegrity() throws IOException;
+
+ /** Returns the {@link VectorValues} for the given {@code field} */
+ public abstract VectorValues getVectorValues(String field) throws IOException;
+
+ /**
+ * Returns an instance optimized for merging. This instance may only be
+ * consumed in the thread that called {@link #getMergeInstance()}.
+ *
+ * The default implementation returns {@code this} */
+ public VectorReader getMergeInstance() {
+ return this;
+ }
+
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/VectorWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/VectorWriter.java
new file mode 100644
index 000000000000..5dda312466b0
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/VectorWriter.java
@@ -0,0 +1,283 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.lucene.index.DocIDMerger;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.MergeState;
+import org.apache.lucene.index.VectorValues;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.util.BytesRef;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+/**
+ * Writes vectors to an index.
+ */
+public abstract class VectorWriter implements Closeable {
+
+ /** Sole constructor */
+ protected VectorWriter() {}
+
+ /** Write all values contained in the provided reader */
+ public abstract void writeField(FieldInfo fieldInfo, VectorValues values) throws IOException;
+
+ /** Called once at the end before close */
+ public abstract void finish() throws IOException;
+
+ /** Merge the vector values from multiple segments, for all fields */
+ public void merge(MergeState mergeState) throws IOException {
+ for (int i = 0; i < mergeState.fieldInfos.length; i++) {
+ VectorReader reader = mergeState.vectorReaders[i];
+ assert reader != null || mergeState.fieldInfos[i].hasVectorValues() == false;
+ if (reader != null) {
+ reader.checkIntegrity();
+ }
+ }
+ for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) {
+ if (fieldInfo.hasVectorValues()) {
+ mergeVectors(fieldInfo, mergeState);
+ }
+ }
+ finish();
+ }
+
+ private void mergeVectors(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException {
+ if (mergeState.infoStream.isEnabled("VV")) {
+ mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo);
+ }
+ List<VectorValuesSub> subs = new ArrayList<>();
+ int dimension = -1;
+ VectorValues.ScoreFunction scoreFunction = null;
+ int nonEmptySegmentIndex = 0;
+ for (int i = 0; i < mergeState.vectorReaders.length; i++) {
+ VectorReader vectorReader = mergeState.vectorReaders[i];
+ if (vectorReader != null) {
+ if (mergeFieldInfo != null && mergeFieldInfo.hasVectorValues()) {
+ int segmentDimension = mergeFieldInfo.getVectorDimension();
+ VectorValues.ScoreFunction segmentScoreFunction = mergeFieldInfo.getVectorScoreFunction();
+ if (dimension == -1) {
+ dimension = segmentDimension;
+ scoreFunction = mergeFieldInfo.getVectorScoreFunction();
+ } else if (dimension != segmentDimension) {
+ throw new IllegalStateException("Varying dimensions for vector-valued field " + mergeFieldInfo.name
+ + ": " + dimension + "!=" + segmentDimension);
+ } else if (scoreFunction != segmentScoreFunction) {
+ throw new IllegalStateException("Varying score functions for vector-valued field " + mergeFieldInfo.name
+ + ": " + scoreFunction + "!=" + segmentScoreFunction);
+ }
+ VectorValues values = vectorReader.getVectorValues(mergeFieldInfo.name);
+ if (values != null) {
+ subs.add(new VectorValuesSub(nonEmptySegmentIndex++, mergeState.docMaps[i], values));
+ }
+ }
+ }
+ }
+ // Create a new VectorValues by iterating over the sub vectors, mapping the resulting
+ // docids using docMaps in the mergeState.
+ if (subs.size() > 0) {
+ writeField(mergeFieldInfo, new VectorValuesMerger(subs, mergeState));
+ }
+ if (mergeState.infoStream.isEnabled("VV")) {
+ mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo);
+ }
+ }
+
+ /** Tracks state of one sub-reader that we are merging */
+ private static class VectorValuesSub extends DocIDMerger.Sub {
+
+ final MergeState.DocMap docMap;
+ final VectorValues values;
+ final int segmentIndex;
+ int count;
+
+ VectorValuesSub(int segmentIndex, MergeState.DocMap docMap, VectorValues values) {
+ super(docMap);
+ this.values = values;
+ this.segmentIndex = segmentIndex;
+ this.docMap = docMap;
+ assert values.docID() == -1;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ int docId = values.nextDoc();
+ if (docId != NO_MORE_DOCS) {
+ // Note: this does count deleted docs since they are present in the to-be-merged segment
+ ++count;
+ }
+ return docId;
+ }
+ }
+
+ /**
+ * View over multiple VectorValues supporting iterator-style access via DocIDMerger. Maintains a reverse ordinal
+ * mapping for documents having values in order to support random access by dense ordinal.
+ */
+ private static class VectorValuesMerger extends VectorValues {
+ private final List<VectorValuesSub> subs;
+ private final DocIDMerger<VectorValuesSub> docIdMerger;
+ private final int[] ordBase;
+ private final int cost;
+ private final int size;
+
+ private int docId;
+ private VectorValuesSub current;
+ // For each doc with a vector, record its ord in the segments being merged. This enables random access into the
+ // unmerged segments using the ords from the merged segment.
+ private int[] ordMap;
+ private int ord;
+
+ VectorValuesMerger(List<VectorValuesSub> subs, MergeState mergeState) throws IOException {
+ this.subs = subs;
+ docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);
+ int totalCost = 0, totalSize = 0;
+ for (VectorValuesSub sub : subs) {
+ totalCost += sub.values.cost();
+ totalSize += sub.values.size();
+ }
+ cost = totalCost;
+ size = totalSize;
+ ordMap = new int[size];
+ ordBase = new int[subs.size()];
+ int lastBase = 0;
+ for (int k = 0; k < subs.size(); k++) {
+ int size = subs.get(k).values.size();
+ ordBase[k] = lastBase;
+ lastBase += size;
+ }
+ docId = -1;
+ }
+
+ @Override
+ public int docID() {
+ return docId;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ current = docIdMerger.next();
+ if (current == null) {
+ docId = NO_MORE_DOCS;
+ } else {
+ docId = current.mappedDocID;
+ ordMap[ord++] = ordBase[current.segmentIndex] + current.count - 1;
+ }
+ return docId;
+ }
+
+ @Override
+ public float[] vectorValue() throws IOException {
+ return current.values.vectorValue();
+ }
+
+ @Override
+ public BytesRef binaryValue() throws IOException {
+ return current.values.binaryValue();
+ }
+
+ @Override
+ public RandomAccess randomAccess() {
+ return new MergerRandomAccess();
+ }
+
+ @Override
+ public int advance(int target) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int size() {
+ return size;
+ }
+
+ @Override
+ public long cost() {
+ return cost;
+ }
+
+ @Override
+ public int dimension() {
+ return subs.get(0).values.dimension();
+ }
+
+ @Override
+ public VectorValues.ScoreFunction scoreFunction() {
+ return subs.get(0).values.scoreFunction();
+ }
+
+ class MergerRandomAccess implements VectorValues.RandomAccess {
+
+ private final List<RandomAccess> raSubs;
+
+ MergerRandomAccess() {
+ raSubs = new ArrayList<>(subs.size());
+ for (VectorValuesSub sub : subs) {
+ raSubs.add(sub.values.randomAccess());
+ }
+ }
+
+ @Override
+ public int size() {
+ return size;
+ }
+
+ @Override
+ public int dimension() {
+ return VectorValuesMerger.this.dimension();
+ }
+
+ @Override
+ public ScoreFunction scoreFunction() {
+ return VectorValuesMerger.this.scoreFunction();
+ }
+
+ @Override
+ public float[] vectorValue(int target) throws IOException {
+ int unmappedOrd = ordMap[target];
+ int segmentOrd = Arrays.binarySearch(ordBase, unmappedOrd);
+ if (segmentOrd < 0) {
+ // get the index of the greatest lower bound
+ segmentOrd = -2 - segmentOrd;
+ }
+ while(segmentOrd < ordBase.length - 1 && ordBase[segmentOrd + 1] == ordBase[segmentOrd]) {
+ // forward over empty segments which will share the same ordBase
+ segmentOrd++;
+ }
+ return raSubs.get(segmentOrd).vectorValue(unmappedOrd - ordBase[segmentOrd]);
+ }
+
+ @Override
+ public BytesRef binaryValue(int targetOrd) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public TopDocs search(float[] target, int k, int fanout) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ }
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java
index f8368bc5e148..0f0fe8c642fd 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java
@@ -31,6 +31,7 @@
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
@@ -148,7 +149,8 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm
lastAttributes = attributes;
try {
infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads,
- indexOptions, docValuesType, dvGen, attributes, 0, 0, 0, false);
+ indexOptions, docValuesType, dvGen, attributes, 0, 0, 0,
+ 0, VectorValues.ScoreFunction.NONE, false);
} catch (IllegalStateException e) {
throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e);
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java
index e63873a63c66..0c869e9a6dc1 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java
@@ -17,7 +17,7 @@
/**
* Components from the Lucene 5.0 index format
- * See {@link org.apache.lucene.codecs.lucene80} for an overview
+ * See {@link org.apache.lucene.codecs.lucene90} for an overview
* of the index format.
*/
package org.apache.lucene.codecs.lucene50;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java
index 3b97c2670192..4f6f42a2f99d 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java
@@ -31,6 +31,7 @@
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.DataOutput;
import org.apache.lucene.store.Directory;
@@ -164,7 +165,8 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm
try {
infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads,
indexOptions, docValuesType, dvGen, attributes,
- pointDataDimensionCount, pointIndexDimensionCount, pointNumBytes, isSoftDeletesField);
+ pointDataDimensionCount, pointIndexDimensionCount, pointNumBytes,
+ 0, VectorValues.ScoreFunction.NONE, isSoftDeletesField);
} catch (IllegalStateException e) {
throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e);
}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
index d807058f6468..9c82b1273145 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java
@@ -16,7 +16,7 @@
*/
/**
- * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene86}
+ * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene90}
* for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene60;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/package-info.java
index c2c31534a79c..f654630c097f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/package-info.java
@@ -17,7 +17,7 @@
/**
* Components from the Lucene 8.0 index format
- * See {@link org.apache.lucene.codecs.lucene84} for an overview
+ * See {@link org.apache.lucene.codecs.lucene90} for an overview
* of the index format.
*/
package org.apache.lucene.codecs.lucene80;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/package-info.java
index 5940a47dca83..00d7edd04d5c 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/package-info.java
@@ -16,7 +16,7 @@
*/
/**
- * Components from the Lucene 8.4 index format. See {@link org.apache.lucene.codecs.lucene86}
+ * Components from the Lucene 8.4 index format. See {@link org.apache.lucene.codecs.lucene90}
* for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene84;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java
index 13f35a189e6d..d486d3796a8f 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java
@@ -16,401 +16,7 @@
*/
/**
- * Lucene 8.6 file format.
- *
- * Apache Lucene - Index File Formats
- *
- *
- * Introduction
- *
- *
This document defines the index file formats used in this version of Lucene.
- * If you are using a different version of Lucene, please consult the copy of
- * docs/
that was distributed with
- * the version you are using.
- *
This document attempts to provide a high-level definition of the Apache
- * Lucene file formats.
- *
- *
- * Definitions
- *
- *
The fundamental concepts in Lucene are index, document, field and term.
- *
An index contains a sequence of documents.
- *
- * - A document is a sequence of fields.
- * - A field is a named sequence of terms.
- * - A term is a sequence of bytes.
- *
- *
The same sequence of bytes in two different fields is considered a different
- * term. Thus terms are represented as a pair: the string naming the field, and the
- * bytes within the field.
- *
- *
Inverted Indexing
- *
The index stores statistics about terms in order to make term-based search
- * more efficient. Lucene's index falls into the family of indexes known as an
- * inverted index. This is because it can list, for a term, the documents
- * that contain it. This is the inverse of the natural relationship, in which
- * documents list terms.
- *
- *
Types of Fields
- *
In Lucene, fields may be stored, in which case their text is stored
- * in the index literally, in a non-inverted manner. Fields that are inverted are
- * called indexed. A field may be both stored and indexed.
- *
The text of a field may be tokenized into terms to be indexed, or the
- * text of a field may be used literally as a term to be indexed. Most fields are
- * tokenized, but sometimes it is useful for certain identifier fields to be
- * indexed literally.
- *
See the {@link org.apache.lucene.document.Field Field}
- * java docs for more information on Fields.
- *
- *
Segments
- *
Lucene indexes may be composed of multiple sub-indexes, or segments.
- * Each segment is a fully independent index, which could be searched separately.
- * Indexes evolve by:
- *
- * - Creating new segments for newly added documents.
- * - Merging existing segments.
- *
- *
Searches may involve multiple segments and/or multiple indexes, each index
- * potentially composed of a set of segments.
- *
- *
Document Numbers
- *
Internally, Lucene refers to documents by an integer document number.
- * The first document added to an index is numbered zero, and each subsequent
- * document added gets a number one greater than the previous.
- *
Note that a document's number may change, so caution should be taken when
- * storing these numbers outside of Lucene. In particular, numbers may change in
- * the following situations:
- *
- * -
- *
The numbers stored in each segment are unique only within the segment, and
- * must be converted before they can be used in a larger context. The standard
- * technique is to allocate each segment a range of values, based on the range of
- * numbers used in that segment. To convert a document number from a segment to an
- * external value, the segment's base document number is added. To convert
- * an external value back to a segment-specific value, the segment is identified
- * by the range that the external value is in, and the segment's base value is
- * subtracted. For example two five document segments might be combined, so that
- * the first segment has a base value of zero, and the second of five. Document
- * three from the second segment would have an external value of eight.
- *
- * -
- *
When documents are deleted, gaps are created in the numbering. These are
- * eventually removed as the index evolves through merging. Deleted documents are
- * dropped when segments are merged. A freshly-merged segment thus has no gaps in
- * its numbering.
- *
- *
- *
- *
- * Index Structure Overview
- *
- *
Each segment index maintains the following:
- *
- * -
- * {@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment info}.
- * This contains metadata about a segment, such as the number of documents,
- * what files it uses, and information about how the segment is sorted
- *
- * -
- * {@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Field names}.
- * This contains the set of field names used in the index.
- *
- * -
- * Stored Field values.
- * This contains, for each document, a list of attribute-value pairs, where the attributes
- * are field names. These are used to store auxiliary information about the document, such as
- * its title, url, or an identifier to access a database. The set of stored fields are what is
- * returned for each hit when searching. This is keyed by document number.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term dictionary}.
- * A dictionary containing all of the terms used in all of the
- * indexed fields of all of the documents. The dictionary also contains the number
- * of documents which contain the term, and pointers to the term's frequency and
- * proximity data.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Frequency data}.
- * For each term in the dictionary, the numbers of all the
- * documents that contain that term, and the frequency of the term in that
- * document, unless frequencies are omitted ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
- *
- * -
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Proximity data}.
- * For each term in the dictionary, the positions that the
- * term occurs in each document. Note that this will not exist if all fields in
- * all documents omit position data.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Normalization factors}.
- * For each field in each document, a value is stored
- * that is multiplied into the score for hits on that field.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
- * For each field in each document, the term vector (sometimes
- * called document vector) may be stored. A term vector consists of term text and
- * term frequency. To add Term Vectors to your index see the
- * {@link org.apache.lucene.document.Field Field} constructors
- *
- * -
- * {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-document values}.
- * Like stored values, these are also keyed by document
- * number, but are generally intended to be loaded into main memory for fast
- * access. Whereas stored values are generally intended for summary results from
- * searches, per-document values are useful for things like scoring factors.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
- * An optional file indicating which documents are live.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}.
- * Optional pair of files, recording dimensionally indexed fields, to enable fast
- * numeric range filtering and large numeric values like BigInteger and BigDecimal (1D)
- * and geographic shape intersection (2D, 3D).
- *
- *
- *
Details on each of these are provided in their linked pages.
- *
- *
- * File Naming
- *
- *
All files belonging to a segment have the same name with varying extensions.
- * The extensions correspond to the different file formats described below. When
- * using the Compound File format (default for small segments) these files (except
- * for the Segment info file, the Lock file, and Deleted documents file) are collapsed
- * into a single .cfs file (see below for details)
- *
Typically, all segments in an index are stored in a single directory,
- * although this is not required.
- *
File names are never re-used. That is, when any file is saved
- * to the Directory it is given a never before used filename. This is achieved
- * using a simple generations approach. For example, the first segments file is
- * segments_1, then segments_2, etc. The generation is a sequential long integer
- * represented in alpha-numeric (base 36) form.
- *
- *
- * Summary of File Extensions
- *
- *
The following table summarizes the names and extensions of the files in
- * Lucene:
- *
- * lucene filenames by extension
- *
- * Name |
- * Extension |
- * Brief Description |
- *
- *
- * {@link org.apache.lucene.index.SegmentInfos Segments File} |
- * segments_N |
- * Stores information about a commit point |
- *
- *
- * Lock File |
- * write.lock |
- * The Write lock prevents multiple IndexWriters from writing to the same
- * file. |
- *
- *
- * {@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment Info} |
- * .si |
- * Stores metadata about a segment |
- *
- *
- * {@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File} |
- * .cfs, .cfe |
- * An optional "virtual" file consisting of all the other index files for
- * systems that frequently run out of file handles. |
- *
- *
- * {@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Fields} |
- * .fnm |
- * Stores information about the fields |
- *
- *
- * Field Index |
- * .fdx |
- * Contains pointers to field data |
- *
- *
- * Field Data |
- * .fdt |
- * The stored fields for documents |
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Dictionary} |
- * .tim |
- * The term dictionary, stores term info |
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Index} |
- * .tip |
- * The index into the Term Dictionary |
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Frequencies} |
- * .doc |
- * Contains the list of docs which contain each term along with frequency |
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Positions} |
- * .pos |
- * Stores position information about where a term occurs in the index |
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Payloads} |
- * .pay |
- * Stores additional per-position metadata information such as character offsets and user payloads |
- *
- *
- * {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Norms} |
- * .nvd, .nvm |
- * Encodes length and boost factors for docs and fields |
- *
- *
- * {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values} |
- * .dvd, .dvm |
- * Encodes additional scoring factors or other per-document information. |
- *
- *
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index} |
- * .tvx |
- * Stores offset into the document data file |
- *
- *
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data} |
- * .tvd |
- * Contains term vector data. |
- *
- *
- * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents} |
- * .liv |
- * Info about what documents are live |
- *
- *
- * {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values} |
- * .dii, .dim |
- * Holds indexed points, if any |
- *
- *
- *
- *
- * Lock File
- * The write lock, which is stored in the index directory by default, is named
- * "write.lock". If the lock directory is different from the index directory then
- * the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
- * derived from the full path to the index directory. When this file is present, a
- * writer is currently modifying the index (adding or removing documents). This
- * lock file ensures that only one writer is modifying the index at a time.
- *
- * History
- * Compatibility notes are provided in this document, describing how file
- * formats have changed from prior versions:
- *
- * - In version 2.1, the file format was changed to allow lock-less commits (ie,
- * no more commit lock). The change is fully backwards compatible: you can open a
- * pre-2.1 index for searching or adding/deleting of docs. When the new segments
- * file is saved (committed), it will be written in the new file format (meaning
- * no specific "upgrade" process is needed). But note that once a commit has
- * occurred, pre-2.1 Lucene will not be able to read the index.
- * - In version 2.3, the file format was changed to allow segments to share a
- * single set of doc store (vectors & stored fields) files. This allows for
- * faster indexing in certain cases. The change is fully backwards compatible (in
- * the same way as the lock-less commits change in 2.1).
- * - In version 2.4, Strings are now written as true UTF-8 byte sequence, not
- * Java's modified UTF-8. See
- * LUCENE-510 for details.
- * - In version 2.9, an optional opaque Map<String,String> CommitUserData
- * may be passed to IndexWriter's commit methods (and later retrieved), which is
- * recorded in the segments_N file. See
- * LUCENE-1382 for details. Also,
- * diagnostics were added to each segment written recording details about why it
- * was written (due to flush, merge; which OS/JRE was used; etc.). See issue
- * LUCENE-1654 for details.
- * - In version 3.0, compressed fields are no longer written to the index (they
- * can still be read, but on merge the new segment will write them, uncompressed).
- * See issue LUCENE-1960
- * for details.
- * - In version 3.1, segments records the code version that created them. See
- * LUCENE-2720 for details.
- * Additionally segments track explicitly whether or not they have term vectors.
- * See LUCENE-2811
- * for details.
- * - In version 3.2, numeric fields are written as natively to stored fields
- * file, previously they were stored in text format only.
- * - In version 3.4, fields can omit position data while still indexing term
- * frequencies.
- * - In version 4.0, the format of the inverted index became extensible via
- * the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
- * ({@code DocValues}) was introduced. Normalization factors need no longer be a
- * single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
- * Terms need not be unicode strings, they can be any byte sequence. Term offsets
- * can optionally be indexed into the postings lists. Payloads can be stored in the
- * term vectors.
- * - In version 4.1, the format of the postings list changed to use either
- * of FOR compression or variable-byte encoding, depending upon the frequency
- * of the term. Terms appearing only once were changed to inline directly into
- * the term dictionary. Stored fields are compressed by default.
- * - In version 4.2, term vectors are compressed by default. DocValues has
- * a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
- * on multi-valued fields.
- * - In version 4.5, DocValues were extended to explicitly represent missing values.
- * - In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
- * allow updating NumericDocValues fields.
- * - In version 4.8, checksum footers were added to the end of each index file
- * for improved data integrity. Specifically, the last 8 bytes of every index file
- * contain the zlib-crc32 checksum of the file.
- * - In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
- * that is suitable for faceting/sorting/analytics.
- *
- In version 5.4, DocValues have been improved to store more information on disk:
- * addresses for binary fields and ord indexes for multi-valued fields.
- *
- In version 6.0, Points were added, for multi-dimensional range/distance search.
- *
- In version 6.2, new Segment info format that reads/writes the index sort, to support index sorting.
- *
- In version 7.0, DocValues have been improved to better support sparse doc values
- * thanks to an iterator API.
- * - In version 8.0, postings have been enhanced to record, for each block of
- * doc ids, the (term freq, normalization factor) pairs that may trigger the
- * maximum score of the block. This information is recorded alongside skip data
- * in order to be able to skip blocks of doc ids if they may not produce high
- * enough scores.
- * Additionally doc values and norms has been extended with jump-tables to make access O(1)
- * instead of O(n), where n is the number of elements to skip when advancing in the data.
- * - In version 8.4, postings, positions, offsets and payload lengths have move to a more
- * performant encoding that is vectorized.
- * - In version 8.6, index sort serialization is delegated to the sorts themselves, to
- * allow user-defined sorts to be used
- *
- *
- * Limitations
- *
- *
Lucene uses a Java int
to refer to
- * document numbers, and the index file format uses an Int32
- * on-disk to store document numbers. This is a limitation
- * of both the index file format and the current implementation. Eventually these
- * should be replaced with either UInt64
values, or
- * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.
- *
+ * Components from the Lucene 8.6 index format. See {@link org.apache.lucene.codecs.lucene90}
+ * for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene86;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene87/package-info.java
index 75facdb2fb41..0df8615a21e9 100644
--- a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/package-info.java
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene87/package-info.java
@@ -16,401 +16,7 @@
*/
/**
- * Lucene 8.7 file format.
- *
- * Apache Lucene - Index File Formats
- *
- *
- * Introduction
- *
- *
This document defines the index file formats used in this version of Lucene.
- * If you are using a different version of Lucene, please consult the copy of
- * docs/
that was distributed with
- * the version you are using.
- *
This document attempts to provide a high-level definition of the Apache
- * Lucene file formats.
- *
- *
- * Definitions
- *
- *
The fundamental concepts in Lucene are index, document, field and term.
- *
An index contains a sequence of documents.
- *
- * - A document is a sequence of fields.
- * - A field is a named sequence of terms.
- * - A term is a sequence of bytes.
- *
- *
The same sequence of bytes in two different fields is considered a different
- * term. Thus terms are represented as a pair: the string naming the field, and the
- * bytes within the field.
- *
- *
Inverted Indexing
- *
The index stores statistics about terms in order to make term-based search
- * more efficient. Lucene's index falls into the family of indexes known as an
- * inverted index. This is because it can list, for a term, the documents
- * that contain it. This is the inverse of the natural relationship, in which
- * documents list terms.
- *
- *
Types of Fields
- *
In Lucene, fields may be stored, in which case their text is stored
- * in the index literally, in a non-inverted manner. Fields that are inverted are
- * called indexed. A field may be both stored and indexed.
- *
The text of a field may be tokenized into terms to be indexed, or the
- * text of a field may be used literally as a term to be indexed. Most fields are
- * tokenized, but sometimes it is useful for certain identifier fields to be
- * indexed literally.
- *
See the {@link org.apache.lucene.document.Field Field}
- * java docs for more information on Fields.
- *
- *
Segments
- *
Lucene indexes may be composed of multiple sub-indexes, or segments.
- * Each segment is a fully independent index, which could be searched separately.
- * Indexes evolve by:
- *
- * - Creating new segments for newly added documents.
- * - Merging existing segments.
- *
- *
Searches may involve multiple segments and/or multiple indexes, each index
- * potentially composed of a set of segments.
- *
- *
Document Numbers
- *
Internally, Lucene refers to documents by an integer document number.
- * The first document added to an index is numbered zero, and each subsequent
- * document added gets a number one greater than the previous.
- *
Note that a document's number may change, so caution should be taken when
- * storing these numbers outside of Lucene. In particular, numbers may change in
- * the following situations:
- *
- * -
- *
The numbers stored in each segment are unique only within the segment, and
- * must be converted before they can be used in a larger context. The standard
- * technique is to allocate each segment a range of values, based on the range of
- * numbers used in that segment. To convert a document number from a segment to an
- * external value, the segment's base document number is added. To convert
- * an external value back to a segment-specific value, the segment is identified
- * by the range that the external value is in, and the segment's base value is
- * subtracted. For example two five document segments might be combined, so that
- * the first segment has a base value of zero, and the second of five. Document
- * three from the second segment would have an external value of eight.
- *
- * -
- *
When documents are deleted, gaps are created in the numbering. These are
- * eventually removed as the index evolves through merging. Deleted documents are
- * dropped when segments are merged. A freshly-merged segment thus has no gaps in
- * its numbering.
- *
- *
- *
- *
- * Index Structure Overview
- *
- *
Each segment index maintains the following:
- *
- * -
- * {@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment info}.
- * This contains metadata about a segment, such as the number of documents,
- * what files it uses, and information about how the segment is sorted
- *
- * -
- * {@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Field names}.
- * This contains the set of field names used in the index.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Stored Field values}.
- * This contains, for each document, a list of attribute-value pairs, where the attributes
- * are field names. These are used to store auxiliary information about the document, such as
- * its title, url, or an identifier to access a database. The set of stored fields are what is
- * returned for each hit when searching. This is keyed by document number.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term dictionary}.
- * A dictionary containing all of the terms used in all of the
- * indexed fields of all of the documents. The dictionary also contains the number
- * of documents which contain the term, and pointers to the term's frequency and
- * proximity data.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Frequency data}.
- * For each term in the dictionary, the numbers of all the
- * documents that contain that term, and the frequency of the term in that
- * document, unless frequencies are omitted ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
- *
- * -
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Proximity data}.
- * For each term in the dictionary, the positions that the
- * term occurs in each document. Note that this will not exist if all fields in
- * all documents omit position data.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Normalization factors}.
- * For each field in each document, a value is stored
- * that is multiplied into the score for hits on that field.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
- * For each field in each document, the term vector (sometimes
- * called document vector) may be stored. A term vector consists of term text and
- * term frequency. To add Term Vectors to your index see the
- * {@link org.apache.lucene.document.Field Field} constructors
- *
- * -
- * {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-document values}.
- * Like stored values, these are also keyed by document
- * number, but are generally intended to be loaded into main memory for fast
- * access. Whereas stored values are generally intended for summary results from
- * searches, per-document values are useful for things like scoring factors.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
- * An optional file indicating which documents are live.
- *
- * -
- * {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}.
- * Optional pair of files, recording dimensionally indexed fields, to enable fast
- * numeric range filtering and large numeric values like BigInteger and BigDecimal (1D)
- * and geographic shape intersection (2D, 3D).
- *
- *
- *
Details on each of these are provided in their linked pages.
- *
- *
- * File Naming
- *
- *
All files belonging to a segment have the same name with varying extensions.
- * The extensions correspond to the different file formats described below. When
- * using the Compound File format (default for small segments) these files (except
- * for the Segment info file, the Lock file, and Deleted documents file) are collapsed
- * into a single .cfs file (see below for details)
- *
Typically, all segments in an index are stored in a single directory,
- * although this is not required.
- *
File names are never re-used. That is, when any file is saved
- * to the Directory it is given a never before used filename. This is achieved
- * using a simple generations approach. For example, the first segments file is
- * segments_1, then segments_2, etc. The generation is a sequential long integer
- * represented in alpha-numeric (base 36) form.
- *
- *
- * Summary of File Extensions
- *
- *
The following table summarizes the names and extensions of the files in
- * Lucene:
- *
- * lucene filenames by extension
- *
- * Name |
- * Extension |
- * Brief Description |
- *
- *
- * {@link org.apache.lucene.index.SegmentInfos Segments File} |
- * segments_N |
- * Stores information about a commit point |
- *
- *
- * Lock File |
- * write.lock |
- * The Write lock prevents multiple IndexWriters from writing to the same
- * file. |
- *
- *
- * {@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment Info} |
- * .si |
- * Stores metadata about a segment |
- *
- *
- * {@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File} |
- * .cfs, .cfe |
- * An optional "virtual" file consisting of all the other index files for
- * systems that frequently run out of file handles. |
- *
- *
- * {@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Fields} |
- * .fnm |
- * Stores information about the fields |
- *
- *
- * {@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Index} |
- * .fdx |
- * Contains pointers to field data |
- *
- *
- * {@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Data} |
- * .fdt |
- * The stored fields for documents |
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Dictionary} |
- * .tim |
- * The term dictionary, stores term info |
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Index} |
- * .tip |
- * The index into the Term Dictionary |
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Frequencies} |
- * .doc |
- * Contains the list of docs which contain each term along with frequency |
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Positions} |
- * .pos |
- * Stores position information about where a term occurs in the index |
- *
- *
- * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Payloads} |
- * .pay |
- * Stores additional per-position metadata information such as character offsets and user payloads |
- *
- *
- * {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Norms} |
- * .nvd, .nvm |
- * Encodes length and boost factors for docs and fields |
- *
- *
- * {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values} |
- * .dvd, .dvm |
- * Encodes additional scoring factors or other per-document information. |
- *
- *
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index} |
- * .tvx |
- * Stores offset into the document data file |
- *
- *
- * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data} |
- * .tvd |
- * Contains term vector data. |
- *
- *
- * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents} |
- * .liv |
- * Info about what documents are live |
- *
- *
- * {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values} |
- * .dii, .dim |
- * Holds indexed points, if any |
- *
- *
- *
- *
- * Lock File
- * The write lock, which is stored in the index directory by default, is named
- * "write.lock". If the lock directory is different from the index directory then
- * the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
- * derived from the full path to the index directory. When this file is present, a
- * writer is currently modifying the index (adding or removing documents). This
- * lock file ensures that only one writer is modifying the index at a time.
- *
- * History
- * Compatibility notes are provided in this document, describing how file
- * formats have changed from prior versions:
- *
- * - In version 2.1, the file format was changed to allow lock-less commits (ie,
- * no more commit lock). The change is fully backwards compatible: you can open a
- * pre-2.1 index for searching or adding/deleting of docs. When the new segments
- * file is saved (committed), it will be written in the new file format (meaning
- * no specific "upgrade" process is needed). But note that once a commit has
- * occurred, pre-2.1 Lucene will not be able to read the index.
- * - In version 2.3, the file format was changed to allow segments to share a
- * single set of doc store (vectors & stored fields) files. This allows for
- * faster indexing in certain cases. The change is fully backwards compatible (in
- * the same way as the lock-less commits change in 2.1).
- * - In version 2.4, Strings are now written as true UTF-8 byte sequence, not
- * Java's modified UTF-8. See
- * LUCENE-510 for details.
- * - In version 2.9, an optional opaque Map<String,String> CommitUserData
- * may be passed to IndexWriter's commit methods (and later retrieved), which is
- * recorded in the segments_N file. See
- * LUCENE-1382 for details. Also,
- * diagnostics were added to each segment written recording details about why it
- * was written (due to flush, merge; which OS/JRE was used; etc.). See issue
- * LUCENE-1654 for details.
- * - In version 3.0, compressed fields are no longer written to the index (they
- * can still be read, but on merge the new segment will write them, uncompressed).
- * See issue LUCENE-1960
- * for details.
- * - In version 3.1, segments records the code version that created them. See
- * LUCENE-2720 for details.
- * Additionally segments track explicitly whether or not they have term vectors.
- * See LUCENE-2811
- * for details.
- * - In version 3.2, numeric fields are written as natively to stored fields
- * file, previously they were stored in text format only.
- * - In version 3.4, fields can omit position data while still indexing term
- * frequencies.
- * - In version 4.0, the format of the inverted index became extensible via
- * the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
- * ({@code DocValues}) was introduced. Normalization factors need no longer be a
- * single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
- * Terms need not be unicode strings, they can be any byte sequence. Term offsets
- * can optionally be indexed into the postings lists. Payloads can be stored in the
- * term vectors.
- * - In version 4.1, the format of the postings list changed to use either
- * of FOR compression or variable-byte encoding, depending upon the frequency
- * of the term. Terms appearing only once were changed to inline directly into
- * the term dictionary. Stored fields are compressed by default.
- * - In version 4.2, term vectors are compressed by default. DocValues has
- * a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
- * on multi-valued fields.
- * - In version 4.5, DocValues were extended to explicitly represent missing values.
- * - In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
- * allow updating NumericDocValues fields.
- * - In version 4.8, checksum footers were added to the end of each index file
- * for improved data integrity. Specifically, the last 8 bytes of every index file
- * contain the zlib-crc32 checksum of the file.
- * - In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
- * that is suitable for faceting/sorting/analytics.
- *
- In version 5.4, DocValues have been improved to store more information on disk:
- * addresses for binary fields and ord indexes for multi-valued fields.
- *
- In version 6.0, Points were added, for multi-dimensional range/distance search.
- *
- In version 6.2, new Segment info format that reads/writes the index sort, to support index sorting.
- *
- In version 7.0, DocValues have been improved to better support sparse doc values
- * thanks to an iterator API.
- * - In version 8.0, postings have been enhanced to record, for each block of
- * doc ids, the (term freq, normalization factor) pairs that may trigger the
- * maximum score of the block. This information is recorded alongside skip data
- * in order to be able to skip blocks of doc ids if they may not produce high
- * enough scores.
- * Additionally doc values and norms has been extended with jump-tables to make access O(1)
- * instead of O(n), where n is the number of elements to skip when advancing in the data.
- * - In version 8.4, postings, positions, offsets and payload lengths have move to a more
- * performant encoding that is vectorized.
- * - In version 8.6, index sort serialization is delegated to the sorts themselves, to
- * allow user-defined sorts to be used
- *
- *
- * Limitations
- *
- *
Lucene uses a Java int
to refer to
- * document numbers, and the index file format uses an Int32
- * on-disk to store document numbers. This is a limitation
- * of both the index file format and the current implementation. Eventually these
- * should be replaced with either UInt64
values, or
- * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.
- *
+ * Components from the Lucene 8.7 index format. See {@link org.apache.lucene.codecs.lucene90}
+ * for an overview of the current index format.
*/
package org.apache.lucene.codecs.lucene87;
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java
new file mode 100644
index 000000000000..fa8f7afae2db
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java
@@ -0,0 +1,183 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90;
+
+import java.util.Objects;
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.codecs.CompoundFormat;
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.FieldInfosFormat;
+import org.apache.lucene.codecs.FilterCodec;
+import org.apache.lucene.codecs.VectorFormat;
+import org.apache.lucene.codecs.LiveDocsFormat;
+import org.apache.lucene.codecs.NormsFormat;
+import org.apache.lucene.codecs.PointsFormat;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.SegmentInfoFormat;
+import org.apache.lucene.codecs.StoredFieldsFormat;
+import org.apache.lucene.codecs.TermVectorsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
+import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
+import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat;
+import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
+import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat;
+import org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat;
+import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat;
+import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
+import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
+
+/**
+ * Implements the Lucene 9.0 index format
+ *
+ * If you want to reuse functionality of this codec in another codec, extend
+ * {@link FilterCodec}.
+ *
+ * @see org.apache.lucene.codecs.lucene90 package documentation for file format details.
+ *
+ * @lucene.experimental
+ */
+public class Lucene90Codec extends Codec {
+ private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat();
+ private final FieldInfosFormat fieldInfosFormat = new Lucene90FieldInfosFormat();
+ private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat();
+ private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat();
+ private final CompoundFormat compoundFormat = new Lucene50CompoundFormat();
+ private final PostingsFormat defaultFormat;
+
+ private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() {
+ @Override
+ public PostingsFormat getPostingsFormatForField(String field) {
+ return Lucene90Codec.this.getPostingsFormatForField(field);
+ }
+ };
+
+ private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() {
+ @Override
+ public DocValuesFormat getDocValuesFormatForField(String field) {
+ return Lucene90Codec.this.getDocValuesFormatForField(field);
+ }
+ };
+
+ private final VectorFormat vectorFormat = new Lucene90VectorFormat();
+
+ private final StoredFieldsFormat storedFieldsFormat;
+
+ /**
+ * Instantiates a new codec.
+ */
+ public Lucene90Codec() {
+ this(Lucene87StoredFieldsFormat.Mode.BEST_SPEED);
+ }
+
+ /**
+ * Instantiates a new codec, specifying the stored fields compression
+ * mode to use.
+ * @param mode stored fields compression mode to use for newly
+ * flushed/merged segments.
+ */
+ public Lucene90Codec(Lucene87StoredFieldsFormat.Mode mode) {
+ super("Lucene90");
+ this.storedFieldsFormat = new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode));
+ this.defaultFormat = new Lucene84PostingsFormat();
+ }
+
+ @Override
+ public final StoredFieldsFormat storedFieldsFormat() {
+ return storedFieldsFormat;
+ }
+
+ @Override
+ public final TermVectorsFormat termVectorsFormat() {
+ return vectorsFormat;
+ }
+
+ @Override
+ public final PostingsFormat postingsFormat() {
+ return postingsFormat;
+ }
+ @Override
+ public final FieldInfosFormat fieldInfosFormat() {
+ return fieldInfosFormat;
+ }
+
+ @Override
+ public final SegmentInfoFormat segmentInfoFormat() {
+ return segmentInfosFormat;
+ }
+
+ @Override
+ public final LiveDocsFormat liveDocsFormat() {
+ return liveDocsFormat;
+ }
+
+ @Override
+ public final CompoundFormat compoundFormat() {
+ return compoundFormat;
+ }
+
+ @Override
+ public final PointsFormat pointsFormat() {
+ return new Lucene86PointsFormat();
+ }
+
+ @Override
+ public final VectorFormat vectorFormat() {
+ return vectorFormat;
+ }
+
+ /** Returns the postings format that should be used for writing
+ * new segments of field
.
+ *
+ * The default implementation always returns "Lucene84".
+ *
+ * WARNING: if you subclass, you are responsible for index
+ * backwards compatibility: future version of Lucene are only
+ * guaranteed to be able to read the default implementation,
+ */
+ public PostingsFormat getPostingsFormatForField(String field) {
+ return defaultFormat;
+ }
+
+ /** Returns the docvalues format that should be used for writing
+ * new segments of field
.
+ *
+ * The default implementation always returns "Lucene80".
+ *
+ * WARNING: if you subclass, you are responsible for index
+ * backwards compatibility: future version of Lucene are only
+ * guaranteed to be able to read the default implementation.
+ */
+ public DocValuesFormat getDocValuesFormatForField(String field) {
+ return defaultDVFormat;
+ }
+
+ @Override
+ public final DocValuesFormat docValuesFormat() {
+ return docValuesFormat;
+ }
+
+ private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene80");
+
+ private final NormsFormat normsFormat = new Lucene80NormsFormat();
+
+ @Override
+ public final NormsFormat normsFormat() {
+ return normsFormat;
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90FieldInfosFormat.java
new file mode 100644
index 000000000000..cab87e9102e3
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90FieldInfosFormat.java
@@ -0,0 +1,339 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.codecs.lucene90;
+
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.FieldInfosFormat;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.VectorValues;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+
+/**
+ * Lucene 9.0 Field Infos format.
+ * <p>Field names are stored in the field info file, with suffix <code>.fnm</code>.
+ * <p>FieldInfos (.fnm) --&gt; Header,FieldsCount, &lt;FieldName,FieldNumber,
+ * FieldBits,DocValuesBits,DocValuesGen,Attributes,DimensionCount,DimensionNumBytes&gt; <sup>FieldsCount</sup>,Footer
+ * <p>Data types:
+ * <ul>
+ *   <li>Header --&gt; {@link CodecUtil#checkIndexHeader IndexHeader}</li>
+ *   <li>FieldsCount --&gt; {@link DataOutput#writeVInt VInt}</li>
+ *   <li>FieldName --&gt; {@link DataOutput#writeString String}</li>
+ *   <li>FieldBits, IndexOptions, DocValuesBits --&gt; {@link DataOutput#writeByte Byte}</li>
+ *   <li>FieldNumber, DimensionCount, DimensionNumBytes --&gt; {@link DataOutput#writeInt VInt}</li>
+ *   <li>Attributes --&gt; {@link DataOutput#writeMapOfStrings Map&lt;String,String&gt;}</li>
+ *   <li>DocValuesGen --&gt; {@link DataOutput#writeLong(long) Int64}</li>
+ *   <li>Footer --&gt; {@link CodecUtil#writeFooter CodecFooter}</li>
+ * </ul>
+ * <p>Field Descriptions:
+ * <ul>
+ *   <li>FieldsCount: the number of fields in this file.</li>
+ *   <li>FieldName: name of the field as a UTF-8 String.</li>
+ *   <li>FieldNumber: the field's number. Note that unlike previous versions of
+ *       Lucene, the fields are not numbered implicitly by their order in the
+ *       file, instead explicitly.</li>
+ *   <li>FieldBits: a byte containing field options.
+ *     <ul>
+ *       <li>The low order bit (0x1) is one for fields that have term vectors
+ *           stored, and zero for fields without term vectors.</li>
+ *       <li>If the second lowest order-bit is set (0x2), norms are omitted for the
+ *           indexed field.</li>
+ *       <li>If the third lowest-order bit is set (0x4), payloads are stored for the
+ *           indexed field.</li>
+ *     </ul>
+ *   </li>
+ *   <li>IndexOptions: a byte containing index options.
+ *     <ul>
+ *       <li>0: not indexed</li>
+ *       <li>1: indexed as DOCS_ONLY</li>
+ *       <li>2: indexed as DOCS_AND_FREQS</li>
+ *       <li>3: indexed as DOCS_AND_FREQS_AND_POSITIONS</li>
+ *       <li>4: indexed as DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS</li>
+ *     </ul>
+ *   </li>
+ *   <li>DocValuesBits: a byte containing per-document value types. The type
+ *       recorded as two four-bit integers, with the high-order bits representing
+ *       <code>norms</code> options, and the low-order bits representing
+ *       {@code DocValues} options. Each four-bit integer can be decoded as such:
+ *     <ul>
+ *       <li>0: no DocValues for this field.</li>
+ *       <li>1: NumericDocValues. ({@link DocValuesType#NUMERIC})</li>
+ *       <li>2: BinaryDocValues. ({@code DocValuesType#BINARY})</li>
+ *       <li>3: SortedDocValues. ({@code DocValuesType#SORTED})</li>
+ *     </ul>
+ *   </li>
+ *   <li>DocValuesGen is the generation count of the field's DocValues. If this is -1,
+ *       there are no DocValues updates to that field. Anything above zero means there
+ *       are updates stored by {@link DocValuesFormat}.</li>
+ *   <li>Attributes: a key-value map of codec-private attributes.</li>
+ *   <li>PointDimensionCount, PointNumBytes: these are non-zero only if the field is
+ *       indexed as points, e.g. using {@link org.apache.lucene.document.LongPoint}</li>
+ *   <li>VectorDimension: it is non-zero if the field is indexed as vectors.</li>
+ *   <li>VectorDistFunction: a byte containing distance function used for similarity calculation.
+ *     <ul>
+ *       <li>0: no distance function is defined for this field.</li>
+ *       <li>1: EUCLIDEAN distance. ({@link org.apache.lucene.index.VectorValues.ScoreFunction#EUCLIDEAN})</li>
+ *       <li>2: DOT_PRODUCT score. ({@link org.apache.lucene.index.VectorValues.ScoreFunction#DOT_PRODUCT})</li>
+ *     </ul>
+ *   </li>
+ * </ul>
+ *
+ * @lucene.experimental
+ */
+public final class Lucene90FieldInfosFormat extends FieldInfosFormat {
+
+  /** Sole constructor. */
+  public Lucene90FieldInfosFormat() {
+  }
+
+  @Override
+  public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext context) throws IOException {
+    final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, EXTENSION);
+    try (ChecksumIndexInput input = directory.openChecksumInput(fileName, context)) {
+      Throwable priorE = null;
+      FieldInfo[] infos = null;
+      try {
+        int version = CodecUtil.checkIndexHeader(input,
+                                   Lucene90FieldInfosFormat.CODEC_NAME,
+                                   Lucene90FieldInfosFormat.FORMAT_START,
+                                   Lucene90FieldInfosFormat.FORMAT_CURRENT,
+                                   segmentInfo.getId(), segmentSuffix);
+
+        final int size = input.readVInt(); // number of field infos recorded in this file
+        infos = new FieldInfo[size];
+
+        // previous field's attribute map, we share when possible:
+        Map<String, String> lastAttributes = Collections.emptyMap();
+
+        for (int i = 0; i < size; i++) {
+          String name = input.readString();
+          final int fieldNumber = input.readVInt();
+          if (fieldNumber < 0) {
+            throw new CorruptIndexException("invalid field number for field: " + name + ", fieldNumber=" + fieldNumber, input);
+          }
+          byte bits = input.readByte();
+          boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0;
+          boolean omitNorms = (bits & OMIT_NORMS) != 0;
+          boolean storePayloads = (bits & STORE_PAYLOADS) != 0;
+          boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0;
+
+          final IndexOptions indexOptions = getIndexOptions(input, input.readByte());
+
+          // DV Types are packed in one byte
+          final DocValuesType docValuesType = getDocValuesType(input, input.readByte());
+          final long dvGen = input.readLong();
+          Map<String, String> attributes = input.readMapOfStrings();
+          // just use the last field's map if its the same
+          if (attributes.equals(lastAttributes)) {
+            attributes = lastAttributes;
+          }
+          lastAttributes = attributes;
+          int pointDataDimensionCount = input.readVInt();
+          int pointNumBytes;
+          int pointIndexDimensionCount = pointDataDimensionCount;
+          if (pointDataDimensionCount != 0) {
+            // index dimension count was only written separately from FORMAT_SELECTIVE_INDEXING on
+            if (version >= Lucene90FieldInfosFormat.FORMAT_SELECTIVE_INDEXING) {
+              pointIndexDimensionCount = input.readVInt();
+            }
+            pointNumBytes = input.readVInt();
+          } else {
+            pointNumBytes = 0;
+          }
+          final int vectorDimension = input.readVInt();
+          final VectorValues.ScoreFunction vectorDistFunc = getDistFunc(input, input.readByte());
+
+          try {
+            infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads,
+                                     indexOptions, docValuesType, dvGen, attributes,
+                                     pointDataDimensionCount, pointIndexDimensionCount, pointNumBytes, vectorDimension, vectorDistFunc, isSoftDeletesField);
+            infos[i].checkConsistency();
+          } catch (IllegalStateException e) {
+            throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e);
+          }
+        }
+      } catch (Throwable exception) {
+        priorE = exception;
+      } finally {
+        // checkFooter rethrows priorE (if any) wrapped with corruption context
+        CodecUtil.checkFooter(input, priorE);
+      }
+      return new FieldInfos(infos);
+    }
+  }
+
+  static {
+    // We "mirror" DocValues enum values with the constants below; let's try to ensure if we add a new DocValuesType while this format is
+    // still used for writing, we remember to fix this encoding:
+    assert DocValuesType.values().length == 6;
+  }
+
+  /** Encodes a DocValuesType as one byte; inverse of {@link #getDocValuesType}. */
+  private static byte docValuesByte(DocValuesType type) {
+    switch (type) {
+      case NONE:
+        return 0;
+      case NUMERIC:
+        return 1;
+      case BINARY:
+        return 2;
+      case SORTED:
+        return 3;
+      case SORTED_SET:
+        return 4;
+      case SORTED_NUMERIC:
+        return 5;
+      default:
+        // BUG
+        throw new AssertionError("unhandled DocValuesType: " + type);
+    }
+  }
+
+  /** Decodes the on-disk doc-values byte; throws {@link CorruptIndexException} on unknown values. */
+  private static DocValuesType getDocValuesType(IndexInput input, byte b) throws IOException {
+    switch (b) {
+      case 0:
+        return DocValuesType.NONE;
+      case 1:
+        return DocValuesType.NUMERIC;
+      case 2:
+        return DocValuesType.BINARY;
+      case 3:
+        return DocValuesType.SORTED;
+      case 4:
+        return DocValuesType.SORTED_SET;
+      case 5:
+        return DocValuesType.SORTED_NUMERIC;
+      default:
+        throw new CorruptIndexException("invalid docvalues byte: " + b, input);
+    }
+  }
+
+  /** Decodes the on-disk score-function byte (the ScoreFunction ordinal) with range checking. */
+  private static VectorValues.ScoreFunction getDistFunc(IndexInput input, byte b) throws IOException {
+    if (b < 0 || b >= VectorValues.ScoreFunction.values().length) {
+      throw new CorruptIndexException("invalid distance function: " + b, input);
+    }
+    return VectorValues.ScoreFunction.values()[b];
+  }
+
+  static {
+    // We "mirror" IndexOptions enum values with the constants below; let's try to ensure if we add a new IndexOption while this format is
+    // still used for writing, we remember to fix this encoding:
+    assert IndexOptions.values().length == 5;
+  }
+
+  /** Encodes IndexOptions as one byte; inverse of {@link #getIndexOptions}. */
+  private static byte indexOptionsByte(IndexOptions indexOptions) {
+    switch (indexOptions) {
+      case NONE:
+        return 0;
+      case DOCS:
+        return 1;
+      case DOCS_AND_FREQS:
+        return 2;
+      case DOCS_AND_FREQS_AND_POSITIONS:
+        return 3;
+      case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS:
+        return 4;
+      default:
+        // BUG:
+        throw new AssertionError("unhandled IndexOptions: " + indexOptions);
+    }
+  }
+
+  /** Decodes the on-disk index-options byte; throws {@link CorruptIndexException} on unknown values. */
+  private static IndexOptions getIndexOptions(IndexInput input, byte b) throws IOException {
+    switch (b) {
+      case 0:
+        return IndexOptions.NONE;
+      case 1:
+        return IndexOptions.DOCS;
+      case 2:
+        return IndexOptions.DOCS_AND_FREQS;
+      case 3:
+        return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
+      case 4:
+        return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
+      default:
+        // BUG
+        throw new CorruptIndexException("invalid IndexOptions byte: " + b, input);
+    }
+  }
+
+  @Override
+  public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException {
+    final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, EXTENSION);
+    try (IndexOutput output = directory.createOutput(fileName, context)) {
+      CodecUtil.writeIndexHeader(output, Lucene90FieldInfosFormat.CODEC_NAME, Lucene90FieldInfosFormat.FORMAT_CURRENT, segmentInfo.getId(), segmentSuffix);
+      output.writeVInt(infos.size());
+      for (FieldInfo fi : infos) {
+        fi.checkConsistency();
+
+        output.writeString(fi.name);
+        output.writeVInt(fi.number);
+
+        byte bits = 0x0;
+        if (fi.hasVectors()) bits |= STORE_TERMVECTOR;
+        if (fi.omitsNorms()) bits |= OMIT_NORMS;
+        if (fi.hasPayloads()) bits |= STORE_PAYLOADS;
+        if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD;
+        output.writeByte(bits);
+
+        output.writeByte(indexOptionsByte(fi.getIndexOptions()));
+
+        // pack the DV type and hasNorms in one byte
+        output.writeByte(docValuesByte(fi.getDocValuesType()));
+        output.writeLong(fi.getDocValuesGen());
+        output.writeMapOfStrings(fi.attributes());
+        output.writeVInt(fi.getPointDimensionCount());
+        if (fi.getPointDimensionCount() != 0) {
+          output.writeVInt(fi.getPointIndexDimensionCount());
+          output.writeVInt(fi.getPointNumBytes());
+        }
+        output.writeVInt(fi.getVectorDimension());
+        // read back by getDistFunc, which validates the ordinal range
+        output.writeByte((byte) fi.getVectorScoreFunction().ordinal());
+      }
+      CodecUtil.writeFooter(output);
+    }
+  }
+
+  /** Extension of field infos */
+  static final String EXTENSION = "fnm";
+
+  // Codec header
+  static final String CODEC_NAME = "Lucene90FieldInfos";
+  static final int FORMAT_START = 0;
+  static final int FORMAT_SOFT_DELETES = 1;
+  static final int FORMAT_SELECTIVE_INDEXING = 2;
+  static final int FORMAT_CURRENT = FORMAT_SELECTIVE_INDEXING;
+
+  // Field flags
+  static final byte STORE_TERMVECTOR = 0x1;
+  static final byte OMIT_NORMS = 0x2;
+  static final byte STORE_PAYLOADS = 0x4;
+  static final byte SOFT_DELETES_FIELD = 0x8;
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorFormat.java
new file mode 100644
index 000000000000..632bc8154d8a
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorFormat.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene90;
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.VectorFormat;
+import org.apache.lucene.codecs.VectorReader;
+import org.apache.lucene.codecs.VectorWriter;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+
+/**
+ * Lucene 9.0 vector format, which encodes dense numeric vector values.
+ * TODO: add support for approximate KNN search.
+ */
+public final class Lucene90VectorFormat extends VectorFormat {
+
+  // Codec names written into the index headers of the metadata and data files.
+  static final String META_CODEC_NAME = "Lucene90VectorFormatMeta";
+  static final String VECTOR_DATA_CODEC_NAME = "Lucene90VectorFormatData";
+
+  // Per-field metadata lives in .vem; the raw vector data lives in .vec.
+  static final String META_EXTENSION = "vem";
+  static final String VECTOR_DATA_EXTENSION = "vec";
+
+  // Format version bounds checked by CodecUtil.checkIndexHeader at read time.
+  static final int VERSION_START = 0;
+  static final int VERSION_CURRENT = VERSION_START;
+
+  /** Sole constructor */
+  public Lucene90VectorFormat() {
+  }
+
+  @Override
+  public VectorWriter fieldsWriter(SegmentWriteState state) throws IOException {
+    return new Lucene90VectorWriter(state);
+  }
+
+  @Override
+  public VectorReader fieldsReader(SegmentReadState state) throws IOException {
+    return new Lucene90VectorReader(state);
+  }
+
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorReader.java
new file mode 100644
index 000000000000..4a32a361fb7d
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorReader.java
@@ -0,0 +1,345 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene90;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.FloatBuffer;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.VectorReader;
+import org.apache.lucene.index.CorruptIndexException;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.VectorValues;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/**
+ * Reads vectors from the index segments.
+ * @lucene.experimental
+ */
+public final class Lucene90VectorReader extends VectorReader {
+
+  private final FieldInfos fieldInfos;
+  // per-field metadata loaded from the .vem file, keyed by field name
+  private final Map<String, FieldEntry> fields = new HashMap<>();
+  private final IndexInput vectorData;
+  private final int maxDoc;
+
+  Lucene90VectorReader(SegmentReadState state) throws IOException {
+    this.fieldInfos = state.fieldInfos;
+    this.maxDoc = state.segmentInfo.maxDoc();
+
+    // Read the metadata file (.vem) fully up front; the checksum guards the whole file.
+    String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene90VectorFormat.META_EXTENSION);
+    int versionMeta = -1;
+    try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName, state.context)) {
+      Throwable priorE = null;
+      try {
+        versionMeta = CodecUtil.checkIndexHeader(meta,
+            Lucene90VectorFormat.META_CODEC_NAME,
+            Lucene90VectorFormat.VERSION_START,
+            Lucene90VectorFormat.VERSION_CURRENT,
+            state.segmentInfo.getId(),
+            state.segmentSuffix);
+        readFields(meta, state.fieldInfos);
+      } catch (Throwable exception) {
+        priorE = exception;
+      } finally {
+        CodecUtil.checkFooter(meta, priorE);
+      }
+    }
+
+    boolean success = false;
+
+    // Open (but do not fully read) the vector data file (.vec); values are sliced lazily.
+    String vectorDataFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene90VectorFormat.VECTOR_DATA_EXTENSION);
+    this.vectorData = state.directory.openInput(vectorDataFileName, state.context);
+    try {
+      int versionVectorData = CodecUtil.checkIndexHeader(vectorData,
+          Lucene90VectorFormat.VECTOR_DATA_CODEC_NAME,
+          Lucene90VectorFormat.VERSION_START,
+          Lucene90VectorFormat.VERSION_CURRENT,
+          state.segmentInfo.getId(),
+          state.segmentSuffix);
+      if (versionMeta != versionVectorData) {
+        throw new CorruptIndexException("Format versions mismatch: meta=" + versionMeta + ", vector data=" + versionVectorData, vectorData);
+      }
+      CodecUtil.retrieveChecksum(vectorData);
+
+      success = true;
+    } finally {
+      if (!success) {
+        IOUtils.closeWhileHandlingException(this.vectorData);
+      }
+    }
+  }
+
+  /** Parses per-field entries from the meta file; the list is terminated by field number -1. */
+  private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOException {
+    for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) {
+      FieldInfo info = infos.fieldInfo(fieldNumber);
+      if (info == null) {
+        throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta);
+      }
+      int scoreFunctionId = meta.readInt();
+      if (scoreFunctionId < 0 || scoreFunctionId >= VectorValues.ScoreFunction.values().length) {
+        throw new CorruptIndexException("Invalid score function id: " + scoreFunctionId, meta);
+      }
+      VectorValues.ScoreFunction scoreFunction = VectorValues.ScoreFunction.values()[scoreFunctionId];
+      long vectorDataOffset = meta.readVLong();
+      long vectorDataLength = meta.readVLong();
+      int dimension = meta.readInt();
+      int size = meta.readInt();
+      // ord -> docId mapping, one vInt per stored vector
+      int[] ordToDoc = new int[size];
+      for (int i = 0; i < size; i++) {
+        int doc = meta.readVInt();
+        ordToDoc[i] = doc;
+      }
+      FieldEntry fieldEntry = new FieldEntry(dimension, scoreFunction, maxDoc, vectorDataOffset, vectorDataLength,
+          ordToDoc);
+      fields.put(info.name, fieldEntry);
+    }
+  }
+
+  @Override
+  public long ramBytesUsed() {
+    long totalBytes = RamUsageEstimator.shallowSizeOfInstance(Lucene90VectorReader.class);
+    totalBytes += RamUsageEstimator.sizeOfMap(fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class));
+    for (FieldEntry entry : fields.values()) {
+      totalBytes += RamUsageEstimator.sizeOf(entry.ordToDoc);
+    }
+    return totalBytes;
+  }
+
+  @Override
+  public void checkIntegrity() throws IOException {
+    CodecUtil.checksumEntireFile(vectorData);
+  }
+
+  @Override
+  public VectorValues getVectorValues(String field) throws IOException {
+    FieldInfo info = fieldInfos.fieldInfo(field);
+    if (info == null) {
+      return null;
+    }
+    int dimension = info.getVectorDimension();
+    if (dimension == 0) {
+      return VectorValues.EMPTY;
+    }
+    FieldEntry fieldEntry = fields.get(field);
+    if (fieldEntry == null) {
+      // There is a FieldInfo, but no vectors. Should we have deleted the FieldInfo?
+      return null;
+    }
+    if (dimension != fieldEntry.dimension) {
+      throw new IllegalStateException("Inconsistent vector dimension for field=\"" + field + "\"; " + dimension + " != " + fieldEntry.dimension);
+    }
+    // Sanity check: data length must equal size * dimension * sizeof(float).
+    long numBytes = (long) fieldEntry.size() * dimension * Float.BYTES;
+    if (numBytes != fieldEntry.vectorDataLength) {
+      throw new IllegalStateException("Vector data length " + fieldEntry.vectorDataLength +
+          " not matching size=" + fieldEntry.size() + " * dim=" + dimension + " * 4 = " +
+          numBytes);
+    }
+    IndexInput bytesSlice = vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength);
+    return new OffHeapVectorValues(fieldEntry, bytesSlice);
+  }
+
+  @Override
+  public void close() throws IOException {
+    vectorData.close();
+  }
+
+  /** Per-field metadata parsed from the .vem file. */
+  private static class FieldEntry {
+
+    final int dimension;
+    final VectorValues.ScoreFunction scoreFunction;
+    final int maxDoc;
+
+    final long vectorDataOffset;
+    final long vectorDataLength;
+    // maps dense vector ordinal to the docId that holds that vector
+    final int[] ordToDoc;
+
+    FieldEntry(int dimension, VectorValues.ScoreFunction scoreFunction, int maxDoc,
+               long vectorDataOffset, long vectorDataLength, int[] ordToDoc) {
+      this.dimension = dimension;
+      this.scoreFunction = scoreFunction;
+      this.maxDoc = maxDoc;
+      this.vectorDataOffset = vectorDataOffset;
+      this.vectorDataLength = vectorDataLength;
+      this.ordToDoc = ordToDoc;
+    }
+
+    int size() {
+      return ordToDoc.length;
+    }
+  }
+
+  /** Read the vector values from the index input. This supports both iterated and random access. */
+  private final static class OffHeapVectorValues extends VectorValues {
+
+    final FieldEntry fieldEntry;
+    final IndexInput dataIn;
+
+    // Scratch buffers: binaryValue and floatBuffer are views over the same byteBuffer array,
+    // so a single readBytes refreshes both representations.
+    final BytesRef binaryValue;
+    final ByteBuffer byteBuffer;
+    final FloatBuffer floatBuffer;
+    final int byteSize;
+    final float[] value;
+
+    int ord = -1;
+    int doc = -1;
+
+    OffHeapVectorValues(FieldEntry fieldEntry, IndexInput dataIn) {
+      this.fieldEntry = fieldEntry;
+      this.dataIn = dataIn;
+      byteSize = Float.BYTES * fieldEntry.dimension;
+      byteBuffer = ByteBuffer.allocate(byteSize);
+      floatBuffer = byteBuffer.asFloatBuffer();
+      value = new float[fieldEntry.dimension];
+      binaryValue = new BytesRef(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
+    }
+
+    @Override
+    public int dimension() {
+      return fieldEntry.dimension;
+    }
+
+    @Override
+    public int size() {
+      return fieldEntry.size();
+    }
+
+    @Override
+    public ScoreFunction scoreFunction() {
+      return fieldEntry.scoreFunction;
+    }
+
+    @Override
+    public float[] vectorValue() throws IOException {
+      binaryValue();
+      floatBuffer.position(0);
+      floatBuffer.get(value, 0, fieldEntry.dimension);
+      return value;
+    }
+
+    @Override
+    public BytesRef binaryValue() throws IOException {
+      // widen before multiplying: ord * byteSize can overflow int for large fields
+      dataIn.seek((long) ord * byteSize);
+      dataIn.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
+      return binaryValue;
+    }
+
+    @Override
+    public int docID() {
+      return doc;
+    }
+
+    @Override
+    public int nextDoc() {
+      if (++ord >= size()) {
+        doc = NO_MORE_DOCS;
+      } else {
+        doc = fieldEntry.ordToDoc[ord];
+      }
+      return doc;
+    }
+
+    @Override
+    public int advance(int target) throws IOException {
+      // We could do better by log-binary search in ordToDoc, but this is never used
+      return slowAdvance(target);
+    }
+
+    @Override
+    public long cost() {
+      return fieldEntry.size();
+    }
+
+    @Override
+    public RandomAccess randomAccess() {
+      // clone so random access gets an independent file pointer
+      return new OffHeapRandomAccess(dataIn.clone());
+    }
+
+
+    class OffHeapRandomAccess implements VectorValues.RandomAccess {
+
+      final IndexInput dataIn;
+
+      final BytesRef binaryValue;
+      final ByteBuffer byteBuffer;
+      final FloatBuffer floatBuffer;
+      final int byteSize;
+      final float[] value;
+
+      OffHeapRandomAccess(IndexInput dataIn) {
+        this.dataIn = dataIn;
+        byteSize = Float.BYTES * dimension();
+        byteBuffer = ByteBuffer.allocate(byteSize);
+        floatBuffer = byteBuffer.asFloatBuffer();
+        value = new float[dimension()];
+        binaryValue = new BytesRef(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
+      }
+
+      @Override
+      public int size() {
+        return fieldEntry.size();
+      }
+
+      @Override
+      public int dimension() {
+        return fieldEntry.dimension;
+      }
+
+      @Override
+      public VectorValues.ScoreFunction scoreFunction() {
+        return fieldEntry.scoreFunction;
+      }
+
+      @Override
+      public float[] vectorValue(int targetOrd) throws IOException {
+        readValue(targetOrd);
+        floatBuffer.position(0);
+        floatBuffer.get(value);
+        return value;
+      }
+
+      @Override
+      public BytesRef binaryValue(int targetOrd) throws IOException {
+        readValue(targetOrd);
+        return binaryValue;
+      }
+
+      private void readValue(int targetOrd) throws IOException {
+        // widen before multiplying: targetOrd * byteSize is evaluated in int otherwise
+        long offset = (long) targetOrd * byteSize;
+        dataIn.seek(offset);
+        dataIn.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize);
+      }
+
+      @Override
+      public TopDocs search(float[] vector, int topK, int fanout) throws IOException {
+        throw new UnsupportedOperationException();
+      }
+    }
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorWriter.java
new file mode 100644
index 000000000000..cdafb665251e
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorWriter.java
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.codecs.lucene90;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.VectorWriter;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.VectorValues;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IOUtils;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+/**
+ * Writes vector values and knn graphs to index segments.
+ * @lucene.experimental
+ */
+public final class Lucene90VectorWriter extends VectorWriter {
+
+  // .vem metadata output and .vec raw vector data output
+  private final IndexOutput meta, vectorData;
+
+  private boolean finished;
+
+  Lucene90VectorWriter(SegmentWriteState state) throws IOException {
+    assert state.fieldInfos.hasVectorValues();
+
+    String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene90VectorFormat.META_EXTENSION);
+    meta = state.directory.createOutput(metaFileName, state.context);
+
+    String vectorDataFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene90VectorFormat.VECTOR_DATA_EXTENSION);
+    vectorData = state.directory.createOutput(vectorDataFileName, state.context);
+
+    try {
+      CodecUtil.writeIndexHeader(meta,
+          Lucene90VectorFormat.META_CODEC_NAME,
+          Lucene90VectorFormat.VERSION_CURRENT,
+          state.segmentInfo.getId(), state.segmentSuffix);
+      CodecUtil.writeIndexHeader(vectorData,
+          Lucene90VectorFormat.VECTOR_DATA_CODEC_NAME,
+          Lucene90VectorFormat.VERSION_CURRENT,
+          state.segmentInfo.getId(), state.segmentSuffix);
+    } catch (IOException e) {
+      // Release both outputs, then propagate. Swallowing here would hand callers a
+      // writer whose files have no valid headers.
+      IOUtils.closeWhileHandlingException(this);
+      throw e;
+    }
+  }
+
+  @Override
+  public void writeField(FieldInfo fieldInfo, VectorValues vectors) throws IOException {
+    long vectorDataOffset = vectorData.getFilePointer();
+    // TODO - use a better data structure; a bitset? DocsWithFieldSet is p.p. in o.a.l.index
+    List<Integer> docIds = new ArrayList<>();
+    for (int docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc()) {
+      writeVectorValue(vectors);
+      docIds.add(docV);
+      // TODO: write knn graph value
+    }
+    long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
+    // fields with no vectors get no meta entry at all
+    if (vectorDataLength > 0) {
+      writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds);
+    }
+  }
+
+  /** Appends the current vector (raw float bytes) to the .vec data file. */
+  private void writeVectorValue(VectorValues vectors) throws IOException {
+    // write vector value
+    BytesRef binaryValue = vectors.binaryValue();
+    assert binaryValue.length == vectors.dimension() * Float.BYTES;
+    vectorData.writeBytes(binaryValue.bytes, binaryValue.offset, binaryValue.length);
+  }
+
+  /** Writes one field's entry to the .vem metadata file; mirrors Lucene90VectorReader#readFields. */
+  private void writeMeta(FieldInfo field, long vectorDataOffset, long vectorDataLength, List<Integer> docIds) throws IOException {
+    meta.writeInt(field.number);
+    meta.writeInt(field.getVectorScoreFunction().ordinal());
+    meta.writeVLong(vectorDataOffset);
+    meta.writeVLong(vectorDataLength);
+    meta.writeInt(field.getVectorDimension());
+    meta.writeInt(docIds.size());
+    for (Integer docId : docIds) {
+      // TODO: delta-encode, or write as bitset
+      meta.writeVInt(docId);
+    }
+  }
+
+  @Override
+  public void finish() throws IOException {
+    if (finished) {
+      throw new IllegalStateException("already finished");
+    }
+    finished = true;
+
+    if (meta != null) {
+      // write end of fields marker
+      meta.writeInt(-1);
+      CodecUtil.writeFooter(meta);
+    }
+    if (vectorData != null) {
+      CodecUtil.writeFooter(vectorData);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    IOUtils.close(meta, vectorData);
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java
new file mode 100644
index 000000000000..c6722c854a5f
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java
@@ -0,0 +1,429 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Lucene 9.0 file format.
+ *
+ * Apache Lucene - Index File Formats
+ *
+ *
+ * Introduction
+ *
+ *
+ * <p>This document defines the index file formats used in this version of Lucene.
+ * If you are using a different version of Lucene, please consult the copy of
+ * docs/
+ * that was distributed with
+ * the version you are using.
+ *
+ * <p>This document attempts to provide a high-level definition of the Apache
+ * Lucene file formats.
+ *
+ *
+ * Definitions
+ *
+ *
+ * <p>The fundamental concepts in Lucene are index, document, field and term.
+ *
+ * <p>An index contains a sequence of documents.
+ *
+ * - A document is a sequence of fields.
+ * - A field is a named sequence of terms.
+ * - A term is a sequence of bytes.
+ *
+ *
+ * <p>The same sequence of bytes in two different fields is considered a different
+ * term. Thus terms are represented as a pair: the string naming the field, and the
+ * bytes within the field.
+ *
+ *
Inverted Indexing
+ *
Lucene's index stores terms and statistics about those terms in order to make
+ * term-based search more efficient. Lucene's terms index falls into the family of indexes known as
+ * an inverted index. This is because it can list, for a term, the documents that contain
+ * it. This is the inverse of the natural relationship, in which documents list terms.
+ *
+ *
Types of Fields
+ *
In Lucene, fields may be stored, in which case their text is stored
+ * in the index literally, in a non-inverted manner. Fields that are inverted are
+ * called indexed. A field may be both stored and indexed.
+ *
The text of a field may be tokenized into terms to be indexed, or the
+ * text of a field may be used literally as a term to be indexed. Most fields are
+ * tokenized, but sometimes it is useful for certain identifier fields to be
+ * indexed literally.
+ *
See the {@link org.apache.lucene.document.Field Field}
+ * java docs for more information on Fields.
+ *
+ *
Segments
+ *
Lucene indexes may be composed of multiple sub-indexes, or segments.
+ * Each segment is a fully independent index, which could be searched separately.
+ * Indexes evolve by:
+ *
+ * - Creating new segments for newly added documents.
+ * - Merging existing segments.
+ *
+ *
Searches may involve multiple segments and/or multiple indexes, each index
+ * potentially composed of a set of segments.
+ *
+ *
Document Numbers
+ *
Internally, Lucene refers to documents by an integer document number.
+ * The first document added to an index is numbered zero, and each subsequent
+ * document added gets a number one greater than the previous.
+ *
Note that a document's number may change, so caution should be taken when
+ * storing these numbers outside of Lucene. In particular, numbers may change in
+ * the following situations:
+ *
+ * -
+ *
The numbers stored in each segment are unique only within the segment, and
+ * must be converted before they can be used in a larger context. The standard
+ * technique is to allocate each segment a range of values, based on the range of
+ * numbers used in that segment. To convert a document number from a segment to an
+ * external value, the segment's base document number is added. To convert
+ * an external value back to a segment-specific value, the segment is identified
+ * by the range that the external value is in, and the segment's base value is
+ * subtracted. For example two five document segments might be combined, so that
+ * the first segment has a base value of zero, and the second of five. Document
+ * three from the second segment would have an external value of eight.
+ *
+ * -
+ *
When documents are deleted, gaps are created in the numbering. These are
+ * eventually removed as the index evolves through merging. Deleted documents are
+ * dropped when segments are merged. A freshly-merged segment thus has no gaps in
+ * its numbering.
+ *
+ *
+ *
+ *
+ * Index Structure Overview
+ *
+ *
Each segment index maintains the following:
+ *
+ * -
+ * {@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment info}.
+ * This contains metadata about a segment, such as the number of documents,
+ * what files it uses, and information about how the segment is sorted
+ *
+ * -
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Field names}.
+ * This contains metadata about the set of named fields used in the index.
+ *
+ * -
+ * {@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Stored Field values}.
+ * This contains, for each document, a list of attribute-value pairs, where the attributes
+ * are field names. These are used to store auxiliary information about the document, such as
+ * its title, url, or an identifier to access a database. The set of stored fields are what is
+ * returned for each hit when searching. This is keyed by document number.
+ *
+ * -
+ * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term dictionary}.
+ * A dictionary containing all of the terms used in all of the
+ * indexed fields of all of the documents. The dictionary also contains the number
+ * of documents which contain the term, and pointers to the term's frequency and
+ * proximity data.
+ *
+ * -
+ * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Frequency data}.
+ * For each term in the dictionary, the numbers of all the
+ * documents that contain that term, and the frequency of the term in that
+ * document, unless frequencies are omitted ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS})
+ *
+ * -
+ * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Proximity data}.
+ * For each term in the dictionary, the positions that the
+ * term occurs in each document. Note that this will not exist if all fields in
+ * all documents omit position data.
+ *
+ * -
+ * {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Normalization factors}.
+ * For each field in each document, a value is stored
+ * that is multiplied into the score for hits on that field.
+ *
+ * -
+ * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}.
+ * For each field in each document, the term vector (sometimes
+ * called document vector) may be stored. A term vector consists of term text and
+ * term frequency. To add Term Vectors to your index see the
+ * {@link org.apache.lucene.document.Field Field} constructors
+ *
+ * -
+ * {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-document values}.
+ * Like stored values, these are also keyed by document
+ * number, but are generally intended to be loaded into main memory for fast
+ * access. Whereas stored values are generally intended for summary results from
+ * searches, per-document values are useful for things like scoring factors.
+ *
+ * -
+ * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}.
+ * An optional file indicating which documents are live.
+ *
+ * -
+ * {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}.
+ * Optional pair of files, recording dimensionally indexed fields, to enable fast
+ * numeric range filtering and large numeric values like BigInteger and BigDecimal (1D)
+ * and geographic shape intersection (2D, 3D).
+ *
+ * -
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90VectorFormat Vector values}.
+ * The vector format stores numeric vectors in a format optimized for random access and computation,
+ * supporting high-dimensional nearest-neighbor search.
+ *
+ *
+ *
Details on each of these are provided in their linked pages.
+ *
+ *
+ * File Naming
+ *
+ *
All files belonging to a segment have the same name with varying extensions.
+ * The extensions correspond to the different file formats described below. When
+ * using the Compound File format (default for small segments) these files (except
+ * for the Segment info file, the Lock file, and Deleted documents file) are collapsed
+ * into a single .cfs file (see below for details)
+ *
Typically, all segments in an index are stored in a single directory,
+ * although this is not required.
+ *
File names are never re-used. That is, when any file is saved
+ * to the Directory it is given a never before used filename. This is achieved
+ * using a simple generations approach. For example, the first segments file is
+ * segments_1, then segments_2, etc. The generation is a sequential long integer
+ * represented in alpha-numeric (base 36) form.
+ *
+ *
+ * Summary of File Extensions
+ *
+ *
The following table summarizes the names and extensions of the files in
+ * Lucene:
+ *
+ * lucene filenames by extension
+ *
+ * Name |
+ * Extension |
+ * Brief Description |
+ *
+ *
+ * {@link org.apache.lucene.index.SegmentInfos Segments File} |
+ * segments_N |
+ * Stores information about a commit point |
+ *
+ *
+ * Lock File |
+ * write.lock |
+ * The Write lock prevents multiple IndexWriters from writing to the same
+ * file. |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment Info} |
+ * .si |
+ * Stores metadata about a segment |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File} |
+ * .cfs, .cfe |
+ * An optional "virtual" file consisting of all the other index files for
+ * systems that frequently run out of file handles. |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Fields} |
+ * .fnm |
+ * Stores information about the fields |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Index} |
+ * .fdx |
+ * Contains pointers to field data |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Data} |
+ * .fdt |
+ * The stored fields for documents |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Dictionary} |
+ * .tim |
+ * The term dictionary, stores term info |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Index} |
+ * .tip |
+ * The index into the Term Dictionary |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Frequencies} |
+ * .doc |
+ * Contains the list of docs which contain each term along with frequency |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Positions} |
+ * .pos |
+ * Stores position information about where a term occurs in the index |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Payloads} |
+ * .pay |
+ * Stores additional per-position metadata information such as character offsets and user payloads |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Norms} |
+ * .nvd, .nvm |
+ * Encodes length and boost factors for docs and fields |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values} |
+ * .dvd, .dvm |
+ * Encodes additional scoring factors or other per-document information. |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index} |
+ * .tvx |
+ * Stores offset into the document data file |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data} |
+ * .tvd |
+ * Contains term vector data. |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents} |
+ * .liv |
+ * Info about what documents are live |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values} |
+ * .dii, .dim |
+ * Holds indexed points |
+ *
+ *
+ * {@link org.apache.lucene.codecs.lucene90.Lucene90VectorFormat Vector values} |
+ * .vec, .vem |
+ * Holds indexed vectors; .vec files contain the raw vector data, and
+ * .vem the vector metadata |
+ *
+ *
+ *
+ *
+ * Lock File
+ * The write lock, which is stored in the index directory by default, is named
+ * "write.lock". If the lock directory is different from the index directory then
+ * the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix
+ * derived from the full path to the index directory. When this file is present, a
+ * writer is currently modifying the index (adding or removing documents). This
+ * lock file ensures that only one writer is modifying the index at a time.
+ *
+ * History
+ * Compatibility notes are provided in this document, describing how file
+ * formats have changed from prior versions:
+ *
+ * - In version 2.1, the file format was changed to allow lock-less commits (ie,
+ * no more commit lock). The change is fully backwards compatible: you can open a
+ * pre-2.1 index for searching or adding/deleting of docs. When the new segments
+ * file is saved (committed), it will be written in the new file format (meaning
+ * no specific "upgrade" process is needed). But note that once a commit has
+ * occurred, pre-2.1 Lucene will not be able to read the index.
+ * - In version 2.3, the file format was changed to allow segments to share a
+ * single set of doc store (vectors & stored fields) files. This allows for
+ * faster indexing in certain cases. The change is fully backwards compatible (in
+ * the same way as the lock-less commits change in 2.1).
+ * - In version 2.4, Strings are now written as true UTF-8 byte sequence, not
+ * Java's modified UTF-8. See
+ * LUCENE-510 for details.
+ * - In version 2.9, an optional opaque Map<String,String> CommitUserData
+ * may be passed to IndexWriter's commit methods (and later retrieved), which is
+ * recorded in the segments_N file. See
+ * LUCENE-1382 for details. Also,
+ * diagnostics were added to each segment written recording details about why it
+ * was written (due to flush, merge; which OS/JRE was used; etc.). See issue
+ * LUCENE-1654 for details.
+ * - In version 3.0, compressed fields are no longer written to the index (they
+ * can still be read, but on merge the new segment will write them, uncompressed).
+ * See issue LUCENE-1960
+ * for details.
+ * - In version 3.1, segments records the code version that created them. See
+ * LUCENE-2720 for details.
+ * Additionally segments track explicitly whether or not they have term vectors.
+ * See LUCENE-2811
+ * for details.
+ * - In version 3.2, numeric fields are written as natively to stored fields
+ * file, previously they were stored in text format only.
+ * - In version 3.4, fields can omit position data while still indexing term
+ * frequencies.
+ * - In version 4.0, the format of the inverted index became extensible via
+ * the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage
+ * ({@code DocValues}) was introduced. Normalization factors need no longer be a
+ * single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}.
+ * Terms need not be unicode strings, they can be any byte sequence. Term offsets
+ * can optionally be indexed into the postings lists. Payloads can be stored in the
+ * term vectors.
+ * - In version 4.1, the format of the postings list changed to use either
+ * of FOR compression or variable-byte encoding, depending upon the frequency
+ * of the term. Terms appearing only once were changed to inline directly into
+ * the term dictionary. Stored fields are compressed by default.
+ * - In version 4.2, term vectors are compressed by default. DocValues has
+ * a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining
+ * on multi-valued fields.
+ * - In version 4.5, DocValues were extended to explicitly represent missing values.
+ * - In version 4.6, FieldInfos were extended to support per-field DocValues generation, to
+ * allow updating NumericDocValues fields.
+ * - In version 4.8, checksum footers were added to the end of each index file
+ * for improved data integrity. Specifically, the last 8 bytes of every index file
+ * contain the zlib-crc32 checksum of the file.
+ * - In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric)
+ * that is suitable for faceting/sorting/analytics.
+ * - In version 5.4, DocValues have been improved to store more information on disk:
+ * addresses for binary fields and ord indexes for multi-valued fields.
+ * - In version 6.0, Points were added, for multi-dimensional range/distance search.
+ * - In version 6.2, new Segment info format that reads/writes the index sort, to support index sorting.
+ *
+ * - In version 7.0, DocValues have been improved to better support sparse doc values
+ * thanks to an iterator API.
+ * - In version 8.0, postings have been enhanced to record, for each block of
+ * doc ids, the (term freq, normalization factor) pairs that may trigger the
+ * maximum score of the block. This information is recorded alongside skip data
+ * in order to be able to skip blocks of doc ids if they may not produce high
+ * enough scores.
+ * Additionally doc values and norms has been extended with jump-tables to make access O(1)
+ * instead of O(n), where n is the number of elements to skip when advancing in the data.
+ * - In version 8.4, postings, positions, offsets and payload lengths have move to a more
+ * performant encoding that is vectorized.
+ * - In version 8.6, index sort serialization is delegated to the sorts themselves, to
+ * allow user-defined sorts to be used
+ * - In version 8.7, stored fields compression became adaptive to better handle documents with
+ * smaller stored fields.
+ * - In version 9.0, vector-valued fields were added.
+ *
+ *
+ * Limitations
+ *
+ *
+ * <p>Lucene uses a Java <code>int</code>
+ * to refer to
+ * document numbers, and the index file format uses an Int32
+ * on-disk to store document numbers. This is a limitation
+ * of both the index file format and the current implementation. Eventually these
+ * should be replaced with either <code>UInt64</code>
+ * values, or
+ * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.
+ *
+ */
+package org.apache.lucene.codecs.lucene90;
diff --git a/lucene/core/src/java/org/apache/lucene/document/FieldType.java b/lucene/core/src/java/org/apache/lucene/document/FieldType.java
index 82a6454a525a..21d9edeb77ae 100644
--- a/lucene/core/src/java/org/apache/lucene/document/FieldType.java
+++ b/lucene/core/src/java/org/apache/lucene/document/FieldType.java
@@ -25,6 +25,7 @@
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableFieldType;
import org.apache.lucene.index.PointValues;
+import org.apache.lucene.index.VectorValues;
/**
* Describes the properties of a field.
@@ -44,6 +45,8 @@ public class FieldType implements IndexableFieldType {
private int dimensionCount;
private int indexDimensionCount;
private int dimensionNumBytes;
+ private int vectorDimension;
+ private VectorValues.ScoreFunction vectorScoreFunction = VectorValues.ScoreFunction.NONE;
private Map attributes;
/**
@@ -62,6 +65,8 @@ public FieldType(IndexableFieldType ref) {
this.dimensionCount = ref.pointDimensionCount();
this.indexDimensionCount = ref.pointIndexDimensionCount();
this.dimensionNumBytes = ref.pointNumBytes();
+ this.vectorDimension = ref.vectorDimension();
+ this.vectorScoreFunction = ref.vectorScoreFunction();
if (ref.getAttributes() != null) {
this.attributes = new HashMap<>(ref.getAttributes());
}
@@ -295,6 +300,7 @@ public void setDimensions(int dimensionCount, int dimensionNumBytes) {
* Enables points indexing with selectable dimension indexing.
*/
public void setDimensions(int dimensionCount, int indexDimensionCount, int dimensionNumBytes) {
+ checkIfFrozen();
if (dimensionCount < 0) {
throw new IllegalArgumentException("dimensionCount must be >= 0; got " + dimensionCount);
}
@@ -351,6 +357,28 @@ public int pointNumBytes() {
return dimensionNumBytes;
}
+ void setVectorDimensionsAndScoreFunction(int numDimensions, VectorValues.ScoreFunction distFunc) {
+ checkIfFrozen(); // a frozen FieldType must never be mutated
+ if (numDimensions <= 0) {
+ throw new IllegalArgumentException("vector numDimensions must be > 0; got " + numDimensions);
+ }
+ if (numDimensions > VectorValues.MAX_DIMENSIONS) {
+ throw new IllegalArgumentException("vector numDimensions must be <= VectorValues.MAX_DIMENSIONS (=" + VectorValues.MAX_DIMENSIONS + "); got " + numDimensions);
+ }
+ this.vectorDimension = numDimensions;
+ this.vectorScoreFunction = distFunc; // no null check here; VectorField.getType rejects a null score function
+ }
+
+ @Override
+ public int vectorDimension() {
+ return vectorDimension; // 0 means this field does not index vectors
+ }
+
+ @Override
+ public VectorValues.ScoreFunction vectorScoreFunction() {
+ return vectorScoreFunction; // defaults to ScoreFunction.NONE
+ }
+
/**
* Puts an attribute value.
*
diff --git a/lucene/core/src/java/org/apache/lucene/document/VectorField.java b/lucene/core/src/java/org/apache/lucene/document/VectorField.java
new file mode 100644
index 000000000000..24d9bd8ddafc
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/document/VectorField.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.document;
+
+import org.apache.lucene.index.VectorValues;
+
+/** A field that contains a single floating-point numeric vector (or none) for each document.
+ * Vectors are dense - that is, every dimension of a vector contains an explicit value, stored
+ * packed into an array (of type float[]) whose length is the vector dimension. Values can be
+ * retrieved using {@link VectorValues}, which is a forward-only docID-based iterator and also
+ * offers random-access by dense ordinal (not docId). VectorValues.ScoreFunctions may be
+ * used to compare vectors at query time (for example as part of result ranking). A VectorField may
+ * be associated with a score function that defines the metric used for nearest-neighbor search
+ * among vectors of that field, but at the moment this association is purely nominal: it is intended
+ * for future use by the to-be-implemented nearest neighbors search.
+ */
+public class VectorField extends Field {
+
+ private static FieldType getType(float[] v, VectorValues.ScoreFunction scoreFunction) {
+ if (v == null) {
+ throw new IllegalArgumentException("vector value must not be null");
+ }
+ int dimension = v.length;
+ if (dimension == 0) {
+ throw new IllegalArgumentException("cannot index an empty vector");
+ }
+ if (dimension > VectorValues.MAX_DIMENSIONS) {
+ throw new IllegalArgumentException("cannot index vectors with dimension greater than " + VectorValues.MAX_DIMENSIONS);
+ }
+ if (scoreFunction == null) {
+ throw new IllegalArgumentException("score function must not be null");
+ }
+ FieldType type = new FieldType();
+ type.setVectorDimensionsAndScoreFunction(dimension, scoreFunction);
+ type.freeze(); // freeze so the type cannot be mutated after the field is built
+ return type;
+ }
+
+ /** Creates a numeric vector field. Fields are single-valued: each document has either one value
+ * or no value. Vectors of a single field share the same dimension and score function.
+ *
+ * @param name field name
+ * @param vector value
+ * @param scoreFunction a function defining vector proximity.
+ * @throws IllegalArgumentException if any parameter is null, or the vector is empty or has dimension > 1024.
+ */
+ public VectorField(String name, float[] vector, VectorValues.ScoreFunction scoreFunction) {
+ super(name, getType(vector, scoreFunction));
+ fieldsData = vector; // stored by reference, not copied -- callers should not mutate afterwards (TODO confirm intended)
+ }
+
+ /** Creates a numeric vector field with the default EUCLIDEAN (L2) score function. Fields are
+ * single-valued: each document has either one value or no value. Vectors of a single field share
+ * the same dimension and score function.
+ *
+ * @param name field name
+ * @param vector value
+ * @throws IllegalArgumentException if any parameter is null, or the vector is empty or has dimension > 1024.
+ */
+ public VectorField(String name, float[] vector) {
+ this(name, vector, VectorValues.ScoreFunction.EUCLIDEAN);
+ }
+
+ /**
+ * Return the vector value of this field
+ */
+ public float[] vectorValue() {
+ return (float[]) fieldsData; // returns the internal array, not a copy
+ }
+
+ /**
+ * Set the vector value of this field
+ * @param value the value to set; must not be null, and length must match the field type
+ */
+ public void setVectorValue(float[] value) {
+ if (value == null) {
+ throw new IllegalArgumentException("value must not be null");
+ }
+ if (value.length != type.vectorDimension()) {
+ throw new IllegalArgumentException("value length " + value.length + " must match field dimension " + type.vectorDimension());
+ }
+ fieldsData = value;
+ }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
index 3b1f533bc477..aff5a1fc46fb 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java
@@ -220,6 +220,9 @@ public static class SegmentInfoStatus {
/** Status of index sort */
public IndexSortStatus indexSortStatus;
+
+ /** Status of vectors */
+ public VectorValuesStatus vectorValuesStatus;
}
/**
@@ -374,7 +377,25 @@ public static final class PointsStatus {
/** Total number of fields with points. */
public int totalValueFields;
- /** Exception thrown during doc values test (null on success) */
+ /** Exception thrown during point values test (null on success) */
+ public Throwable error = null;
+ }
+
+ /**
+ * Status from testing VectorValues
+ */
+ public static final class VectorValuesStatus {
+
+ VectorValuesStatus() { // package-private: instances are created by CheckIndex.testVectors
+ }
+
+ /** Total number of vector values tested. */
+ public long totalVectorValues;
+
+ /** Total number of fields with vectors. */
+ public int totalVectorFields;
+
+ /** Exception thrown during vector values test (null on success) */
+ public Throwable error = null;
+ }
@@ -731,6 +752,9 @@ public Status checkIndex(List onlySegments) throws IOException {
// Test PointValues
segInfoStat.pointsStatus = testPoints(reader, infoStream, failFast);
+ // Test VectorValues
+ segInfoStat.vectorValuesStatus = testVectors(reader, infoStream, failFast);
+
// Test index sort
segInfoStat.indexSortStatus = testSort(reader, indexSort, infoStream, failFast);
@@ -1955,6 +1979,65 @@ public static Status.PointsStatus testPoints(CodecReader reader, PrintStream inf
return status;
}
+ /**
+ * Test the vectors index
+ * @lucene.experimental
+ */
+ public static Status.VectorValuesStatus testVectors(CodecReader reader, PrintStream infoStream, boolean failFast) throws IOException {
+ if (infoStream != null) {
+ infoStream.print(" test: vectors..............");
+ }
+ long startNS = System.nanoTime();
+ FieldInfos fieldInfos = reader.getFieldInfos();
+ Status.VectorValuesStatus status = new Status.VectorValuesStatus();
+ try {
+
+ if (fieldInfos.hasVectorValues()) {
+ for (FieldInfo fieldInfo : fieldInfos) {
+ if (fieldInfo.hasVectorValues()) {
+ int dimension = fieldInfo.getVectorDimension();
+ if (dimension <= 0) {
+ throw new RuntimeException("Field \"" + fieldInfo.name + "\" has vector values but dimension is " + dimension);
+ }
+ VectorValues values = reader.getVectorValues(fieldInfo.name);
+ if (values == null) {
+ continue; // FieldInfo claims vectors but the reader has none; tolerated here -- NOTE(review): confirm this is legal
+ }
+
+ status.totalVectorFields++;
+
+ int docCount = 0; // docs with a vector value; must equal values.size() below
+ while (values.nextDoc() != NO_MORE_DOCS) {
+ int valueLength = values.vectorValue().length;
+ if (valueLength != dimension) {
+ throw new RuntimeException("Field \"" + fieldInfo.name + "\" has a value whose dimension=" + valueLength + " not matching the field's dimension=" + dimension);
+ }
+ ++docCount;
+ }
+ if (docCount != values.size()) {
+ throw new RuntimeException("Field \"" + fieldInfo.name + "\" has size=" + values.size() + " but when iterated, returns " + docCount + " docs with values");
+ }
+ status.totalVectorValues += docCount;
+ }
+ }
+ }
+
+ msg(infoStream, String.format(Locale.ROOT, "OK [%d fields, %d vectors] [took %.3f sec]", status.totalVectorFields, status.totalVectorValues, nsToSec(System.nanoTime()-startNS)));
+
+ } catch (Throwable e) {
+ if (failFast) {
+ throw IOUtils.rethrowAlways(e);
+ }
+ msg(infoStream, "ERROR: " + e);
+ status.error = e;
+ if (infoStream != null) {
+ e.printStackTrace(infoStream);
+ }
+ }
+
+ return status;
+ }
+
/** Walks the entire N-dimensional points space, verifying that all points fall within the last cell's boundaries.
*
* @lucene.internal */
diff --git a/lucene/core/src/java/org/apache/lucene/index/CodecReader.java b/lucene/core/src/java/org/apache/lucene/index/CodecReader.java
index 4459ab11edd6..ab3b93f80a9d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/CodecReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/CodecReader.java
@@ -26,6 +26,7 @@
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@@ -77,6 +78,12 @@ protected CodecReader() {}
* @lucene.internal
*/
public abstract PointsReader getPointsReader();
+
+ /**
+ * Expert: retrieve underlying VectorReader
+ * @lucene.internal
+ */
+ public abstract VectorReader getVectorReader();
@Override
public final void document(int docID, StoredFieldVisitor visitor) throws IOException {
@@ -202,6 +209,18 @@ public final PointValues getPointValues(String field) throws IOException {
return getPointsReader().getValues(field);
}
+ @Override
+ public final VectorValues getVectorValues(String field) throws IOException {
+ ensureOpen();
+ FieldInfo fi = getFieldInfos().fieldInfo(field);
+ if (fi == null || fi.getVectorDimension() == 0) {
+ // Field does not exist or does not index vectors (dimension == 0 is the "no vectors" marker)
+ return null;
+ }
+
+ return getVectorReader().getVectorValues(field);
+ }
+
@Override
protected void doClose() throws IOException {
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java
index 93b7f4988d68..f7f79e0b9e7d 100644
--- a/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java
@@ -47,6 +47,11 @@ public final PointValues getPointValues(String field) throws IOException {
throw new UnsupportedOperationException();
}
+ @Override
+ public final VectorValues getVectorValues(String field) throws IOException {
+ throw new UnsupportedOperationException(); // like getPointValues above: this doc-values-only view exposes nothing else
+ }
+
@Override
public final void checkIntegrity() throws IOException {
throw new UnsupportedOperationException();
diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java
index b8fe341fc6bb..4a3463c58fcc 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java
@@ -54,6 +54,9 @@ public final class FieldInfo {
private int pointIndexDimensionCount;
private int pointNumBytes;
+ private int vectorDimension; // if it is a positive value, it means this field indexes vectors
+ private VectorValues.ScoreFunction vectorScoreFunction = VectorValues.ScoreFunction.NONE;
+
// whether this field is used as the soft-deletes field
private final boolean softDeletesField;
@@ -64,7 +67,8 @@ public final class FieldInfo {
*/
public FieldInfo(String name, int number, boolean storeTermVector, boolean omitNorms, boolean storePayloads,
IndexOptions indexOptions, DocValuesType docValues, long dvGen, Map attributes,
- int pointDimensionCount, int pointIndexDimensionCount, int pointNumBytes, boolean softDeletesField) {
+ int pointDimensionCount, int pointIndexDimensionCount, int pointNumBytes,
+ int vectorDimension, VectorValues.ScoreFunction vectorScoreFunction, boolean softDeletesField) {
this.name = Objects.requireNonNull(name);
this.number = number;
this.docValuesType = Objects.requireNonNull(docValues, "DocValuesType must not be null (field: \"" + name + "\")");
@@ -83,6 +87,8 @@ public FieldInfo(String name, int number, boolean storeTermVector, boolean omitN
this.pointDimensionCount = pointDimensionCount;
this.pointIndexDimensionCount = pointIndexDimensionCount;
this.pointNumBytes = pointNumBytes;
+ this.vectorDimension = vectorDimension;
+ this.vectorScoreFunction = vectorScoreFunction;
this.softDeletesField = softDeletesField;
this.checkConsistency();
}
@@ -137,6 +143,14 @@ public boolean checkConsistency() {
throw new IllegalStateException("field '" + name + "' cannot have a docvalues update generation without having docvalues");
}
+ if (vectorDimension < 0) {
+ throw new IllegalStateException("vectorDimension must be >=0; got " + vectorDimension);
+ }
+
+ if (vectorDimension == 0 && vectorScoreFunction != VectorValues.ScoreFunction.NONE) {
+ throw new IllegalStateException("vector score function must be NONE when dimension = 0; got " + vectorScoreFunction);
+ }
+
return true;
}
@@ -232,6 +246,40 @@ public int getPointNumBytes() {
return pointNumBytes;
}
+ /** Record that this field is indexed with vectors, with the specified number of dimensions and score function */
+ public void setVectorDimensionAndScoreFunction(int dimension, VectorValues.ScoreFunction scoreFunction) {
+ if (dimension < 0) {
+ throw new IllegalArgumentException("vector dimension must be >= 0; got " + dimension);
+ }
+ if (dimension > VectorValues.MAX_DIMENSIONS) {
+ throw new IllegalArgumentException("vector dimension must be <= VectorValues.MAX_DIMENSIONS (=" + VectorValues.MAX_DIMENSIONS + "); got " + dimension);
+ }
+ if (dimension == 0 && scoreFunction != VectorValues.ScoreFunction.NONE) {
+ throw new IllegalArgumentException("vector score function must be NONE when the vector dimension = 0; got " + scoreFunction);
+ }
+ if (vectorDimension != 0 && vectorDimension != dimension) {
+ throw new IllegalArgumentException("cannot change vector dimension from " + vectorDimension + " to " + dimension + " for field=\"" + name + "\"");
+ }
+ if (vectorScoreFunction != VectorValues.ScoreFunction.NONE && vectorScoreFunction != scoreFunction) {
+ throw new IllegalArgumentException("cannot change vector score function from " + vectorScoreFunction + " to " + scoreFunction + " for field=\"" + name + "\"");
+ }
+
+ this.vectorDimension = dimension;
+ this.vectorScoreFunction = scoreFunction;
+
+ assert checkConsistency();
+ }
+
+ /** Returns the number of dimensions of the vector value */
+ public int getVectorDimension() {
+ return vectorDimension;
+ }
+
+ /** Returns {@link org.apache.lucene.index.VectorValues.ScoreFunction} for the field */
+ public VectorValues.ScoreFunction getVectorScoreFunction() {
+ return vectorScoreFunction;
+ }
+
/** Record that this field is indexed with docvalues, with the specified type */
public void setDocValuesType(DocValuesType type) {
if (type == null) {
@@ -336,6 +384,13 @@ public boolean hasPayloads() {
public boolean hasVectors() {
return storeTermVector;
}
+
+ /**
+ * Returns whether any (numeric) vector values exist for this field
+ */
+ public boolean hasVectorValues() {
+ return vectorDimension > 0;
+ }
/**
* Get a codec attribute value, or null if it does not exist
diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
index 4b266d008a22..8d8ff15ee5ad 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java
@@ -48,6 +48,7 @@ public class FieldInfos implements Iterable {
private final boolean hasNorms;
private final boolean hasDocValues;
private final boolean hasPointValues;
+ private final boolean hasVectorValues;
private final String softDeletesField;
// used only by fieldInfo(int)
@@ -68,6 +69,7 @@ public FieldInfos(FieldInfo[] infos) {
boolean hasNorms = false;
boolean hasDocValues = false;
boolean hasPointValues = false;
+ boolean hasVectorValues = false;
String softDeletesField = null;
int size = 0; // number of elements in byNumberTemp, number of used array slots
@@ -99,6 +101,7 @@ public FieldInfos(FieldInfo[] infos) {
hasDocValues |= info.getDocValuesType() != DocValuesType.NONE;
hasPayloads |= info.hasPayloads();
hasPointValues |= (info.getPointDimensionCount() != 0);
+ hasVectorValues |= (info.getVectorDimension() != 0);
if (info.isSoftDeletesField()) {
if (softDeletesField != null && softDeletesField.equals(info.name) == false) {
throw new IllegalArgumentException("multiple soft-deletes fields [" + info.name + ", " + softDeletesField + "]");
@@ -115,6 +118,7 @@ public FieldInfos(FieldInfo[] infos) {
this.hasNorms = hasNorms;
this.hasDocValues = hasDocValues;
this.hasPointValues = hasPointValues;
+ this.hasVectorValues = hasVectorValues;
this.softDeletesField = softDeletesField;
List valuesTemp = new ArrayList<>();
@@ -204,6 +208,11 @@ public boolean hasPointValues() {
return hasPointValues;
}
+ /** Returns true if any fields have VectorValues */
+ public boolean hasVectorValues() {
+ return hasVectorValues;
+ }
+
/** Returns the soft-deletes field name if exists; otherwise returns null */
public String getSoftDeletesField() {
return softDeletesField;
@@ -261,6 +270,16 @@ public FieldDimensions(int dimensionCount, int indexDimensionCount, int dimensio
this.dimensionNumBytes = dimensionNumBytes;
}
}
+
+ static final class FieldVectorProperties {
+ final int numDimensions;
+ final VectorValues.ScoreFunction scoreFunction;
+
+ FieldVectorProperties(int numDimensions, VectorValues.ScoreFunction scoreFunction) {
+ this.numDimensions = numDimensions;
+ this.scoreFunction = scoreFunction;
+ }
+ }
static final class FieldNumbers {
@@ -274,6 +293,8 @@ static final class FieldNumbers {
private final Map dimensions;
+ private final Map vectorProps;
+
// TODO: we should similarly catch an attempt to turn
// norms back on after they were already committed; today
// we silently discard the norm but this is badly trappy
@@ -288,6 +309,7 @@ static final class FieldNumbers {
this.indexOptions = new HashMap<>();
this.docValuesType = new HashMap<>();
this.dimensions = new HashMap<>();
+ this.vectorProps = new HashMap<>();
this.softDeletesFieldName = softDeletesFieldName;
}
@@ -297,7 +319,7 @@ static final class FieldNumbers {
* number assigned if possible otherwise the first unassigned field number
* is used as the field number.
*/
- synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int indexDimensionCount, int dimensionNumBytes, boolean isSoftDeletesField) {
+ synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int indexDimensionCount, int dimensionNumBytes, int vectorDimension, VectorValues.ScoreFunction scoreFunction, boolean isSoftDeletesField) {
if (indexOptions != IndexOptions.NONE) {
IndexOptions currentOpts = this.indexOptions.get(fieldName);
if (currentOpts == null) {
@@ -330,6 +352,19 @@ synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptio
dimensions.put(fieldName, new FieldDimensions(dimensionCount, indexDimensionCount, dimensionNumBytes));
}
}
+ if (vectorDimension != 0) {
+ FieldVectorProperties props = vectorProps.get(fieldName);
+ if (props != null) {
+ if (props.numDimensions != vectorDimension) {
+ throw new IllegalArgumentException("cannot change vector dimension from " + props.numDimensions + " to " + vectorDimension + " for field=\"" + fieldName + "\"");
+ }
+ if (props.scoreFunction != scoreFunction) {
+ throw new IllegalArgumentException("cannot change vector score function from " + props.scoreFunction + " to " + scoreFunction + " for field=\"" + fieldName + "\"");
+ }
+ } else {
+ vectorProps.put(fieldName, new FieldVectorProperties(vectorDimension, scoreFunction));
+ }
+ }
Integer fieldNumber = nameToNumber.get(fieldName);
if (fieldNumber == null) {
final Integer preferredBoxed = Integer.valueOf(preferredFieldNumber);
@@ -408,6 +443,24 @@ synchronized void verifyConsistentDimensions(Integer number, String name, int da
}
}
+ synchronized void verifyConsistentVectorProperties(Integer number, String name, int numDimensions, VectorValues.ScoreFunction scoreFunction) {
+ if (name.equals(numberToName.get(number)) == false) {
+ throw new IllegalArgumentException("field number " + number + " is already mapped to field name \"" + numberToName.get(number) + "\", not \"" + name + "\"");
+ }
+ if (number.equals(nameToNumber.get(name)) == false) {
+ throw new IllegalArgumentException("field name \"" + name + "\" is already mapped to field number \"" + nameToNumber.get(name) + "\", not \"" + number + "\"");
+ }
+ FieldVectorProperties props = vectorProps.get(name);
+ if (props != null) {
+ if (props.numDimensions != numDimensions) {
+ throw new IllegalArgumentException("cannot change vector dimension from " + props.numDimensions + " to " + numDimensions + " for field=\"" + name + "\"");
+ }
+ if (props.scoreFunction != scoreFunction) {
+ throw new IllegalArgumentException("cannot change vector score function from " + props.scoreFunction + " to " + scoreFunction + " for field=\"" + name + "\"");
+ }
+ }
+ }
+
/**
* Returns true if the {@code fieldName} exists in the map and is of the
* same {@code dvType}.
@@ -456,6 +509,17 @@ synchronized void setDimensions(int number, String name, int dimensionCount, int
verifyConsistentDimensions(number, name, dimensionCount, indexDimensionCount, dimensionNumBytes);
dimensions.put(name, new FieldDimensions(dimensionCount, indexDimensionCount, dimensionNumBytes));
}
+
+ synchronized void setVectorDimensionsAndScoreFunction(int number, String name, int numDimensions, VectorValues.ScoreFunction scoreFunction) {
+ if (numDimensions <= 0) {
+ throw new IllegalArgumentException("vector numDimensions must be > 0; got " + numDimensions);
+ }
+ if (numDimensions > VectorValues.MAX_DIMENSIONS) {
+ throw new IllegalArgumentException("vector numDimensions must be <= VectorValues.MAX_DIMENSIONS (=" + VectorValues.MAX_DIMENSIONS + "); got " + numDimensions);
+ }
+ verifyConsistentVectorProperties(number, name, numDimensions, scoreFunction);
+ vectorProps.put(name, new FieldVectorProperties(numDimensions, scoreFunction));
+ }
}
static final class Builder {
@@ -489,8 +553,8 @@ public FieldInfo getOrAdd(String name) {
// before then we'll get the same name and number,
// else we'll allocate a new one:
final boolean isSoftDeletesField = name.equals(globalFieldNumbers.softDeletesFieldName);
- final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0, 0, isSoftDeletesField);
- fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0, 0, isSoftDeletesField);
+ final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, isSoftDeletesField);
+ fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, isSoftDeletesField);
assert !byName.containsKey(fi.name);
globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, DocValuesType.NONE);
byName.put(fi.name, fi);
@@ -505,6 +569,7 @@ private FieldInfo addOrUpdateInternal(String name, int preferredFieldNumber,
DocValuesType docValues, long dvGen,
Map attributes,
int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes,
+ int vectorDimension, VectorValues.ScoreFunction vectorScoreFunction,
boolean isSoftDeletesField) {
assert assertNotFinished();
if (docValues == null) {
@@ -522,8 +587,8 @@ private FieldInfo addOrUpdateInternal(String name, int preferredFieldNumber,
// number for this field. If the field was seen
// before then we'll get the same name and number,
// else we'll allocate a new one:
- final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
- fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField);
+ final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dataDimensionCount, indexDimensionCount, dimensionNumBytes, vectorDimension, vectorScoreFunction, isSoftDeletesField);
+ fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes, vectorDimension, vectorScoreFunction, isSoftDeletesField);
assert !byName.containsKey(fi.name);
globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType());
byName.put(fi.name, fi);
@@ -558,6 +623,7 @@ public FieldInfo add(FieldInfo fi, long dvGen) {
fi.getIndexOptions(), fi.getDocValuesType(), dvGen,
fi.attributes(),
fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
+ fi.getVectorDimension(), fi.getVectorScoreFunction(),
fi.isSoftDeletesField());
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java
index ff3ea186ac18..2814c117d0a1 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java
@@ -23,6 +23,7 @@
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@@ -101,6 +102,11 @@ public PointsReader getPointsReader() {
return in.getPointsReader();
}
+ @Override
+ public VectorReader getVectorReader() {
+ return in.getVectorReader();
+ }
+
@Override
public int numDocs() {
return in.numDocs();
diff --git a/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java
index 39087e4828dd..fc50130d582b 100644
--- a/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java
@@ -330,6 +330,11 @@ public PointValues getPointValues(String field) throws IOException {
return in.getPointValues(field);
}
+ @Override
+ public VectorValues getVectorValues(String field) throws IOException {
+ return in.getVectorValues(field);
+ }
+
@Override
public Fields getTermVectors(int docID)
throws IOException {
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
index b1a53464aba2..9c557184629b 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
@@ -1184,7 +1184,8 @@ private FieldNumbers getFieldNumberMap() throws IOException {
for(SegmentCommitInfo info : segmentInfos) {
FieldInfos fis = readFieldInfos(info);
for(FieldInfo fi : fis) {
- map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField());
+ map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
+ fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField());
}
}
@@ -1921,7 +1922,7 @@ private DocValuesUpdate[] buildDocValuesUpdate(Term term, Field[] updates) {
if (globalFieldNumberMap.contains(f.name(), dvType) == false) {
// if this field doesn't exists we try to add it. if it exists and the DV type doesn't match we
// get a consistent error message as if you try to do that during an indexing operation.
- globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0, 0, f.name().equals(config.softDeletesField));
+ globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, f.name().equals(config.softDeletesField));
assert globalFieldNumberMap.contains(f.name(), dvType);
}
if (config.getIndexSortFields().contains(f.name())) {
@@ -2966,7 +2967,9 @@ public long addIndexes(Directory... dirs) throws IOException {
FieldInfos fis = readFieldInfos(info);
for(FieldInfo fi : fis) {
// This will throw exceptions if any of the incoming fields have an illegal schema change:
- globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField());
+ globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(),
+ fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
+ fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField());
}
infos.add(copySegmentAsIs(info, newSegName, context));
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java b/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java
index 9eb7a1574dd9..9f85d04c7aa5 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java
@@ -114,6 +114,16 @@ public interface IndexableFieldType {
*/
public int pointNumBytes();
+ /**
+ * The number of dimensions of the field's vector value
+ */
+ public int vectorDimension();
+
+ /**
+ * The {@link org.apache.lucene.index.VectorValues.ScoreFunction} of the field's vector value
+ */
+ public VectorValues.ScoreFunction vectorScoreFunction();
+
/**
* Attributes for the field type.
*
diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
index f40303b46409..071e6ceeda92 100644
--- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
+++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java
@@ -32,12 +32,15 @@
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesFormat;
+import org.apache.lucene.codecs.VectorFormat;
+import org.apache.lucene.codecs.VectorWriter;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsFormat;
import org.apache.lucene.codecs.PointsWriter;
import org.apache.lucene.document.FieldType;
+import org.apache.lucene.document.VectorField;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
@@ -180,7 +183,6 @@ public SortedSetDocValues getSortedSetDocValues(String field) throws IOException
public FieldInfos getFieldInfos() {
return fieldInfos.finish();
}
-
};
}
@@ -230,6 +232,12 @@ Sorter.DocMap flush(SegmentWriteState state) throws IOException {
if (infoStream.isEnabled("IW")) {
infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write points");
}
+
+ t0 = System.nanoTime();
+ writeVectors(state, sortMap);
+ if (infoStream.isEnabled("IW")) {
+ infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write vectors");
+ }
// it's possible all docs hit non-aborting exceptions...
t0 = System.nanoTime();
@@ -374,6 +382,50 @@ private void writeDocValues(SegmentWriteState state, Sorter.DocMap sortMap) thro
}
}
+ /** Writes all buffered vectors. */
+ private void writeVectors(SegmentWriteState state, Sorter.DocMap sortMap) throws IOException {
+ VectorWriter vectorWriter = null;
+ boolean success = false;
+ try {
+ for (int i = 0; i {
// Non-null if this field ever had points in this segment:
PointValuesWriter pointValuesWriter;
+ // Non-null if this field ever had vector values in this segment:
+ VectorValuesWriter vectorValuesWriter;
+
/** We use this to know when a PerField is seen for the
* first time in the current document. */
long fieldGen = -1;
diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java
index e1c31354f007..daab05555bd3 100644
--- a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java
@@ -203,6 +203,10 @@ public final PostingsEnum postings(Term term) throws IOException {
* used by a single thread. */
public abstract NumericDocValues getNormValues(String field) throws IOException;
+ /** Returns {@link VectorValues} for this field, or null if no {@link VectorValues} were indexed.
+ * The returned instance should only be used by a single thread. */
+ public abstract VectorValues getVectorValues(String field) throws IOException;
+
/**
* Get the {@link FieldInfos} describing all fields in
* this reader.
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java
index afa2612b7abe..9b854828f6cd 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java
@@ -194,6 +194,11 @@ public PointValues getPointValues(String fieldName) throws IOException {
return in.getPointValues(fieldName);
}
+ @Override
+ public VectorValues getVectorValues(String fieldName) throws IOException {
+ return in.getVectorValues(fieldName);
+ }
+
@Override
public int numDocs() {
return in.numDocs();
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
index 0b291e72ccb6..a0052e78c381 100644
--- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java
+++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java
@@ -24,6 +24,7 @@
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@@ -77,6 +78,9 @@ public class MergeState {
/** Point readers to merge */
public final PointsReader[] pointsReaders;
+ /** Vector readers to merge */
+ public final VectorReader[] vectorReaders;
+
/** Max docs per reader */
public final int[] maxDocs;
@@ -103,6 +107,7 @@ public class MergeState {
termVectorsReaders = new TermVectorsReader[numReaders];
docValuesProducers = new DocValuesProducer[numReaders];
pointsReaders = new PointsReader[numReaders];
+ vectorReaders = new VectorReader[numReaders];
fieldInfos = new FieldInfos[numReaders];
liveDocs = new Bits[numReaders];
@@ -139,6 +144,12 @@ public class MergeState {
if (pointsReaders[i] != null) {
pointsReaders[i] = pointsReaders[i].getMergeInstance();
}
+
+ vectorReaders[i] = reader.getVectorReader();
+ if (vectorReaders[i] != null) {
+ vectorReaders[i] = vectorReaders[i].getMergeInstance();
+ }
+
numDocs += reader.numDocs();
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java
index 25f200a42431..1c61713f99d3 100644
--- a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java
@@ -369,6 +369,13 @@ public PointValues getPointValues(String fieldName) throws IOException {
return reader == null ? null : reader.getPointValues(fieldName);
}
+ @Override
+ public VectorValues getVectorValues(String fieldName) throws IOException {
+ ensureOpen();
+ LeafReader reader = fieldToReader.get(fieldName);
+ return reader == null ? null : reader.getVectorValues(fieldName);
+ }
+
@Override
public void checkIntegrity() throws IOException {
ensureOpen();
diff --git a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java
index 1c2838f5162c..10127fb2b174 100644
--- a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java
+++ b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java
@@ -657,7 +657,8 @@ public synchronized boolean writeFieldUpdates(Directory dir, FieldInfos.FieldNum
private FieldInfo cloneFieldInfo(FieldInfo fi, int fieldNumber) {
return new FieldInfo(fi.name, fieldNumber, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(),
fi.getIndexOptions(), fi.getDocValuesType(), fi.getDocValuesGen(), new HashMap<>(fi.attributes()),
- fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField());
+ fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
+ fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField());
}
private SegmentReader createNewReaderWithLatestLiveDocs(SegmentReader reader) throws IOException {
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java
index 1f128218fa6e..da5eb34d827c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java
@@ -30,6 +30,7 @@
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.CompoundDirectory;
import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.PostingsFormat;
@@ -61,6 +62,7 @@ final class SegmentCoreReaders {
final StoredFieldsReader fieldsReaderOrig;
final TermVectorsReader termVectorsReaderOrig;
final PointsReader pointsReader;
+ final VectorReader vectorReader;
final CompoundDirectory cfsReader;
final String segment;
/**
@@ -137,6 +139,13 @@ protected TermVectorsReader initialValue() {
} else {
pointsReader = null;
}
+
+ if (coreFieldInfos.hasVectorValues()) {
+ vectorReader = codec.vectorFormat().fieldsReader(segmentReadState);
+ } else {
+ vectorReader = null;
+ }
+
success = true;
} catch (EOFException | FileNotFoundException e) {
throw new CorruptIndexException("Problem reading index from " + dir, dir.toString(), e);
@@ -168,7 +177,7 @@ void decRef() throws IOException {
if (ref.decrementAndGet() == 0) {
try (Closeable finalizer = this::notifyCoreClosedListeners){
IOUtils.close(termVectorsLocal, fieldsReaderLocal, fields, termVectorsReaderOrig, fieldsReaderOrig,
- cfsReader, normsProducer, pointsReader);
+ cfsReader, normsProducer, pointsReader, vectorReader);
}
}
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java
index 4f5549734d9b..ed04ae01f47f 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java
@@ -23,6 +23,7 @@
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.VectorWriter;
import org.apache.lucene.codecs.NormsConsumer;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsWriter;
@@ -98,15 +99,8 @@ MergeState merge() throws IOException {
throw new IllegalStateException("Merge would result in 0 document segment");
}
mergeFieldInfos();
- long t0 = 0;
- if (mergeState.infoStream.isEnabled("SM")) {
- t0 = System.nanoTime();
- }
- int numMerged = mergeFields();
- if (mergeState.infoStream.isEnabled("SM")) {
- long t1 = System.nanoTime();
- mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge stored fields [" + numMerged + " docs]");
- }
+
+ int numMerged = mergeWithLogging(this::mergeFields, "stored fields");
assert numMerged == mergeState.segmentInfo.maxDoc(): "numMerged=" + numMerged + " vs mergeState.segmentInfo.maxDoc()=" + mergeState.segmentInfo.maxDoc();
final SegmentWriteState segmentWriteState = new SegmentWriteState(mergeState.infoStream, directory, mergeState.segmentInfo,
@@ -115,77 +109,29 @@ MergeState merge() throws IOException {
IOContext.READ, segmentWriteState.segmentSuffix);
if (mergeState.mergeFieldInfos.hasNorms()) {
- if (mergeState.infoStream.isEnabled("SM")) {
- t0 = System.nanoTime();
- }
- mergeNorms(segmentWriteState);
- if (mergeState.infoStream.isEnabled("SM")) {
- long t1 = System.nanoTime();
- mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge norms [" + numMerged + " docs]");
- }
+ mergeWithLogging(() -> mergeNorms(segmentWriteState), "norms", numMerged);
}
- if (mergeState.infoStream.isEnabled("SM")) {
- t0 = System.nanoTime();
- }
- try (NormsProducer norms = mergeState.mergeFieldInfos.hasNorms()
- ? codec.normsFormat().normsProducer(segmentReadState)
- : null) {
- NormsProducer normsMergeInstance = null;
- if (norms != null) {
- // Use the merge instance in order to reuse the same IndexInput for all terms
- normsMergeInstance = norms.getMergeInstance();
- }
- mergeTerms(segmentWriteState, normsMergeInstance);
- }
- if (mergeState.infoStream.isEnabled("SM")) {
- long t1 = System.nanoTime();
- mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge postings [" + numMerged + " docs]");
- }
+ mergeWithLogging(() -> mergeTerms(segmentWriteState, segmentReadState), "postings", numMerged);
- if (mergeState.infoStream.isEnabled("SM")) {
- t0 = System.nanoTime();
- }
if (mergeState.mergeFieldInfos.hasDocValues()) {
- mergeDocValues(segmentWriteState);
- }
- if (mergeState.infoStream.isEnabled("SM")) {
- long t1 = System.nanoTime();
- mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge doc values [" + numMerged + " docs]");
+ mergeWithLogging(() -> mergeDocValues(segmentWriteState), "doc values", numMerged);
}
- if (mergeState.infoStream.isEnabled("SM")) {
- t0 = System.nanoTime();
- }
if (mergeState.mergeFieldInfos.hasPointValues()) {
- mergePoints(segmentWriteState);
+ mergeWithLogging(() -> mergePoints(segmentWriteState), "points", numMerged);
}
- if (mergeState.infoStream.isEnabled("SM")) {
- long t1 = System.nanoTime();
- mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge points [" + numMerged + " docs]");
+
+ if (mergeState.mergeFieldInfos.hasVectorValues()) {
+ mergeWithLogging(() -> mergeVectorValues(segmentWriteState), "numeric vectors", numMerged);
}
if (mergeState.mergeFieldInfos.hasVectors()) {
- if (mergeState.infoStream.isEnabled("SM")) {
- t0 = System.nanoTime();
- }
- numMerged = mergeVectors();
- if (mergeState.infoStream.isEnabled("SM")) {
- long t1 = System.nanoTime();
- mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge vectors [" + numMerged + " docs]");
- }
- assert numMerged == mergeState.segmentInfo.maxDoc();
+ mergeWithLogging(this::mergeTermVectors, "term vectors");
}
-
+
// write the merged infos
- if (mergeState.infoStream.isEnabled("SM")) {
- t0 = System.nanoTime();
- }
- codec.fieldInfosFormat().write(directory, mergeState.segmentInfo, "", mergeState.mergeFieldInfos, context);
- if (mergeState.infoStream.isEnabled("SM")) {
- long t1 = System.nanoTime();
- mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to write field infos [" + numMerged + " docs]");
- }
+ mergeWithLogging(() -> codec.fieldInfosFormat().write(directory, mergeState.segmentInfo, "", mergeState.mergeFieldInfos, context), "field infos", numMerged);
return mergeState;
}
@@ -207,7 +153,22 @@ private void mergeNorms(SegmentWriteState segmentWriteState) throws IOException
consumer.merge(mergeState);
}
}
-
+
+ private void mergeTerms(SegmentWriteState segmentWriteState, SegmentReadState segmentReadState) throws IOException {
+ try (NormsProducer norms = mergeState.mergeFieldInfos.hasNorms()
+ ? codec.normsFormat().normsProducer(segmentReadState)
+ : null) {
+ NormsProducer normsMergeInstance = null;
+ if (norms != null) {
+ // Use the merge instance in order to reuse the same IndexInput for all terms
+ normsMergeInstance = norms.getMergeInstance();
+ }
+ try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState)) {
+ consumer.merge(mergeState, normsMergeInstance);
+ }
+ }
+ }
+
public void mergeFieldInfos() {
for (FieldInfos readerFieldInfos : mergeState.fieldInfos) {
for (FieldInfo fi : readerFieldInfos) {
@@ -233,15 +194,51 @@ private int mergeFields() throws IOException {
* Merge the TermVectors from each of the segments into the new one.
* @throws IOException if there is a low-level IO error
*/
- private int mergeVectors() throws IOException {
+ private int mergeTermVectors() throws IOException {
try (TermVectorsWriter termVectorsWriter = codec.termVectorsFormat().vectorsWriter(directory, mergeState.segmentInfo, context)) {
- return termVectorsWriter.merge(mergeState);
+ int numMerged = termVectorsWriter.merge(mergeState);
+ assert numMerged == mergeState.segmentInfo.maxDoc();
+ return numMerged;
}
}
- private void mergeTerms(SegmentWriteState segmentWriteState, NormsProducer norms) throws IOException {
- try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState)) {
- consumer.merge(mergeState, norms);
+ private void mergeVectorValues(SegmentWriteState segmentWriteState) throws IOException {
+ try (VectorWriter writer = codec.vectorFormat().fieldsWriter(segmentWriteState)) {
+ writer.merge(mergeState);
}
}
+
+ private interface Merger {
+ int merge() throws IOException;
+ }
+
+ private interface VoidMerger {
+ void merge() throws IOException;
+ }
+
+  private int mergeWithLogging(Merger merger, String formatName) throws IOException {
+    long t0 = 0;
+    if (mergeState.infoStream.isEnabled("SM")) {
+      t0 = System.nanoTime();
+    }
+    int numMerged = merger.merge();
+    if (mergeState.infoStream.isEnabled("SM")) {
+      long t1 = System.nanoTime();
+      mergeState.infoStream.message("SM", ((t1 - t0) / 1000000) + " msec to merge " + formatName + " [" + numMerged + " docs]");
+    }
+    return numMerged;
+  }
+
+  private void mergeWithLogging(VoidMerger merger, String formatName, int numMerged) throws IOException {
+    long t0 = 0;
+    if (mergeState.infoStream.isEnabled("SM")) {
+      t0 = System.nanoTime();
+    }
+    merger.merge();
+    if (mergeState.infoStream.isEnabled("SM")) {
+      long t1 = System.nanoTime();
+      mergeState.infoStream.message("SM", ((t1 - t0) / 1000000) + " msec to merge " + formatName + " [" + numMerged + " docs]");
+    }
+  }
+
}
diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java
index 25145ff180f0..1da063a0b512 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java
@@ -26,6 +26,7 @@
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@@ -259,6 +260,11 @@ public DocValuesProducer getDocValuesReader() {
return docValuesProducer;
}
+ @Override
+ public VectorReader getVectorReader() {
+ return core.vectorReader;
+ }
+
@Override
public FieldsProducer getPostingsReader() {
ensureOpen();
diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java
index 533255780351..b2ce9aa4d80c 100644
--- a/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java
+++ b/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java
@@ -24,6 +24,7 @@
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.NormsProducer;
import org.apache.lucene.codecs.PointsReader;
import org.apache.lucene.codecs.StoredFieldsReader;
@@ -78,6 +79,12 @@ public DocValuesProducer getDocValuesReader() {
return readerToDocValuesProducer(reader);
}
+ @Override
+ public VectorReader getVectorReader() {
+ reader.ensureOpen();
+ return readerToVectorReader(reader);
+ }
+
@Override
public FieldsProducer getPostingsReader() {
reader.ensureOpen();
@@ -160,6 +167,29 @@ public long ramBytesUsed() {
};
}
+
+ private static VectorReader readerToVectorReader(LeafReader reader) {
+ return new VectorReader() {
+ @Override
+ public VectorValues getVectorValues(String field) throws IOException {
+ return reader.getVectorValues(field);
+ }
+
+ @Override
+ public void checkIntegrity() {
+ // We already checkIntegrity the entire reader up front
+ }
+
+ @Override
+ public void close() {
+ }
+
+ @Override
+ public long ramBytesUsed() {
+ return 0L;
+ }
+ };
+ }
private static NormsProducer readerToNormsProducer(final LeafReader reader) {
return new NormsProducer() {
diff --git a/lucene/core/src/java/org/apache/lucene/index/VectorValues.java b/lucene/core/src/java/org/apache/lucene/index/VectorValues.java
new file mode 100644
index 000000000000..45c79fc283db
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/VectorValues.java
@@ -0,0 +1,285 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * This class provides access to per-document floating point vector values indexed as {@link
+ * org.apache.lucene.document.VectorField}.
+ *
+ * @lucene.experimental
+ */
+public abstract class VectorValues extends DocIdSetIterator {
+
+ /** The maximum length of a vector */
+ public static int MAX_DIMENSIONS = 1024;
+
+ /** Sole constructor */
+ protected VectorValues() {}
+
+ /**
+ * Return the dimension of the vectors
+ */
+ public abstract int dimension();
+
+ /**
+ * TODO: should we use cost() for this? We rely on its always being exactly the number
+ * of documents having a value for this field, which is not guaranteed by the cost() contract,
+ * but in all the implementations so far they are the same.
+ * @return the number of vectors returned by this iterator
+ */
+ public abstract int size();
+
+ /**
+ * Return the score function used to compare these vectors
+ */
+ public abstract ScoreFunction scoreFunction();
+
+ /**
+ * Return the vector value for the current document ID.
+ * It is illegal to call this method when the iterator is not positioned: before advancing, or after failing to advance.
+ * The returned array may be shared across calls, re-used, and modified as the iterator advances.
+ * @return the vector value
+ */
+ public abstract float[] vectorValue() throws IOException;
+
+ /**
+ * Return the binary encoded vector value for the current document ID. These are the bytes
+ * corresponding to the float array return by {@link #vectorValue}. It is illegal to call this
+ * method when the iterator is not positioned: before advancing, or after failing to advance. The
+ * returned storage may be shared across calls, re-used and modified as the iterator advances.
+ * @return the binary value
+ */
+ public BytesRef binaryValue() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ /**
+ * Return a random access interface over this iterator's vectors. Calling the RandomAccess methods will
+ * have no effect on the progress of the iteration or the values returned by this iterator. Successive calls
+ * will retrieve independent copies that do not overwrite each others' returned values.
+ */
+ public abstract RandomAccess randomAccess();
+
+ /**
+ * Provides random access to vectors by dense ordinal.
+ *
+ * @lucene.experimental
+ */
+ public interface RandomAccess {
+
+ /**
+ * Return the number of vector values
+ */
+ int size();
+
+ /**
+ * Return the dimension of the returned vector values
+ */
+ int dimension();
+
+ /**
+ * Return the score function used to compare these vectors
+ */
+ ScoreFunction scoreFunction();
+
+ /**
+ * Return the vector value indexed at the given ordinal. The provided floating point array may
+ * be shared and overwritten by subsequent calls to this method and {@link #binaryValue(int)}.
+     * @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}.
+ */
+ float[] vectorValue(int targetOrd) throws IOException;
+
+ /**
+ * Return the vector indexed at the given ordinal value as an array of bytes in a BytesRef;
+ * these are the bytes corresponding to the float array in IEEE 754 standard encoding, encoded
+ * using little-endian byte order. The provided bytes may be shared and overwritten by subsequent
+ * calls to this method and {@link #vectorValue(int)}.
+     * @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}.
+ */
+ BytesRef binaryValue(int targetOrd) throws IOException;
+
+ /**
+ * Return the dense ordinal of the document if it has a vector. This ordinal ranges from 0 to the one less than the number
+ * of documents having a vector in this iterator, and it is guaranteed to increase with increasing docid.
+ * @param docId the document whose ordinal is returned
+ * @return the ordinal of the given document, or -1 if the document has no vector value
+ */
+ //int ordinal(int docId);
+
+ /**
+ * Return the k nearest neighbor documents as determined by comparison of their vector values
+ * for this field, to the given vector, by the field's score function. If the score function is
+ * reversed, lower values indicate nearer vectors, otherwise higher scores indicate nearer
+ * vectors. Unlike relevance scores, vector scores may be negative.
+ * @param target the vector-valued query
+ * @param k the number of docs to return
+ * @param fanout control the accuracy/speed tradeoff - larger values give better recall at higher cost
+ * @return the k nearest neighbor documents, along with their (scoreFunction-specific) scores.
+ */
+ TopDocs search(float[] target, int k, int fanout) throws IOException;
+ }
+
+ /**
+ * Score function. This is used during indexing and searching of the vectors to determine the nearest neighbors.
+ * Score values may be negative. By default high scores indicate nearer documents, unless the function is reversed.
+ */
+ public enum ScoreFunction {
+ /** No distance function is used. Note: {@link VectorValues.RandomAccess#search(float[], int, int)}
+ * is not supported for fields specifying this score function. */
+ NONE,
+
+ /** Euclidean distance */
+ EUCLIDEAN(true) {
+ @Override
+ public float score(float[] v1, float[] v2) {
+ assert v1.length == v2.length;
+ float squareSum = 0.0f;
+ int dim = v1.length;
+ for (int i = 0; i < dim; i++) {
+ float diff = v1[i] - v2[i];
+ squareSum += diff * diff;
+ }
+ return squareSum;
+ }
+ },
+
+ /** dot product - note, may be negative; larger values are better */
+ DOT_PRODUCT() {
+ @Override
+ public float score(float[] a, float[] b) {
+ float res = 0f;
+ /*
+ * If length of vector is larger than 8, we use unrolled dot product to accelerate the
+ * calculation.
+ */
+ int i;
+ for (i = 0; i < a.length % 8; i++) {
+ res += b[i] * a[i];
+ }
+ if (a.length < 8) {
+ return res;
+ }
+ float s0 = 0f;
+ float s1 = 0f;
+ float s2 = 0f;
+ float s3 = 0f;
+ float s4 = 0f;
+ float s5 = 0f;
+ float s6 = 0f;
+ float s7 = 0f;
+ for (; i + 7 < a.length; i += 8) {
+ s0 += b[i] * a[i];
+ s1 += b[i + 1] * a[i + 1];
+ s2 += b[i + 2] * a[i + 2];
+ s3 += b[i + 3] * a[i + 3];
+ s4 += b[i + 4] * a[i + 4];
+ s5 += b[i + 5] * a[i + 5];
+ s6 += b[i + 6] * a[i + 6];
+ s7 += b[i + 7] * a[i + 7];
+ }
+ res += s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7;
+ return res;
+ }
+ };
+
+ /** If reversed, smaller values are better */
+ final public boolean reversed;
+
+ ScoreFunction(boolean reversed) {
+ this.reversed = reversed;
+ }
+
+ ScoreFunction() {
+ this(false);
+ }
+
+ /**
+ * Calculates the score between the specified two vectors.
+ */
+ public float score(float[] v1, float[] v2) {
+ throw new UnsupportedOperationException();
+ }
+
+ }
+
+ /**
+ * Calculates a similarity score between the two vectors with specified function.
+ */
+ public static float compare(float[] v1, float[] v2, ScoreFunction scoreFunction) {
+ assert v1.length == v2.length : "attempt to compare vectors of lengths: " + v1.length + " " + v2.length;
+ return scoreFunction.score(v1, v2);
+ }
+
+ /**
+ * Represents the lack of vector values. It is returned by providers that do not
+ * support VectorValues.
+ */
+ public static final VectorValues EMPTY = new VectorValues() {
+
+ @Override
+ public int size() {
+ return 0;
+ }
+
+ @Override
+ public int dimension() {
+ return 0;
+ }
+
+ @Override
+ public ScoreFunction scoreFunction() {
+ return ScoreFunction.NONE;
+ }
+
+ @Override
+ public float[] vectorValue() {
+ throw new IllegalStateException("Attempt to get vectors from EMPTY values (which was not advanced)");
+ }
+
+ @Override
+ public RandomAccess randomAccess() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int docID() {
+ throw new IllegalStateException("VectorValues is EMPTY, and not positioned on a doc");
+ }
+
+ @Override
+ public int nextDoc() {
+ return NO_MORE_DOCS;
+ }
+
+ @Override
+ public int advance(int target) {
+ return NO_MORE_DOCS;
+ }
+
+ @Override
+ public long cost() {
+ return 0;
+ }
+ };
+}
diff --git a/lucene/core/src/java/org/apache/lucene/index/VectorValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/VectorValuesWriter.java
new file mode 100644
index 000000000000..564b7b61a22d
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/index/VectorValuesWriter.java
@@ -0,0 +1,322 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.codecs.VectorWriter;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/** Buffers up pending vector value(s) per doc, then flushes when segment flushes. */
+class VectorValuesWriter {
+
+ private final FieldInfo fieldInfo;
+ private final Counter iwBytesUsed;
+  private final List<float[]> vectors = new ArrayList<>();
+ private final DocsWithFieldSet docsWithField;
+
+ private int lastDocID = -1;
+
+ private long bytesUsed;
+
+ VectorValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+ this.fieldInfo = fieldInfo;
+ this.iwBytesUsed = iwBytesUsed;
+ this.docsWithField = new DocsWithFieldSet();
+ this.bytesUsed = docsWithField.ramBytesUsed();
+ if (iwBytesUsed != null) {
+ iwBytesUsed.addAndGet(bytesUsed);
+ }
+ }
+
+ /**
+ * Adds a value for the given document. Only a single value may be added.
+ * @param docID the value is added to this document
+ * @param vectorValue the value to add
+ * @throws IllegalArgumentException if a value has already been added to the given document
+ */
+ public void addValue(int docID, float[] vectorValue) {
+ if (docID == lastDocID) {
+ throw new IllegalArgumentException("VectorValuesField \"" + fieldInfo.name + "\" appears more than once in this document (only one value is allowed per field)");
+ }
+ if (vectorValue.length != fieldInfo.getVectorDimension()) {
+ throw new IllegalArgumentException("Attempt to index a vector of dimension " + vectorValue.length +
+ " but \"" + fieldInfo.name + "\" has dimension " + fieldInfo.getVectorDimension());
+ }
+ assert docID > lastDocID;
+ docsWithField.add(docID);
+ vectors.add(ArrayUtil.copyOfSubArray(vectorValue, 0, vectorValue.length));
+ updateBytesUsed();
+ lastDocID = docID;
+ }
+
+ private void updateBytesUsed() {
+ final long newBytesUsed = docsWithField.ramBytesUsed()
+ + vectors.size() * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER)
+ + vectors.size() * vectors.get(0).length * Float.BYTES;
+ if (iwBytesUsed != null) {
+ iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
+ }
+ bytesUsed = newBytesUsed;
+ }
+
+ /**
+ * Flush this field's values to storage, sorting the values in accordance with sortMap
+ * @param sortMap specifies the order of documents being flushed, or null if they are to be flushed in docid order
+ * @param vectorWriter the Codec's vector writer that handles the actual encoding and I/O
+ * @throws IOException if there is an error writing the field and its values
+ */
+ public void flush(Sorter.DocMap sortMap, VectorWriter vectorWriter) throws IOException {
+ VectorValues vectorValues = new BufferedVectorValues(docsWithField, vectors, fieldInfo.getVectorDimension(), fieldInfo.getVectorScoreFunction());
+ if (sortMap != null) {
+ vectorWriter.writeField(fieldInfo, new SortingVectorValues(vectorValues, sortMap));
+ } else {
+ vectorWriter.writeField(fieldInfo, vectorValues);
+ }
+ }
+
+ private static class SortingVectorValues extends VectorValues {
+
+ private final VectorValues delegate;
+ private final VectorValues.RandomAccess randomAccess;
+ private final int[] docIdOffsets;
+ private final int[] ordMap;
+ private int docId = -1;
+
+ SortingVectorValues(VectorValues delegate, Sorter.DocMap sortMap) throws IOException {
+ this.delegate = delegate;
+ randomAccess = delegate.randomAccess();
+ docIdOffsets = new int[sortMap.size()];
+
+ int offset = 1; // 0 means no vector for this (field, document)
+ int docID;
+ while ((docID = delegate.nextDoc()) != NO_MORE_DOCS) {
+ int newDocID = sortMap.oldToNew(docID);
+ docIdOffsets[newDocID] = offset++;
+ }
+
+ // set up ordMap to map from new dense ordinal to old dense ordinal
+ ordMap = new int[offset - 1];
+ int ord = 0;
+ for (int docIdOffset : docIdOffsets) {
+ if (docIdOffset != 0) {
+ ordMap[ord++] = docIdOffset - 1;
+ }
+ }
+ assert ord == ordMap.length;
+ }
+
+ @Override
+ public int docID() {
+ return docId;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ while (docId < docIdOffsets.length - 1) {
+ ++docId;
+ if (docIdOffsets[docId] != 0) {
+ return docId;
+ }
+ }
+ docId = NO_MORE_DOCS;
+ return docId;
+ }
+
+ @Override
+ public BytesRef binaryValue() throws IOException {
+ return randomAccess.binaryValue(docIdOffsets[docId] - 1);
+ }
+
+ @Override
+ public float[] vectorValue() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int dimension() {
+ return delegate.dimension();
+ }
+
+ @Override
+ public int size() {
+ return delegate.size();
+ }
+
+ @Override
+ public ScoreFunction scoreFunction() {
+ return delegate.scoreFunction();
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public long cost() {
+ return size();
+ }
+
+ @Override
+ public RandomAccess randomAccess() {
+ RandomAccess ra = delegate.randomAccess();
+ return new RandomAccess() {
+
+ @Override
+ public int size() {
+ return delegate.size();
+ }
+
+ @Override
+ public int dimension() {
+ return delegate.dimension();
+ }
+
+ @Override
+ public ScoreFunction scoreFunction() {
+ return delegate.scoreFunction();
+ }
+
+ @Override
+ public float[] vectorValue(int targetOrd) throws IOException {
+ return ra.vectorValue(ordMap[targetOrd]);
+ }
+
+ @Override
+ public BytesRef binaryValue(int targetOrd) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public TopDocs search(float[] target, int k, int fanout) {
+ throw new UnsupportedOperationException();
+ }
+ };
+ }
+ }
+
+ private static class BufferedVectorValues extends VectorValues implements VectorValues.RandomAccess {
+
+ final DocsWithFieldSet docsWithField;
+
+ // These are always the vectors of a VectorValuesWriter, which are copied when added to it
+    final List<float[]> vectors;
+ final VectorValues.ScoreFunction scoreFunction;
+ final int dimension;
+
+ final ByteBuffer buffer;
+ final BytesRef binaryValue;
+ final ByteBuffer raBuffer;
+ final BytesRef raBinaryValue;
+
+ DocIdSetIterator docsWithFieldIter;
+ int ord = -1;
+
+    BufferedVectorValues(DocsWithFieldSet docsWithField, List<float[]> vectors, int dimension, VectorValues.ScoreFunction scoreFunction) {
+ this.docsWithField = docsWithField;
+ this.vectors = vectors;
+ this.dimension = dimension;
+ this.scoreFunction = scoreFunction;
+ buffer = ByteBuffer.allocate(dimension * Float.BYTES);
+ binaryValue = new BytesRef(buffer.array());
+ raBuffer = ByteBuffer.allocate(dimension * Float.BYTES);
+ raBinaryValue = new BytesRef(raBuffer.array());
+ docsWithFieldIter = docsWithField.iterator();
+ }
+
+ @Override
+ public RandomAccess randomAccess() {
+ return this;
+ }
+
+ @Override
+ public int dimension() {
+ return dimension;
+ }
+
+ @Override
+ public int size() {
+ return vectors.size();
+ }
+
+ @Override
+ public VectorValues.ScoreFunction scoreFunction() {
+ return scoreFunction;
+ }
+
+ @Override
+ public BytesRef binaryValue() {
+ buffer.asFloatBuffer().put(vectorValue());
+ return binaryValue;
+ }
+
+ @Override
+ public BytesRef binaryValue(int targetOrd) {
+ raBuffer.asFloatBuffer().put(vectors.get(targetOrd));
+ return raBinaryValue;
+ }
+
+ @Override
+ public float[] vectorValue() {
+ return vectors.get(ord);
+ }
+
+ @Override
+ public float[] vectorValue(int targetOrd) {
+ return vectors.get(targetOrd);
+ }
+
+ @Override
+ public int docID() {
+ return docsWithFieldIter.docID();
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ int docID = docsWithFieldIter.nextDoc();
+ if (docID != NO_MORE_DOCS) {
+ ++ord;
+ }
+ return docID;
+ }
+
+ @Override
+ public int advance(int target) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public long cost() {
+ return docsWithFieldIter.cost();
+ }
+
+ @Override
+ public TopDocs search(float[] target, int k, int fanout) throws IOException {
+ throw new UnsupportedOperationException();
+ }
+ }
+}
diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
index 2be0f71cc28d..7eec415dffa7 100644
--- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -13,4 +13,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-org.apache.lucene.codecs.lucene87.Lucene87Codec
+org.apache.lucene.codecs.lucene90.Lucene90Codec
diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java
index b6dc5a5a3afd..f5dbf405200f 100644
--- a/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java
+++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java
@@ -19,6 +19,7 @@
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat.Mode;
+import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.BaseStoredFieldsFormatTestCase;
@@ -32,7 +33,7 @@
public class TestLucene87StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase {
@Override
protected Codec getCodec() {
- return new Lucene87Codec(Mode.BEST_COMPRESSION);
+ return new Lucene90Codec(Mode.BEST_COMPRESSION);
}
/**
@@ -43,7 +44,7 @@ public void testMixedCompressions() throws Exception {
Directory dir = newDirectory();
for (int i = 0; i < 10; i++) {
IndexWriterConfig iwc = newIndexWriterConfig();
- iwc.setCodec(new Lucene87Codec(RandomPicks.randomFrom(random(), Mode.values())));
+ iwc.setCodec(new Lucene90Codec(RandomPicks.randomFrom(random(), Mode.values())));
IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig());
Document doc = new Document();
doc.add(new StoredField("field1", "value1"));
@@ -70,7 +71,7 @@ public void testMixedCompressions() throws Exception {
public void testInvalidOptions() {
expectThrows(NullPointerException.class, () -> {
- new Lucene87Codec(null);
+ new Lucene90Codec(null);
});
expectThrows(NullPointerException.class, () -> {
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java
index f71d6a28716a..81e2387d038d 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java
@@ -106,6 +106,16 @@ public int pointNumBytes() {
return 0;
}
+ @Override
+ public int vectorDimension() {
+ return 0;
+ }
+
+ @Override
+ public VectorValues.ScoreFunction vectorScoreFunction() {
+ return VectorValues.ScoreFunction.NONE;
+ }
+
@Override
public Map<String, String> getAttributes() {
return null;
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java
index a7c681189042..226f199c9dea 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java
@@ -37,6 +37,8 @@
import org.apache.lucene.util.StringHelper;
import org.apache.lucene.util.Version;
+import static org.apache.lucene.index.VectorValues.ScoreFunction.NONE;
+
public class TestPendingSoftDeletes extends TestPendingDeletes {
@Override
@@ -164,7 +166,7 @@ public void testApplyUpdates() throws IOException {
deletes.onNewReader(segmentReader, commitInfo);
reader.close();
writer.close();
- FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0, 0, true);
+ FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0, 0, 0, NONE, true);
List<Integer> docsDeleted = Arrays.asList(1, 3, 7, 8, DocIdSetIterator.NO_MORE_DOCS);
List<DocValuesFieldUpdates> updates = Arrays.asList(singleUpdate(docsDeleted, 10, true));
for (DocValuesFieldUpdates update : updates) {
@@ -185,7 +187,7 @@ public void testApplyUpdates() throws IOException {
docsDeleted = Arrays.asList(1, 2, DocIdSetIterator.NO_MORE_DOCS);
updates = Arrays.asList(singleUpdate(docsDeleted, 10, true));
- fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0, 0, true);
+ fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0, 0, 0, NONE, true);
for (DocValuesFieldUpdates update : updates) {
deletes.onDocValuesUpdate(fieldInfo, update.iterator());
}
@@ -228,7 +230,7 @@ public void testUpdateAppliedOnlyOnce() throws IOException {
SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo();
PendingDeletes deletes = newPendingDeletes(segmentInfo);
deletes.onNewReader(segmentReader, segmentInfo);
- FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, true);
+ FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, 0, NONE, true);
List<Integer> docsDeleted = Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS);
List<DocValuesFieldUpdates> updates = Arrays.asList(singleUpdate(docsDeleted, 3, true));
for (DocValuesFieldUpdates update : updates) {
@@ -276,7 +278,7 @@ public void testResetOnUpdate() throws IOException {
SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo();
PendingDeletes deletes = newPendingDeletes(segmentInfo);
deletes.onNewReader(segmentReader, segmentInfo);
- FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, true);
+ FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, 0, NONE, true);
List<DocValuesFieldUpdates> updates = Arrays.asList(singleUpdate(Arrays.asList(0, 1, DocIdSetIterator.NO_MORE_DOCS), 3, false));
for (DocValuesFieldUpdates update : updates) {
deletes.onDocValuesUpdate(fieldInfo, update.iterator());
@@ -295,7 +297,7 @@ public void testResetOnUpdate() throws IOException {
assertEquals(0, deletes.numPendingDeletes());
segmentInfo.advanceDocValuesGen();
- fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, true);
+ fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, 0, NONE, true);
updates = Arrays.asList(singleUpdate(Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS), 3, true));
for (DocValuesFieldUpdates update : updates) {
deletes.onDocValuesUpdate(fieldInfo, update.iterator());
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java
index 5ea8f835feeb..2be3536342a8 100644
--- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java
+++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java
@@ -106,6 +106,9 @@ public PointValues getPointValues(String field) {
return null;
}
+ @Override
+ public VectorValues getVectorValues(String field) { return null; }
+
@Override
protected void doClose() {
}
diff --git a/lucene/core/src/test/org/apache/lucene/index/TestVectorValues.java b/lucene/core/src/test/org/apache/lucene/index/TestVectorValues.java
new file mode 100644
index 000000000000..c1494c53511b
--- /dev/null
+++ b/lucene/core/src/test/org/apache/lucene/index/TestVectorValues.java
@@ -0,0 +1,722 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.index;
+
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.StringField;
+import org.apache.lucene.document.VectorField;
+import org.apache.lucene.index.VectorValues.ScoreFunction;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortField;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.TestUtil;
+
+import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
+
+/** Test Indexing/IndexWriter with vectors */
+public class TestVectorValues extends LuceneTestCase {
+
+ private IndexWriterConfig createIndexWriterConfig() {
+ IndexWriterConfig iwc = newIndexWriterConfig();
+ iwc.setCodec(Codec.forName("Lucene90"));
+ return iwc;
+ }
+
+ // Suddenly add vectors to an existing field:
+ public void testUpgradeFieldToVectors() throws Exception {
+ try (Directory dir = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(newStringField("f", "foo", Store.NO));
+ w.addDocument(doc);
+ }
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ }
+ }
+ }
+
+ public void testFieldConstructor() {
+ float[] v = new float[1];
+ VectorField field = new VectorField("f", v);
+ assertEquals(1, field.fieldType().vectorDimension());
+ assertEquals(ScoreFunction.EUCLIDEAN, field.fieldType().vectorScoreFunction());
+ assertSame(v, field.vectorValue());
+ }
+
+ public void testFieldConstructorExceptions() {
+ expectThrows(IllegalArgumentException.class, () -> new VectorField(null, new float[1]));
+ expectThrows(IllegalArgumentException.class, () -> new VectorField("f", null));
+ expectThrows(IllegalArgumentException.class, () -> new VectorField("f", new float[1], null));
+ expectThrows(IllegalArgumentException.class, () -> new VectorField("f", new float[0]));
+ expectThrows(IllegalArgumentException.class, () -> new VectorField("f", new float[VectorValues.MAX_DIMENSIONS + 1]));
+ }
+
+ public void testFieldSetValue() {
+ VectorField field = new VectorField("f", new float[1]);
+ float[] v1 = new float[1];
+ field.setVectorValue(v1);
+ assertSame(v1, field.vectorValue());
+ expectThrows(IllegalArgumentException.class, () -> field.setVectorValue(new float[2]));
+ expectThrows(IllegalArgumentException.class, () -> field.setVectorValue(null));
+ }
+
+ // Illegal schema change tests:
+
+ public void testIllegalDimChangeTwoDocs() throws Exception {
+ try (Directory dir = newDirectory();
+ IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ if (random().nextBoolean()) {
+ // sometimes test with two segments
+ w.commit();
+ }
+
+ Document doc2 = new Document();
+ doc2.add(new VectorField("f", new float[3], ScoreFunction.DOT_PRODUCT));
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
+ () -> w.addDocument(doc2));
+ assertEquals("cannot change vector dimension from 4 to 3 for field=\"f\"", expected.getMessage());
+ }
+ }
+
+ public void testIllegalScoreFunctionChange() throws Exception {
+ try (Directory dir = newDirectory();
+ IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ if (random().nextBoolean()) {
+ // sometimes test with two segments
+ w.commit();
+ }
+
+ Document doc2 = new Document();
+ doc2.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN));
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
+ () -> w.addDocument(doc2));
+ assertEquals("cannot change vector score function from DOT_PRODUCT to EUCLIDEAN for field=\"f\"", expected.getMessage());
+ }
+ }
+
+ public void testIllegalDimChangeTwoWriters() throws Exception {
+ try (Directory dir = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ }
+
+ try (IndexWriter w2 = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc2 = new Document();
+ doc2.add(new VectorField("f", new float[1], ScoreFunction.DOT_PRODUCT));
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
+ () -> w2.addDocument(doc2));
+ assertEquals("cannot change vector dimension from 4 to 1 for field=\"f\"", expected.getMessage());
+ }
+ }
+ }
+
+ public void testIllegalScoreFunctionChangeTwoWriters() throws Exception {
+ try (Directory dir = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ }
+
+ try (IndexWriter w2 = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc2 = new Document();
+ doc2.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN));
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
+ () -> w2.addDocument(doc2));
+ assertEquals("cannot change vector score function from DOT_PRODUCT to EUCLIDEAN for field=\"f\"", expected.getMessage());
+ }
+ }
+ }
+
+ public void testAddIndexesDirectory0() throws Exception {
+ String fieldName = "field";
+ Document doc = new Document();
+ doc.add(new VectorField(fieldName, new float[4], ScoreFunction.DOT_PRODUCT));
+ try (Directory dir = newDirectory();
+ Directory dir2 = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ w.addDocument(doc);
+ }
+ try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
+ w2.addIndexes(new Directory[]{dir});
+ try (IndexReader reader = w2.getReader()) {
+ LeafReader r = reader.leaves().get(0).reader();
+ VectorValues vectorValues = r.getVectorValues(fieldName);
+ assertEquals(0, vectorValues.nextDoc());
+ assertEquals(0, vectorValues.vectorValue()[0], 0);
+ assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
+ }
+ }
+ }
+ }
+
+ public void testAddIndexesDirectory1() throws Exception {
+ String fieldName = "field";
+ Document doc = new Document();
+ try (Directory dir = newDirectory();
+ Directory dir2 = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ w.addDocument(doc);
+ }
+ doc.add(new VectorField(fieldName, new float[4], ScoreFunction.DOT_PRODUCT));
+ try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
+ w2.addDocument(doc);
+ w2.addIndexes(new Directory[]{dir});
+ try (IndexReader reader = w2.getReader()) {
+ LeafReader r = reader.leaves().get(0).reader();
+ VectorValues vectorValues = r.getVectorValues(fieldName);
+ assertEquals(0, vectorValues.nextDoc());
+ assertEquals(0, vectorValues.vectorValue()[0], 0);
+ assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
+ }
+ }
+ }
+ }
+
+ public void testAddIndexesDirectory01() throws Exception {
+ String fieldName = "field";
+ float[] vector = new float[1];
+ Document doc = new Document();
+ doc.add(new VectorField(fieldName, vector, ScoreFunction.DOT_PRODUCT));
+ try (Directory dir = newDirectory();
+ Directory dir2 = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ w.addDocument(doc);
+ }
+ try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
+ vector[0] = 1;
+ w2.addDocument(doc);
+ w2.addIndexes(new Directory[]{dir});
+ w2.forceMerge(1);
+ try (IndexReader reader = w2.getReader()) {
+ LeafReader r = reader.leaves().get(0).reader();
+ VectorValues vectorValues = r.getVectorValues(fieldName);
+ assertEquals(0, vectorValues.nextDoc());
+ assertEquals(1, vectorValues.vectorValue()[0], 0);
+ assertEquals(1, vectorValues.nextDoc());
+ assertEquals(0, vectorValues.vectorValue()[0], 0);
+ }
+ }
+ }
+ }
+
+ public void testIllegalDimChangeViaAddIndexesDirectory() throws Exception {
+ try (Directory dir = newDirectory();
+ Directory dir2 = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ }
+ try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[5], ScoreFunction.DOT_PRODUCT));
+ w2.addDocument(doc);
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
+ () -> w2.addIndexes(new Directory[]{dir}));
+ assertEquals("cannot change vector dimension from 5 to 4 for field=\"f\"", expected.getMessage());
+ }
+ }
+ }
+
+ public void testIllegalScoreFunctionChangeViaAddIndexesDirectory() throws Exception {
+ try (Directory dir = newDirectory();
+ Directory dir2 = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ }
+ try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN));
+ w2.addDocument(doc);
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
+ () -> w2.addIndexes(dir));
+ assertEquals("cannot change vector score function from EUCLIDEAN to DOT_PRODUCT for field=\"f\"", expected.getMessage());
+ }
+ }
+ }
+
+ public void testIllegalDimChangeViaAddIndexesCodecReader() throws Exception {
+ try (Directory dir = newDirectory();
+ Directory dir2 = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ }
+ try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[5], ScoreFunction.DOT_PRODUCT));
+ w2.addDocument(doc);
+ try (DirectoryReader r = DirectoryReader.open(dir)) {
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
+ () -> w2.addIndexes(new CodecReader[]{(CodecReader) getOnlyLeafReader(r)}));
+ assertEquals("cannot change vector dimension from 5 to 4 for field=\"f\"", expected.getMessage());
+ }
+ }
+ }
+ }
+
+ public void testIllegalScoreFunctionChangeViaAddIndexesCodecReader() throws Exception {
+ try (Directory dir = newDirectory();
+ Directory dir2 = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ }
+ try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN));
+ w2.addDocument(doc);
+ try (DirectoryReader r = DirectoryReader.open(dir)) {
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
+ () -> w2.addIndexes(new CodecReader[]{(CodecReader) getOnlyLeafReader(r)}));
+ assertEquals("cannot change vector score function from EUCLIDEAN to DOT_PRODUCT for field=\"f\"", expected.getMessage());
+ }
+ }
+ }
+ }
+
+ public void testIllegalDimChangeViaAddIndexesSlowCodecReader() throws Exception {
+ try (Directory dir = newDirectory();
+ Directory dir2 = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ }
+ try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[5], ScoreFunction.DOT_PRODUCT));
+ w2.addDocument(doc);
+ try (DirectoryReader r = DirectoryReader.open(dir)) {
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
+ () -> TestUtil.addIndexesSlowly(w2, r));
+ assertEquals("cannot change vector dimension from 5 to 4 for field=\"f\"", expected.getMessage());
+ }
+ }
+ }
+ }
+
+ public void testIllegalScoreFunctionChangeViaAddIndexesSlowCodecReader() throws Exception {
+ try (Directory dir = newDirectory();
+ Directory dir2 = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ }
+ try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN));
+ w2.addDocument(doc);
+ try (DirectoryReader r = DirectoryReader.open(dir)) {
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
+ () -> TestUtil.addIndexesSlowly(w2, r));
+ assertEquals("cannot change vector score function from EUCLIDEAN to DOT_PRODUCT for field=\"f\"", expected.getMessage());
+ }
+ }
+ }
+ }
+
+ public void testIllegalMultipleValues() throws Exception {
+ try (Directory dir = newDirectory();
+ IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ IllegalArgumentException expected = expectThrows(IllegalArgumentException.class,
+ () -> w.addDocument(doc));
+ assertEquals("VectorValuesField \"f\" appears more than once in this document (only one value is allowed per field)",
+ expected.getMessage());
+ }
+ }
+
+ public void testIllegalDimensionTooLarge() throws Exception {
+ try (Directory dir = newDirectory();
+ IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ expectThrows(IllegalArgumentException.class,
+ () -> doc.add(new VectorField("f", new float[VectorValues.MAX_DIMENSIONS + 1], ScoreFunction.DOT_PRODUCT)));
+
+ Document doc2 = new Document();
+ doc2.add(new VectorField("f", new float[1], ScoreFunction.EUCLIDEAN));
+ w.addDocument(doc2);
+ }
+ }
+
+ public void testIllegalEmptyVector() throws Exception {
+ try (Directory dir = newDirectory();
+ IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ Exception e = expectThrows(IllegalArgumentException.class,
+ () -> doc.add(new VectorField("f", new float[0], ScoreFunction.NONE)));
+ assertEquals("cannot index an empty vector", e.getMessage());
+
+ Document doc2 = new Document();
+ doc2.add(new VectorField("f", new float[1], ScoreFunction.NONE));
+ w.addDocument(doc2);
+ }
+ }
+
+ // Write vectors, one segment with default codec, another with SimpleText, then forceMerge
+ public void testDifferentCodecs1() throws Exception {
+ try (Directory dir = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ }
+ IndexWriterConfig iwc = newIndexWriterConfig();
+ iwc.setCodec(Codec.forName("SimpleText"));
+ try (IndexWriter w = new IndexWriter(dir, iwc)) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ w.forceMerge(1);
+ }
+ }
+ }
+
+ // Write vectors, one segment with SimpleText, another with default codec, then forceMerge
+ public void testDifferentCodecs2() throws Exception {
+ IndexWriterConfig iwc = newIndexWriterConfig();
+ iwc.setCodec(Codec.forName("SimpleText"));
+ try (Directory dir = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, iwc)) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ }
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ w.forceMerge(1);
+ }
+ }
+ }
+
+ public void testInvalidVectorFieldUsage() {
+ VectorField field = new VectorField("field", new float[2], ScoreFunction.NONE);
+
+ expectThrows(IllegalArgumentException.class, () -> field.setIntValue(14));
+
+ expectThrows(IllegalArgumentException.class, () -> field.setVectorValue(new float[1]));
+
+ assertNull(field.numericValue());
+ }
+
+ public void testDeleteAllVectorDocs() throws Exception {
+ try (Directory dir = newDirectory();
+ IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new StringField("id", "0", Store.NO));
+ doc.add(new VectorField("v", new float[]{2, 3, 5}, ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ w.addDocument(new Document());
+ w.commit();
+
+ try (DirectoryReader r = w.getReader()) {
+ assertNotNull(r.leaves().get(0).reader().getVectorValues("v"));
+ }
+ w.deleteDocuments(new Term("id", "0"));
+ w.forceMerge(1);
+ try (DirectoryReader r = w.getReader()) {
+ assertNull(r.leaves().get(0).reader().getVectorValues("v"));
+ }
+ }
+ }
+
+ public void testVectorFieldMissingFromOneSegment() throws Exception {
+ try (Directory dir = FSDirectory.open(createTempDir());
+ IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new StringField("id", "0", Store.NO));
+ doc.add(new VectorField("v0", new float[]{2, 3, 5}, ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ w.commit();
+
+ doc = new Document();
+ doc.add(new VectorField("v1", new float[]{2, 3, 5}, ScoreFunction.DOT_PRODUCT));
+ w.addDocument(doc);
+ w.forceMerge(1);
+ }
+ }
+
+ public void testSparseVectors() throws Exception {
+ int numDocs = atLeast(1000);
+ int numFields = TestUtil.nextInt(random(), 1, 10);
+ int[] fieldDocCounts = new int[numFields];
+ float[] fieldTotals= new float[numFields];
+ int[] fieldDims = new int[numFields];
+ ScoreFunction[] fieldScoreFunctions = new ScoreFunction[numFields];
+ for (int i = 0; i < numFields; i++) {
+ fieldDims[i] = random().nextInt(20) + 1;
+ fieldScoreFunctions[i] = ScoreFunction.values()[random().nextInt(ScoreFunction.values().length)];
+ }
+ try (Directory dir = newDirectory();
+ RandomIndexWriter w = new RandomIndexWriter(random(), dir, createIndexWriterConfig())) {
+ for (int i = 0; i < numDocs; i++) {
+ Document doc = new Document();
+ for (int field = 0; field < numFields; field++) {
+ String fieldName = "int" + field;
+ if (random().nextInt(100) == 17) {
+ float[] v = randomVector(fieldDims[field]);
+ doc.add(new VectorField(fieldName, v, fieldScoreFunctions[field]));
+ fieldDocCounts[field]++;
+ fieldTotals[field] += v[0];
+ }
+ }
+ w.addDocument(doc);
+ }
+
+ try (IndexReader r = w.getReader()) {
+ for (int field = 0; field < numFields; field++) {
+ int docCount = 0;
+ float checksum = 0;
+ String fieldName = "int" + field;
+ for (LeafReaderContext ctx : r.leaves()) {
+ VectorValues vectors = ctx.reader().getVectorValues(fieldName);
+ if (vectors != null) {
+ docCount += vectors.size();
+ while (vectors.nextDoc() != NO_MORE_DOCS) {
+ checksum += vectors.vectorValue()[0];
+ }
+ }
+ }
+ assertEquals(fieldDocCounts[field], docCount);
+ assertEquals(fieldTotals[field], checksum, 1e-5);
+ }
+ }
+ }
+ }
+
+ public void testIndexedValueNotAliased() throws Exception {
+ // We copy indexed values (as for BinaryDocValues) so the input float[] can be reused across
+ // calls to IndexWriter.addDocument.
+ String fieldName = "field";
+ float[] v = { 0 };
+ try (Directory dir = newDirectory();
+ IndexWriter iw = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc1 = new Document();
+ doc1.add(new VectorField(fieldName, v, VectorValues.ScoreFunction.EUCLIDEAN));
+ v[0] = 1;
+ Document doc2 = new Document();
+ doc2.add(new VectorField(fieldName, v, VectorValues.ScoreFunction.EUCLIDEAN));
+ iw.addDocument(doc1);
+ iw.addDocument(doc2);
+ v[0] = 2;
+ Document doc3 = new Document();
+ doc3.add(new VectorField(fieldName, v, VectorValues.ScoreFunction.EUCLIDEAN));
+ iw.addDocument(doc3);
+ try (IndexReader reader = iw.getReader()) {
+ LeafReader r = reader.leaves().get(0).reader();
+ VectorValues vectorValues = r.getVectorValues(fieldName);
+ vectorValues.nextDoc();
+ assertEquals(1, vectorValues.vectorValue()[0], 0);
+ vectorValues.nextDoc();
+ assertEquals(1, vectorValues.vectorValue()[0], 0);
+ vectorValues.nextDoc();
+ assertEquals(2, vectorValues.vectorValue()[0], 0);
+ }
+ }
+ }
+
+ public void testSortedIndex() throws Exception {
+ IndexWriterConfig iwc = createIndexWriterConfig();
+ iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT)));
+ String fieldName = "field";
+ try (Directory dir = newDirectory();
+ IndexWriter iw = new IndexWriter(dir, iwc)) {
+ add(iw, fieldName, 1, 1, new float[]{1});
+ add(iw, fieldName, 4, 4, new float[]{4});
+ add(iw, fieldName, 3, 3, null);
+ add(iw, fieldName, 2, 2, new float[]{2});
+ try (IndexReader reader = iw.getReader()) {
+ LeafReader leaf = reader.leaves().get(0).reader();
+
+ VectorValues vectorValues = leaf.getVectorValues(fieldName);
+ assertEquals(1, vectorValues.dimension());
+ assertEquals(3, vectorValues.size());
+ assertEquals("1", leaf.document(vectorValues.nextDoc()).get("id"));
+ assertEquals(1f, vectorValues.vectorValue()[0], 0);
+ assertEquals("2", leaf.document(vectorValues.nextDoc()).get("id"));
+ assertEquals(2f, vectorValues.vectorValue()[0], 0);
+ assertEquals("4", leaf.document(vectorValues.nextDoc()).get("id"));
+ assertEquals(4f, vectorValues.vectorValue()[0], 0);
+ assertEquals(NO_MORE_DOCS, vectorValues.nextDoc());
+
+ VectorValues.RandomAccess ra = vectorValues.randomAccess();
+ assertEquals(1f, ra.vectorValue(0)[0], 0);
+ assertEquals(2f, ra.vectorValue(1)[0], 0);
+ assertEquals(4f, ra.vectorValue(2)[0], 0);
+ }
+ }
+ }
+
+ /**
+ * Index random vectors, sometimes skipping documents, sometimes deleting a document,
+ * sometimes merging, sometimes sorting the index,
+ * and verify that the expected values can be read back consistently.
+ */
+ public void testRandom() throws Exception {
+ IndexWriterConfig iwc = createIndexWriterConfig();
+ if (random().nextBoolean()) {
+ iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT)));
+ }
+ String fieldName = "field";
+ try (Directory dir = newDirectory();
+ IndexWriter iw = new IndexWriter(dir, iwc)) {
+ int numDoc = atLeast(100);
+ int dimension = atLeast(10);
+ float[] scratch = new float[dimension];
+ int numValues = 0;
+ float[][] values = new float[numDoc][];
+ for (int i = 0; i < numDoc; i++) {
+ if (random().nextInt(7) != 3) {
+ // usually index a vector value for a doc
+ values[i] = randomVector(dimension);
+ ++numValues;
+ }
+ if (random().nextBoolean() && values[i] != null) {
+ // sometimes use a shared scratch array
+ System.arraycopy(values[i], 0, scratch, 0, scratch.length);
+ add(iw, fieldName, i, scratch);
+ } else {
+ add(iw, fieldName, i, values[i]);
+ }
+ if (random().nextInt(10) == 2) {
+ // sometimes delete a random document
+ int idToDelete = random().nextInt(i + 1);
+ iw.deleteDocuments(new Term("id", Integer.toString(idToDelete)));
+ // and remember that it was deleted
+ if (values[idToDelete] != null) {
+ values[idToDelete] = null;
+ --numValues;
+ }
+ }
+ if (random().nextInt(10) == 3) {
+ iw.commit();
+ }
+ }
+ iw.forceMerge(1);
+ try (IndexReader reader = iw.getReader()) {
+ int valueCount = 0, totalSize = 0;
+ for (LeafReaderContext ctx : reader.leaves()) {
+ VectorValues vectorValues = ctx.reader().getVectorValues(fieldName);
+ if (vectorValues == null) {
+ continue;
+ }
+ totalSize += vectorValues.size();
+ int docId;
+ while ((docId = vectorValues.nextDoc()) != NO_MORE_DOCS) {
+ float[] v = vectorValues.vectorValue();
+ assertEquals(dimension, v.length);
+ String idString = ctx.reader().document(docId).getField("id").stringValue();
+ int id = Integer.parseInt(idString);
+ assertArrayEquals(idString, values[id], v, 0);
+ ++valueCount;
+ }
+ }
+ assertEquals(numValues, valueCount);
+ assertEquals(numValues, totalSize);
+ }
+ }
+ }
+
+ private void add(IndexWriter iw, String field, int id, float[] vector) throws IOException {
+ add(iw, field, id, random().nextInt(100), vector);
+ }
+
+ private void add(IndexWriter iw, String field, int id, int sortkey, float[] vector) throws IOException {
+ Document doc = new Document();
+ if (vector != null) {
+ doc.add(new VectorField(field, vector));
+ }
+ doc.add(new NumericDocValuesField("sortkey", sortkey));
+ doc.add(new StringField("id", Integer.toString(id), Field.Store.YES));
+ iw.addDocument(doc);
+ }
+
+ private float[] randomVector(int dim) {
+ float[] v = new float[dim];
+ for (int i = 0; i < dim; i++) {
+ v[i] = random().nextFloat();
+ }
+ return v;
+ }
+
+ public void testCheckIndexIncludesVectors() throws Exception {
+ try (Directory dir = newDirectory()) {
+ try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) {
+ Document doc = new Document();
+ doc.add(new VectorField("v1", randomVector(3), ScoreFunction.NONE));
+ w.addDocument(doc);
+
+ doc.add(new VectorField("v2", randomVector(3), ScoreFunction.NONE));
+ w.addDocument(doc);
+ }
+
+ ByteArrayOutputStream output = new ByteArrayOutputStream();
+ CheckIndex.Status status = TestUtil.checkIndex(dir, false, true, output);
+ assertEquals(1, status.segmentInfos.size());
+ CheckIndex.Status.SegmentInfoStatus segStatus = status.segmentInfos.get(0);
+ // total 3 vector values were indexed:
+ assertEquals(3, segStatus.vectorValuesStatus.totalVectorValues);
+ // ... across 2 fields:
+ assertEquals(2, segStatus.vectorValuesStatus.totalVectorFields);
+
+ // Make sure CheckIndex in fact declares that it is testing vectors!
+ assertTrue(output.toString(IOUtils.UTF_8).contains("test: vectors..."));
+ }
+ }
+
+ public void testScoreFunctionIdentifiers() throws Exception {
+ // make sure we don't accidentally mess up score function identifiers by re-ordering their enumerators
+ assertEquals(0, ScoreFunction.NONE.ordinal());
+ assertEquals(1, ScoreFunction.EUCLIDEAN.ordinal());
+ assertEquals(2, ScoreFunction.DOT_PRODUCT.ordinal());
+ assertEquals(3, ScoreFunction.values().length);
+ }
+
+}
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
index 10319f9eb54e..01570aa01694 100644
--- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
+++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java
@@ -854,7 +854,7 @@ public void testPrimaryKeys() throws Exception {
System.out.println("TEST: cycle=" + cycle);
}
RandomIndexWriter w = new RandomIndexWriter(random(), dir,
- newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
+ newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
Document doc = new Document();
Field idField = newStringField("id", "", Field.Store.NO);
doc.add(idField);
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java
index 42f7aec1a38e..f5f8c5c8a606 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java
@@ -35,6 +35,7 @@
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.VectorValues;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Version;
@@ -81,7 +82,7 @@ public int size() {
}
FieldInfo fieldInfo = new FieldInfo(field, 0,
true, true, terms.hasPayloads(),
- indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, 0, false);
+ indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, false);
fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo});
}
@@ -139,6 +140,11 @@ public PointValues getPointValues(String fieldName) {
return null;
}
+ @Override
+ public VectorValues getVectorValues(String fieldName) {
+ return null;
+ }
+
@Override
public void checkIntegrity() throws IOException {
}
diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
index 705e0cc744d2..ed00f85f7c34 100644
--- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
+++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java
@@ -501,7 +501,8 @@ private FieldInfo createFieldInfo(String fieldName, int ord, IndexableFieldType
IndexOptions indexOptions = storeOffsets ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
return new FieldInfo(fieldName, ord, fieldType.storeTermVectors(), fieldType.omitNorms(), storePayloads,
indexOptions, fieldType.docValuesType(), -1, Collections.emptyMap(),
- fieldType.pointDimensionCount(), fieldType.pointIndexDimensionCount(), fieldType.pointNumBytes(), false);
+ fieldType.pointDimensionCount(), fieldType.pointIndexDimensionCount(), fieldType.pointNumBytes(),
+ fieldType.vectorDimension(), fieldType.vectorScoreFunction(), false);
}
private void storePointValues(Info info, BytesRef pointValue) {
@@ -521,6 +522,7 @@ private void storeDocValues(Info info, DocValuesType docValuesType, Object docVa
info.fieldInfo.name, info.fieldInfo.number, info.fieldInfo.hasVectors(), info.fieldInfo.hasPayloads(),
info.fieldInfo.hasPayloads(), info.fieldInfo.getIndexOptions(), docValuesType, -1, info.fieldInfo.attributes(),
info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointIndexDimensionCount(), info.fieldInfo.getPointNumBytes(),
+ info.fieldInfo.getVectorDimension(), info.fieldInfo.getVectorScoreFunction(),
info.fieldInfo.isSoftDeletesField()
);
} else if (existingDocValuesType != docValuesType) {
@@ -1241,6 +1243,11 @@ public PointValues getPointValues(String fieldName) {
return new MemoryIndexPointValues(info);
}
+ @Override
+ public VectorValues getVectorValues(String fieldName) {
+ return VectorValues.EMPTY;
+ }
+
@Override
public void checkIntegrity() throws IOException {
// no-op
diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java
index f4a7c9912b86..5fa5072626cb 100644
--- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java
+++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java
@@ -39,7 +39,7 @@
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene87.Lucene87Codec;
+import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntPoint;
@@ -887,7 +887,7 @@ static IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, String... sugges
static IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, final Set suggestFields) {
IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer);
iwc.setMergePolicy(newLogMergePolicy());
- Codec filterCodec = new Lucene87Codec() {
+ Codec filterCodec = new Lucene90Codec() {
CompletionPostingsFormat.FSTLoadMode fstLoadMode =
RandomPicks.randomFrom(random(), CompletionPostingsFormat.FSTLoadMode.values());
PostingsFormat postingsFormat = new Completion84PostingsFormat(fstLoadMode);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java
index ccc7a000846b..6d6b31158089 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java
@@ -354,7 +354,8 @@ public void testMultiClose() throws IOException {
FieldInfo proto = oneDocReader.getFieldInfos().fieldInfo("field");
FieldInfo field = new FieldInfo(proto.name, proto.number, proto.hasVectors(), proto.omitsNorms(), proto.hasPayloads(),
proto.getIndexOptions(), proto.getDocValuesType(), proto.getDocValuesGen(), new HashMap<>(),
- proto.getPointDimensionCount(), proto.getPointIndexDimensionCount(), proto.getPointNumBytes(), proto.isSoftDeletesField());
+ proto.getPointDimensionCount(), proto.getPointIndexDimensionCount(), proto.getPointNumBytes(),
+ proto.getVectorDimension(), proto.getVectorScoreFunction(), proto.isSoftDeletesField());
FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { field } );
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java
index dd74a15dbd8e..ed8a1107c13d 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java
@@ -80,6 +80,8 @@ static FieldInfos shuffleInfos(FieldInfos infos, Random random) {
oldInfo.getPointDimensionCount(), // data dimension count
oldInfo.getPointIndexDimensionCount(), // index dimension count
oldInfo.getPointNumBytes(), // dimension numBytes
+ oldInfo.getVectorDimension(), // number of dimensions of the field's vector
+ oldInfo.getVectorScoreFunction(), // distance function for calculating similarity of the field's vector
oldInfo.isSoftDeletesField()); // used as soft-deletes field
shuffled.set(i, newInfo);
}
diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java
index 2a21bab87444..366a0d9206f9 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java
@@ -130,7 +130,7 @@ public RandomPostingsTester(Random random) throws IOException {
fieldInfoArray[fieldUpto] = new FieldInfo(field, fieldUpto, false, false, true,
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS,
DocValuesType.NONE, -1, new HashMap<>(),
- 0, 0, 0, false);
+ 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, false);
fieldUpto++;
SortedMap postings = new TreeMap<>();
@@ -651,7 +651,7 @@ public FieldsProducer buildIndex(Codec codec, Directory dir, IndexOptions maxAll
DocValuesType.NONE,
-1,
new HashMap<>(),
- 0, 0, 0, false);
+ 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, false);
}
FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java b/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java
index fdd5fb2f8c23..e4168f3a817c 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java
@@ -35,6 +35,7 @@
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.VectorValues;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.Version;
@@ -207,6 +208,11 @@ public NumericDocValues getNormValues(String field) throws IOException {
return null;
}
+ @Override
+ public VectorValues getVectorValues(String field) throws IOException {
+ return null;
+ }
+
@Override
public FieldInfos getFieldInfos() {
return FieldInfos.EMPTY;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
index 81cb328aada2..e15a59a13c8a 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java
@@ -34,7 +34,7 @@
import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec;
import org.apache.lucene.codecs.compressing.CompressingCodec;
import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat;
-import org.apache.lucene.codecs.lucene87.Lucene87Codec;
+import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat;
import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
import org.apache.lucene.index.RandomCodec;
@@ -187,8 +187,8 @@ public String toString() {
codec = new AssertingCodec();
} else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) {
codec = CompressingCodec.randomInstance(random);
- } else if ("Lucene87".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene87"))) {
- codec = new Lucene87Codec(RandomPicks.randomFrom(random, Lucene87StoredFieldsFormat.Mode.values())
+ } else if ("Lucene90".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene90"))) {
+ codec = new Lucene90Codec(RandomPicks.randomFrom(random, Lucene87StoredFieldsFormat.Mode.values())
);
} else if (!"random".equals(TEST_CODEC)) {
codec = Codec.forName(TEST_CODEC);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
index 7104a85c2424..be27b0c23c5c 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java
@@ -46,6 +46,8 @@
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
+import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
+import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
@@ -54,7 +56,7 @@
import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat;
import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat;
-import org.apache.lucene.codecs.lucene87.Lucene87Codec;
+import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
import org.apache.lucene.document.BinaryDocValuesField;
@@ -101,9 +103,6 @@
import org.apache.lucene.store.NoLockFactory;
import org.junit.Assert;
-import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
-import com.carrotsearch.randomizedtesting.generators.RandomPicks;
-
/**
* General utility methods for Lucene unit tests.
*/
@@ -919,7 +918,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) {
* This may be different than {@link Codec#getDefault()} because that is randomized.
*/
public static Codec getDefaultCodec() {
- return new Lucene87Codec();
+ return new Lucene90Codec();
}
/**
diff --git a/solr/core/src/java/org/apache/solr/index/SlowCompositeReaderWrapper.java b/solr/core/src/java/org/apache/solr/index/SlowCompositeReaderWrapper.java
index 4f1b56cecaaa..33c23a17ddd9 100644
--- a/solr/core/src/java/org/apache/solr/index/SlowCompositeReaderWrapper.java
+++ b/solr/core/src/java/org/apache/solr/index/SlowCompositeReaderWrapper.java
@@ -276,6 +276,12 @@ public PointValues getPointValues(String field) {
return null; // because not supported. Throw UOE?
}
+ @Override
+ public VectorValues getVectorValues(String field) {
+ ensureOpen();
+ return null; // because not supported. Throw UOE?
+ }
+
@Override
public FieldInfos getFieldInfos() {
return fieldInfos;
diff --git a/solr/core/src/java/org/apache/solr/schema/SchemaField.java b/solr/core/src/java/org/apache/solr/schema/SchemaField.java
index 2f60ccc1441f..cea4536ce4f1 100644
--- a/solr/core/src/java/org/apache/solr/schema/SchemaField.java
+++ b/solr/core/src/java/org/apache/solr/schema/SchemaField.java
@@ -26,6 +26,7 @@
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.IndexableFieldType;
+import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.SortField;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.SimpleOrderedMap;
@@ -447,6 +448,16 @@ public int pointNumBytes() {
return 0;
}
+ @Override
+ public int vectorDimension() {
+ return 0;
+ }
+
+ @Override
+ public VectorValues.ScoreFunction vectorScoreFunction() {
+ return VectorValues.ScoreFunction.NONE;
+ }
+
@Override
public Map getAttributes() {
return null;
diff --git a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java
index 9a1e740bd8fb..a82a64174eac 100644
--- a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java
+++ b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java
@@ -493,6 +493,8 @@ private static class ReaderWrapper extends FilterLeafReader {
fieldInfo.getPointDimensionCount(),
fieldInfo.getPointIndexDimensionCount(),
fieldInfo.getPointNumBytes(),
+ fieldInfo.getVectorDimension(),
+ fieldInfo.getVectorScoreFunction(),
fieldInfo.isSoftDeletesField());
newInfos.add(f);
} else {
diff --git a/solr/core/src/java/org/apache/solr/search/Insanity.java b/solr/core/src/java/org/apache/solr/search/Insanity.java
index 18b760f63ed7..7eae21750a54 100644
--- a/solr/core/src/java/org/apache/solr/search/Insanity.java
+++ b/solr/core/src/java/org/apache/solr/search/Insanity.java
@@ -67,7 +67,8 @@ private static class InsaneReader extends FilterLeafReader {
if (fi.name.equals(insaneField)) {
filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(),
fi.hasPayloads(), fi.getIndexOptions(), DocValuesType.NONE, -1, Collections.emptyMap(),
- fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()));
+ fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
+ fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField()));
} else {
filteredInfos.add(fi);
}
diff --git a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java
index d83e70ebb5d9..ed62cc4e3fc3 100644
--- a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java
+++ b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java
@@ -284,7 +284,8 @@ public static LeafReader wrap(LeafReader in, Function mapping) {
wrap = true;
newFieldInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(),
fi.hasPayloads(), fi.getIndexOptions(), type, fi.getDocValuesGen(), fi.attributes(),
- fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()));
+ fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(),
+ fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField()));
} else {
newFieldInfos.add(fi);
}
diff --git a/solr/core/src/test/org/apache/solr/search/TestDocSet.java b/solr/core/src/test/org/apache/solr/search/TestDocSet.java
index 00ee6ec7aef2..822830f58c41 100644
--- a/solr/core/src/test/org/apache/solr/search/TestDocSet.java
+++ b/solr/core/src/test/org/apache/solr/search/TestDocSet.java
@@ -37,6 +37,7 @@
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.VectorValues;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.TotalHits;
@@ -343,6 +344,11 @@ public PointValues getPointValues(String field) {
return null;
}
+ @Override
+ public VectorValues getVectorValues(String field) {
+ return null;
+ }
+
@Override
protected void doClose() {
}