diff --git a/.dir-locals.el b/.dir-locals.el new file mode 100644 index 000000000000..c51e1232603b --- /dev/null +++ b/.dir-locals.el @@ -0,0 +1,3 @@ +;; set up Lucene style for emacs +((java-mode . ((c-basic-offset . 2)))) + diff --git a/.gitignore b/.gitignore index 319761963836..2e61830567ff 100644 --- a/.gitignore +++ b/.gitignore @@ -7,7 +7,8 @@ build dist lib test-lib -/*~ +*~ +.#* /build.properties /.idea lucene/**/*.iml diff --git a/gradle/documentation/render-javadoc.gradle b/gradle/documentation/render-javadoc.gradle index 914fd8be58b2..d6acaa7497fa 100644 --- a/gradle/documentation/render-javadoc.gradle +++ b/gradle/documentation/render-javadoc.gradle @@ -157,7 +157,8 @@ configure(project(":lucene:backward-codecs")) { "org.apache.lucene.codecs.lucene60", "org.apache.lucene.codecs.lucene80", "org.apache.lucene.codecs.lucene84", - "org.apache.lucene.codecs.lucene86" + "org.apache.lucene.codecs.lucene86", + "org.apache.lucene.codecs.lucene87" ] } } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene80/Lucene80Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene80/Lucene80Codec.java index bc4e5f360d8e..bfb51df4c5cd 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene80/Lucene80Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene80/Lucene80Codec.java @@ -20,6 +20,7 @@ import org.apache.lucene.codecs.CompoundFormat; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.VectorFormat; import org.apache.lucene.codecs.LiveDocsFormat; import org.apache.lucene.codecs.NormsFormat; import org.apache.lucene.codecs.PointsFormat; @@ -128,4 +129,9 @@ public final DocValuesFormat docValuesFormat() { public final NormsFormat normsFormat() { return normsFormat; } + + @Override + public final VectorFormat vectorFormat() { + return VectorFormat.EMPTY; + } } diff --git 
a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java index 90918c163d2f..46c8372494a7 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene84/Lucene84Codec.java @@ -23,6 +23,7 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.VectorFormat; import org.apache.lucene.codecs.LiveDocsFormat; import org.apache.lucene.codecs.NormsFormat; import org.apache.lucene.codecs.PointsFormat; @@ -136,6 +137,11 @@ public PointsFormat pointsFormat() { return new Lucene60PointsFormat(); } + @Override + public VectorFormat vectorFormat() { + return VectorFormat.EMPTY; + } + /** Returns the postings format that should be used for writing * new segments of field. 
* diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene86/Lucene86Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene86/Lucene86Codec.java index e2974655e75d..8ca5bb65afbf 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene86/Lucene86Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene86/Lucene86Codec.java @@ -24,6 +24,7 @@ import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.VectorFormat; import org.apache.lucene.codecs.LiveDocsFormat; import org.apache.lucene.codecs.NormsFormat; import org.apache.lucene.codecs.PointsFormat; @@ -136,6 +137,11 @@ public final PointsFormat pointsFormat() { return pointsFormat; } + @Override + public final VectorFormat vectorFormat() { + return VectorFormat.EMPTY; + } + /** Returns the postings format that should be used for writing * new segments of field. 
* diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene87/Lucene87Codec.java similarity index 97% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87Codec.java rename to lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene87/Lucene87Codec.java index 5ff407384e22..813ae92c33d4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/Lucene87Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene87/Lucene87Codec.java @@ -31,6 +31,7 @@ import org.apache.lucene.codecs.SegmentInfoFormat; import org.apache.lucene.codecs.StoredFieldsFormat; import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.VectorFormat; import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat; import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat; import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat; @@ -137,6 +138,9 @@ public final PointsFormat pointsFormat() { return pointsFormat; } + @Override + public final VectorFormat vectorFormat() { return VectorFormat.EMPTY; } + /** Returns the postings format that should be used for writing * new segments of field. * diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene87/package.html b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene87/package.html new file mode 100644 index 000000000000..3474ef9e52f7 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/codecs/lucene87/package.html @@ -0,0 +1,42 @@ + + + + + + + + + +Lucene 8.7 file format. 
+ + diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index d6732336efd5..21452ff60b49 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -16,3 +16,4 @@ org.apache.lucene.codecs.lucene80.Lucene80Codec org.apache.lucene.codecs.lucene84.Lucene84Codec org.apache.lucene.codecs.lucene86.Lucene86Codec +org.apache.lucene.codecs.lucene87.Lucene87Codec \ No newline at end of file diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java index e44b046aa294..b973cf42aff9 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/CreateIndexTask.java @@ -29,7 +29,7 @@ import org.apache.lucene.benchmark.byTask.utils.Config; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene87.Lucene87Codec; +import org.apache.lucene.codecs.lucene90.Lucene90Codec; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexCommit; import org.apache.lucene.index.IndexDeletionPolicy; @@ -138,7 +138,7 @@ public static IndexWriterConfig createWriterConfig(Config config, PerfRunData ru if (defaultCodec == null && postingsFormat != null) { try { final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat); - iwConf.setCodec(new Lucene87Codec() { + iwConf.setCodec(new Lucene90Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { return postingsFormatChosen; diff --git 
a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCodec.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCodec.java index 109fec980ca9..266e0d296548 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCodec.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextCodec.java @@ -21,6 +21,7 @@ import org.apache.lucene.codecs.CompoundFormat; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.VectorFormat; import org.apache.lucene.codecs.LiveDocsFormat; import org.apache.lucene.codecs.NormsFormat; import org.apache.lucene.codecs.PointsFormat; @@ -46,6 +47,7 @@ public final class SimpleTextCodec extends Codec { private final DocValuesFormat dvFormat = new SimpleTextDocValuesFormat(); private final CompoundFormat compoundFormat = new SimpleTextCompoundFormat(); private final PointsFormat pointsFormat = new SimpleTextPointsFormat(); + private final VectorFormat vectorFormat = new SimpleTextVectorFormat(); public SimpleTextCodec() { super("SimpleText"); @@ -100,4 +102,9 @@ public CompoundFormat compoundFormat() { public PointsFormat pointsFormat() { return pointsFormat; } + + @Override + public VectorFormat vectorFormat() { + return vectorFormat; + } } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java index 8d178130a91c..e68a7e502540 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextFieldInfosFormat.java @@ -30,6 +30,7 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.VectorValues; import 
org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; @@ -67,6 +68,8 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat { static final BytesRef DATA_DIM_COUNT = new BytesRef(" data dimensional count "); static final BytesRef INDEX_DIM_COUNT = new BytesRef(" index dimensional count "); static final BytesRef DIM_NUM_BYTES = new BytesRef(" dimensional num bytes "); + static final BytesRef VECTOR_NUM_DIMS = new BytesRef(" vector number of dimensions "); + static final BytesRef VECTOR_SCORE_FUNC = new BytesRef(" vector score function "); static final BytesRef SOFT_DELETES = new BytesRef(" soft-deletes "); @Override @@ -146,13 +149,23 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm assert StringHelper.startsWith(scratch.get(), DIM_NUM_BYTES); int dimensionalNumBytes = Integer.parseInt(readString(DIM_NUM_BYTES.length, scratch)); + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), VECTOR_NUM_DIMS); + int vectorNumDimensions = Integer.parseInt(readString(VECTOR_NUM_DIMS.length, scratch)); + + SimpleTextUtil.readLine(input, scratch); + assert StringHelper.startsWith(scratch.get(), VECTOR_SCORE_FUNC); + String scoreFunction = readString(VECTOR_SCORE_FUNC.length, scratch); + VectorValues.ScoreFunction vectorDistFunc = distanceFunction(scoreFunction); + SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), SOFT_DELETES); boolean isSoftDeletesField = Boolean.parseBoolean(readString(SOFT_DELETES.length, scratch)); infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, dvGen, Collections.unmodifiableMap(atts), - dimensionalCount, indexDimensionalCount, dimensionalNumBytes, isSoftDeletesField); + dimensionalCount, indexDimensionalCount, dimensionalNumBytes, + vectorNumDimensions, vectorDistFunc, isSoftDeletesField); } 
SimpleTextUtil.checkFooter(input); @@ -172,6 +185,10 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm public DocValuesType docValuesType(String dvType) { return DocValuesType.valueOf(dvType); } + + public VectorValues.ScoreFunction distanceFunction(String scoreFunction) { + return VectorValues.ScoreFunction.valueOf(scoreFunction); + } private String readString(int offset, BytesRefBuilder scratch) { return new String(scratch.bytes(), offset, scratch.length()-offset, StandardCharsets.UTF_8); @@ -253,6 +270,14 @@ public void write(Directory directory, SegmentInfo segmentInfo, String segmentSu SimpleTextUtil.write(out, Integer.toString(fi.getPointNumBytes()), scratch); SimpleTextUtil.writeNewline(out); + SimpleTextUtil.write(out, VECTOR_NUM_DIMS); + SimpleTextUtil.write(out, Integer.toString(fi.getVectorDimension()), scratch); + SimpleTextUtil.writeNewline(out); + + SimpleTextUtil.write(out, VECTOR_SCORE_FUNC); + SimpleTextUtil.write(out, fi.getVectorScoreFunction().name(), scratch); + SimpleTextUtil.writeNewline(out); + SimpleTextUtil.write(out, SOFT_DELETES); SimpleTextUtil.write(out, Boolean.toString(fi.isSoftDeletesField()), scratch); SimpleTextUtil.writeNewline(out); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorFormat.java new file mode 100644 index 000000000000..3d4b5fedc38c --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorFormat.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.simpletext; + + +import java.io.IOException; + +import org.apache.lucene.codecs.VectorFormat; +import org.apache.lucene.codecs.VectorReader; +import org.apache.lucene.codecs.VectorWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; + +/** For debugging, curiosity, transparency only!! Do not use this codec in production. + * + *

This format stores vector values in a human-readable text file (_N.vec) and per-field metadata in a second text file (_N.gri). You can view these in + * any text editor, and even edit them to alter your index. + * + * @lucene.experimental */ +public final class SimpleTextVectorFormat extends VectorFormat { + + @Override + public VectorWriter fieldsWriter(SegmentWriteState state) throws IOException { + return new SimpleTextVectorWriter(state); + } + + @Override + public VectorReader fieldsReader(SegmentReadState state) throws IOException { + return new SimpleTextVectorReader(state); + } + + /** Extension of vector data file */ + static final String VECTOR_EXTENSION = "vec"; + + /** Extension of vector metadata file */ + static final String META_EXTENSION = "gri"; +} diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorReader.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorReader.java new file mode 100644 index 000000000000..11494c97ed97 --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorReader.java @@ -0,0 +1,304 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.codecs.simpletext; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.VectorReader; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.VectorValues; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.BufferedChecksumIndexInput; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.StringHelper; + +import static org.apache.lucene.codecs.simpletext.SimpleTextVectorWriter.*; + +/** + * Reads vector values from a simple text format. All vectors are read up front and cached in RAM in order to support + * random access. 
+ * FOR RECREATIONAL USE ONLY + * @lucene.experimental + */ +public class SimpleTextVectorReader extends VectorReader { + + private static final BytesRef EMPTY = new BytesRef(""); + + private final SegmentReadState readState; + private final IndexInput dataIn; + private final BytesRefBuilder scratch = new BytesRefBuilder(); + private final Map fieldEntries = new HashMap<>(); + + SimpleTextVectorReader(SegmentReadState readState) throws IOException { + this.readState = readState; + String metaFileName = IndexFileNames.segmentFileName(readState.segmentInfo.name, readState.segmentSuffix, SimpleTextVectorFormat.META_EXTENSION); + try (ChecksumIndexInput in = readState.directory.openChecksumInput(metaFileName, IOContext.DEFAULT)) { + int fieldNumber = readInt(in, FIELD_NUMBER); + while (fieldNumber != -1) { + String fieldName = readString(in, FIELD_NAME); + String scoreFunctionName = readString(in, SCORE_FUNCTION); + VectorValues.ScoreFunction scoreFunction = VectorValues.ScoreFunction.valueOf(scoreFunctionName); + long vectorDataOffset = readLong(in, VECTOR_DATA_OFFSET); + long vectorDataLength = readLong(in, VECTOR_DATA_LENGTH); + int dimension = readInt(in, VECTOR_DIMENSION); + int size = readInt(in, SIZE); + int[] docIds = new int[size]; + for (int i = 0; i < size; i++) { + docIds[i] = readInt(in, EMPTY); + } + assert fieldEntries.containsKey(fieldName) == false; + fieldEntries.put(fieldName, new FieldEntry(dimension, scoreFunction, vectorDataOffset, vectorDataLength, docIds)); + fieldNumber = readInt(in, FIELD_NUMBER); + } + SimpleTextUtil.checkFooter(in); + } + + String vectorFileName = IndexFileNames.segmentFileName(readState.segmentInfo.name, readState.segmentSuffix, SimpleTextVectorFormat.VECTOR_EXTENSION); + dataIn = readState.directory.openInput(vectorFileName, IOContext.DEFAULT); + } + + @Override + public VectorValues getVectorValues(String field) throws IOException { + FieldInfo info = readState.fieldInfos.fieldInfo(field); + if (info == null) { + throw 
new IllegalStateException("No vectors indexed for field=\"" + field + "\""); + } + int dimension = info.getVectorDimension(); + if (dimension == 0) { + return VectorValues.EMPTY; + } + FieldEntry fieldEntry = fieldEntries.get(field); + if (fieldEntry == null) { + throw new IllegalStateException("No entry found for vector field=\"" + field + "\""); + } + if (dimension != fieldEntry.dimension) { + throw new IllegalStateException("Inconsistent vector dimension for field=\"" + field + "\"; " + dimension + " != " + fieldEntry.dimension); + } + IndexInput bytesSlice = dataIn.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength); + return new SimpleTextVectorValues(fieldEntry, bytesSlice); + } + + @Override + public void checkIntegrity() throws IOException { + IndexInput clone = dataIn.clone(); + clone.seek(0); + + // checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included in SimpleTextUtil.CHECKSUM): + long footerStartPos = dataIn.length() - (SimpleTextUtil.CHECKSUM.length + 21); + ChecksumIndexInput input = new BufferedChecksumIndexInput(clone); + while (true) { + SimpleTextUtil.readLine(input, scratch); + if (input.getFilePointer() >= footerStartPos) { + // Make sure we landed at precisely the right location: + if (input.getFilePointer() != footerStartPos) { + throw new CorruptIndexException("SimpleText failure: footer does not start at expected position current=" + input.getFilePointer() + " vs expected=" + footerStartPos, input); + } + SimpleTextUtil.checkFooter(input); + break; + } + } + } + + @Override + public long ramBytesUsed() { + return 0; + } + + @Override + public void close() throws IOException { + dataIn.close(); + } + + private static class FieldEntry { + + final int dimension; + final VectorValues.ScoreFunction scoreFunction; + + final long vectorDataOffset; + final long vectorDataLength; + final int[] ordToDoc; + + FieldEntry(int dimension, VectorValues.ScoreFunction scoreFunction, + long 
vectorDataOffset, long vectorDataLength, int[] ordToDoc) { + this.dimension = dimension; + this.scoreFunction = scoreFunction; + this.vectorDataOffset = vectorDataOffset; + this.vectorDataLength = vectorDataLength; + this.ordToDoc = ordToDoc; + } + + int size() { + return ordToDoc.length; + } + } + + private static class SimpleTextVectorValues extends VectorValues implements VectorValues.RandomAccess { + + private final BytesRefBuilder scratch = new BytesRefBuilder(); + private final FieldEntry entry; + private final IndexInput in; + private final BytesRef binaryValue; + private final float[][] values; + + int curOrd; + + SimpleTextVectorValues(FieldEntry entry, IndexInput in) throws IOException { + this.entry = entry; + this.in = in; + values = new float[entry.size()][entry.dimension]; + binaryValue = new BytesRef(entry.dimension * Float.BYTES); + binaryValue.length = binaryValue.bytes.length; + curOrd = -1; + readAllVectors(); + } + + @Override + public int dimension() { + return entry.dimension; + } + + @Override + public int size() { + return entry.size(); + } + + @Override + public ScoreFunction scoreFunction() { + return entry.scoreFunction; + } + + @Override + public float[] vectorValue() { + return values[curOrd]; + } + + @Override + public BytesRef binaryValue() { + ByteBuffer.wrap(binaryValue.bytes).asFloatBuffer().put(values[curOrd]); // copy the current vector INTO the reusable byte buffer; get() would overwrite the vector with the buffer's zeros + return binaryValue; + } + + @Override + public RandomAccess randomAccess() { + return this; + } + + @Override + public int docID() { + if (curOrd == -1) { + return -1; + } + return curOrd < entry.size() ? entry.ordToDoc[curOrd] : NO_MORE_DOCS; // once exhausted, report NO_MORE_DOCS instead of indexing past the end of ordToDoc + } + + @Override + public int nextDoc() throws IOException { + if (++curOrd < entry.size()) { + return docID(); + } + return NO_MORE_DOCS; + } + + @Override + public int advance(int target) throws IOException { + return slowAdvance(target); + } + + @Override + public long cost() { + return size(); + } + + private void readAllVectors() throws IOException { + for (int i = 0; i < values.length; i++) { + 
readVector(values[i]); + } + } + + private void readVector(float[] value) throws IOException { + SimpleTextUtil.readLine(in, scratch); + // skip leading "[" and strip trailing "]" (the writer emits Arrays.toString(float[]), which has no leading space) + String s = new BytesRef(scratch.bytes(), 1, scratch.length() - 2).utf8ToString(); + String[] floatStrings = s.split(","); + assert floatStrings.length == value.length : " read " + s + " when expecting " + value.length + " floats"; + for (int i = 0; i < floatStrings.length; i++) { + value[i] = Float.parseFloat(floatStrings[i]); + } + } + + @Override + public float[] vectorValue(int targetOrd) throws IOException { + return values[targetOrd]; + } + + @Override + public BytesRef binaryValue(int targetOrd) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public TopDocs search(float[] target, int k, int fanout) throws IOException { + throw new UnsupportedOperationException(); + } + } + + private int readInt(IndexInput in, BytesRef field) throws IOException { + SimpleTextUtil.readLine(in, scratch); + return parseInt(field); + } + + private long readLong(IndexInput in, BytesRef field) throws IOException { + SimpleTextUtil.readLine(in, scratch); + return parseLong(field); + } + + private String readString(IndexInput in, BytesRef field) throws IOException { + SimpleTextUtil.readLine(in, scratch); + return stripPrefix(field); + } + + private boolean startsWith(BytesRef prefix) { + return StringHelper.startsWith(scratch.get(), prefix); + } + + private int parseInt(BytesRef prefix) { + assert startsWith(prefix); + return Integer.parseInt(stripPrefix(prefix)); + } + + private long parseLong(BytesRef prefix) { + assert startsWith(prefix); + return Long.parseLong(stripPrefix(prefix)); + } + + private String stripPrefix(BytesRef prefix) { + int prefixLen = prefix.length; + return new String(scratch.bytes(), prefixLen, scratch.length() - prefixLen, StandardCharsets.UTF_8); + } +} diff --git 
a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorWriter.java new file mode 100644 index 000000000000..3f076cf4a0db --- /dev/null +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextVectorWriter.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.codecs.simpletext; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.codecs.VectorWriter; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.VectorValues; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.IOUtils; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +/** + * Writes vector-valued fields in a plain text format + */ +public class SimpleTextVectorWriter extends VectorWriter { + + static final BytesRef FIELD_NUMBER = new BytesRef("field-number "); + static final BytesRef FIELD_NAME = new BytesRef("field-name "); + static final BytesRef SCORE_FUNCTION = new BytesRef("score-function "); + static final BytesRef VECTOR_DATA_OFFSET = new BytesRef("vector-data-offset "); + static final BytesRef VECTOR_DATA_LENGTH = new BytesRef("vector-data-length "); + static final BytesRef VECTOR_DIMENSION = new BytesRef("vector-dimension "); + static final BytesRef SIZE = new BytesRef("size "); + + private final IndexOutput meta, vectorData; + private final BytesRefBuilder scratch = new BytesRefBuilder(); + + SimpleTextVectorWriter(SegmentWriteState state) throws IOException { + assert state.fieldInfos.hasVectorValues(); + + String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, SimpleTextVectorFormat.META_EXTENSION); + meta = state.directory.createOutput(metaFileName, state.context); + + String vectorDataFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, SimpleTextVectorFormat.VECTOR_EXTENSION); + vectorData = state.directory.createOutput(vectorDataFileName, state.context); + } + + @Override + public void 
writeField(FieldInfo fieldInfo, VectorValues vectors) throws IOException { + long vectorDataOffset = vectorData.getFilePointer(); + List<Integer> docIds = new ArrayList<>(); + int docV, ord = 0; + for (docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc(), ord++) { + writeVectorValue(vectors); + docIds.add(docV); + } + long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset; + writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds); + } + + private void writeVectorValue(VectorValues vectors) throws IOException { + // write vector value + float[] value = vectors.vectorValue(); + assert value.length == vectors.dimension(); + write(vectorData, Arrays.toString(value)); + newline(vectorData); + } + + private void writeMeta(FieldInfo field, long vectorDataOffset, long vectorDataLength, List<Integer> docIds) throws IOException { + writeField(meta, FIELD_NUMBER, field.number); + writeField(meta, FIELD_NAME, field.name); + writeField(meta, SCORE_FUNCTION, field.getVectorScoreFunction().name()); + writeField(meta, VECTOR_DATA_OFFSET, vectorDataOffset); + writeField(meta, VECTOR_DATA_LENGTH, vectorDataLength); + writeField(meta, VECTOR_DIMENSION, field.getVectorDimension()); + writeField(meta, SIZE, docIds.size()); + for (Integer docId : docIds) { + writeInt(meta, docId); + newline(meta); + } + } + + @Override + public void finish() throws IOException { + writeField(meta, FIELD_NUMBER, -1); // end-of-fields sentinel: written once, after the last field, so the reader's loop sees every field before it stops + SimpleTextUtil.writeChecksum(meta, scratch); + SimpleTextUtil.writeChecksum(vectorData, scratch); + } + + @Override + public void close() throws IOException { + IOUtils.close(vectorData, meta); + } + + private void writeField(IndexOutput out, BytesRef fieldName, int value) throws IOException { + write(out, fieldName); + writeInt(out, value); + newline(out); + } + + private void writeField(IndexOutput out, BytesRef fieldName, long value) throws IOException { + write(out, fieldName); + writeLong(out, value); + newline(out); + } + + private void 
writeField(IndexOutput out, BytesRef fieldName, String value) throws IOException { + write(out, fieldName); + write(out, value); + newline(out); + } + + private void write(IndexOutput out, String s) throws IOException { + SimpleTextUtil.write(out, s, scratch); + } + + private void writeInt(IndexOutput out, int x) throws IOException { + SimpleTextUtil.write(out, Integer.toString(x), scratch); + } + + private void writeLong(IndexOutput out, long x) throws IOException { + SimpleTextUtil.write(out, Long.toString(x), scratch); + } + + private void write(IndexOutput out, BytesRef b) throws IOException { + SimpleTextUtil.write(out, b); + } + + private void newline(IndexOutput out) throws IOException { + SimpleTextUtil.writeNewline(out); + } + +} diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java index e3e1d06f7f6f..f2117785522b 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/TestBlockWriter.java @@ -24,6 +24,7 @@ import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.VectorValues; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.ByteBuffersIndexOutput; import org.apache.lucene.util.BytesRef; @@ -120,6 +121,8 @@ private static FieldInfo getMockFieldInfo(String fieldName, int number) { 0, 0, 0, + 0, + VectorValues.ScoreFunction.NONE, true ); } diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java index 5707fb4f6a03..c08bb5506935 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java +++ 
b/lucene/codecs/src/test/org/apache/lucene/codecs/uniformsplit/sharedterms/TestSTBlockReader.java @@ -42,6 +42,7 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.VectorValues; import org.apache.lucene.store.ByteBuffersDirectory; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.Directory; @@ -203,6 +204,8 @@ private static FieldInfo mockFieldInfo(String fieldName, int number) { 0, 0, 0, + 0, + VectorValues.ScoreFunction.NONE, false ); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java index 14fa7935f9f9..3a2bc3ff593f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java @@ -56,8 +56,7 @@ static NamedSPILoader getLoader() { return LOADER; } - // TODO: should we use this, or maybe a system property is better? 
- static Codec defaultCodec = LOADER.lookup("Lucene87"); + static Codec defaultCodec = LOADER.lookup("Lucene90"); } private final String name; @@ -110,6 +109,9 @@ public final String getName() { /** Encodes/decodes points index */ public abstract PointsFormat pointsFormat(); + + /** Encodes/decodes numeric vector fields */ + public abstract VectorFormat vectorFormat(); /** looks up a codec by name */ public static Codec forName(String name) { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java b/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java index 9abd8d4f3313..4a5e934f7247 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/FilterCodec.java @@ -108,4 +108,9 @@ public CompoundFormat compoundFormat() { public PointsFormat pointsFormat() { return delegate.pointsFormat(); } + + @Override + public VectorFormat vectorFormat() { + return delegate.vectorFormat(); + } } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/VectorFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/VectorFormat.java new file mode 100644 index 000000000000..a7a64e1f06c2 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/VectorFormat.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs; + +import java.io.IOException; + +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.VectorValues; + +/** + * Encodes/decodes per-document vectors and any associated indexing structures required to support nearest-neighbor search. + */ +public abstract class VectorFormat { + + /** Sole constructor */ + protected VectorFormat() {} + + /** + * Returns a {@link VectorWriter} to write the vectors to the index. + */ + public abstract VectorWriter fieldsWriter(SegmentWriteState state) throws IOException; + + /** + * Returns a {@link VectorReader} to read the vectors from the index. + */ + public abstract VectorReader fieldsReader(SegmentReadState state) throws IOException; + + /** + * Sentinel format indicating a Codec that does not support vectors: {@code fieldsWriter} throws {@link UnsupportedOperationException}, while {@code fieldsReader} returns a reader whose {@code getVectorValues} always yields {@link VectorValues#EMPTY}.
+ */ + public static final VectorFormat EMPTY = new VectorFormat() { + @Override + public VectorWriter fieldsWriter(SegmentWriteState state) { + throw new UnsupportedOperationException("Attempt to write EMPTY VectorValues: maybe you forgot to use codec=Lucene90"); + } + + @Override + public VectorReader fieldsReader(SegmentReadState state) { + return new VectorReader() { + @Override + public void checkIntegrity() { + } + + @Override + public VectorValues getVectorValues(String field) { + return VectorValues.EMPTY; + } + + @Override + public void close() throws IOException { + } + + @Override + public long ramBytesUsed() { + return 0; + } + }; + } + }; +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/VectorReader.java b/lucene/core/src/java/org/apache/lucene/codecs/VectorReader.java new file mode 100644 index 000000000000..15a3d4659929 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/VectorReader.java @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs; + +import java.io.Closeable; +import java.io.IOException; + +import org.apache.lucene.index.VectorValues; +import org.apache.lucene.util.Accountable; + +/** + * Reads vectors from an index. 
+ */ +public abstract class VectorReader implements Closeable, Accountable { + + /** Sole constructor */ + protected VectorReader() {} + + /** + * Checks consistency of this reader. + * <p>

+ * Note that this may be costly in terms of I/O, e.g. + * may involve computing a checksum value against large data files. + * @lucene.internal + */ + public abstract void checkIntegrity() throws IOException; + + /** Returns the {@link VectorValues} for the given {@code field}. */ + public abstract VectorValues getVectorValues(String field) throws IOException; + + /** + * Returns an instance optimized for merging. This instance may only be + * consumed in the thread that called {@link #getMergeInstance()}. + * <p>

+ * The default implementation returns {@code this}. */ + public VectorReader getMergeInstance() { + return this; + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/VectorWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/VectorWriter.java new file mode 100644 index 000000000000..5dda312466b0 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/VectorWriter.java @@ -0,0 +1,283 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs; + +import java.io.Closeable; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.apache.lucene.index.DocIDMerger; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.MergeState; +import org.apache.lucene.index.VectorValues; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.util.BytesRef; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +/** + * Writes vectors to an index. 
+ */ +public abstract class VectorWriter implements Closeable { + + /** Sole constructor */ + protected VectorWriter() {} + + /** Write all values contained in the provided {@link VectorValues} for the given field */ + public abstract void writeField(FieldInfo fieldInfo, VectorValues values) throws IOException; + + /** Called once at the end before close */ + public abstract void finish() throws IOException; + + /** Merge the vector values from multiple segments, for all fields: checks the integrity of each non-null reader, merges every field that has vector values, then calls {@link #finish()} */ + public void merge(MergeState mergeState) throws IOException { + for (int i = 0; i < mergeState.fieldInfos.length; i++) { + VectorReader reader = mergeState.vectorReaders[i]; + assert reader != null || mergeState.fieldInfos[i].hasVectorValues() == false; + if (reader != null) { + reader.checkIntegrity(); + } + } + for (FieldInfo fieldInfo : mergeState.mergeFieldInfos) { + if (fieldInfo.hasVectorValues()) { + mergeVectors(fieldInfo, mergeState); + } + } + finish(); + } + + private void mergeVectors(FieldInfo mergeFieldInfo, final MergeState mergeState) throws IOException { + if (mergeState.infoStream.isEnabled("VV")) { + mergeState.infoStream.message("VV", "merging " + mergeState.segmentInfo); + } + List<VectorValuesSub> subs = new ArrayList<>(); + int dimension = -1; + VectorValues.ScoreFunction scoreFunction = null; + int nonEmptySegmentIndex = 0; + for (int i = 0; i < mergeState.vectorReaders.length; i++) { + VectorReader vectorReader = mergeState.vectorReaders[i]; + if (vectorReader != null) { + if (mergeFieldInfo != null && mergeFieldInfo.hasVectorValues()) { + int segmentDimension = mergeFieldInfo.getVectorDimension(); + VectorValues.ScoreFunction segmentScoreFunction = mergeFieldInfo.getVectorScoreFunction(); + if (dimension == -1) { + dimension = segmentDimension; + scoreFunction = mergeFieldInfo.getVectorScoreFunction(); + } else if (dimension != segmentDimension) { + throw new IllegalStateException("Varying dimensions for vector-valued field " + mergeFieldInfo.name + + ": " + dimension + "!=" + segmentDimension); + } else if
(scoreFunction != segmentScoreFunction) { + throw new IllegalStateException("Varying score functions for vector-valued field " + mergeFieldInfo.name + + ": " + scoreFunction + "!=" + segmentScoreFunction); + } + VectorValues values = vectorReader.getVectorValues(mergeFieldInfo.name); + if (values != null) { + subs.add(new VectorValuesSub(nonEmptySegmentIndex++, mergeState.docMaps[i], values)); + } + } + } + } + // Create a new VectorValues by iterating over the sub vectors, mapping the resulting + // docids using docMaps in the mergeState. + if (subs.size() > 0) { + writeField(mergeFieldInfo, new VectorValuesMerger(subs, mergeState)); + } + if (mergeState.infoStream.isEnabled("VV")) { + mergeState.infoStream.message("VV", "merge done " + mergeState.segmentInfo); + } + } + + /** Tracks state of one sub-reader that we are merging */ + private static class VectorValuesSub extends DocIDMerger.Sub { + + final MergeState.DocMap docMap; + final VectorValues values; + final int segmentIndex; + int count; + + VectorValuesSub(int segmentIndex, MergeState.DocMap docMap, VectorValues values) { + super(docMap); + this.values = values; + this.segmentIndex = segmentIndex; + this.docMap = docMap; + assert values.docID() == -1; + } + + @Override + public int nextDoc() throws IOException { + int docId = values.nextDoc(); + if (docId != NO_MORE_DOCS) { + // Note: this does count deleted docs since they are present in the to-be-merged segment + ++count; + } + return docId; + } + } + + /** + * View over multiple VectorValues supporting iterator-style access via DocIDMerger. Maintains a reverse ordinal + * mapping for documents having values in order to support random access by dense ordinal.
+ */ + private static class VectorValuesMerger extends VectorValues { + private final List<VectorValuesSub> subs; + private final DocIDMerger<VectorValuesSub> docIdMerger; + private final int[] ordBase; + private final int cost; + private final int size; + + private int docId; + private VectorValuesSub current; + // For each doc with a vector, record its ord in the segments being merged. This enables random access into the + // unmerged segments using the ords from the merged segment. + private int[] ordMap; + private int ord; + + VectorValuesMerger(List<VectorValuesSub> subs, MergeState mergeState) throws IOException { + this.subs = subs; + docIdMerger = DocIDMerger.of(subs, mergeState.needsIndexSort); + int totalCost = 0, totalSize = 0; + for (VectorValuesSub sub : subs) { + totalCost += sub.values.cost(); + totalSize += sub.values.size(); + } + cost = totalCost; + size = totalSize; + ordMap = new int[size]; + ordBase = new int[subs.size()]; + int lastBase = 0; + for (int k = 0; k < subs.size(); k++) { + int size = subs.get(k).values.size(); + ordBase[k] = lastBase; + lastBase += size; + } + docId = -1; + } + + @Override + public int docID() { + return docId; + } + + @Override + public int nextDoc() throws IOException { + current = docIdMerger.next(); + if (current == null) { + docId = NO_MORE_DOCS; + } else { + docId = current.mappedDocID; + ordMap[ord++] = ordBase[current.segmentIndex] + current.count - 1; + } + return docId; + } + + @Override + public float[] vectorValue() throws IOException { + return current.values.vectorValue(); + } + + @Override + public BytesRef binaryValue() throws IOException { + return current.values.binaryValue(); + } + + @Override + public RandomAccess randomAccess() { + return new MergerRandomAccess(); + } + + @Override + public int advance(int target) { + throw new UnsupportedOperationException(); + } + + @Override + public int size() { + return size; + } + + @Override + public long cost() { + return cost; + } + + @Override + public int dimension() { + return
subs.get(0).values.dimension(); + } + + @Override + public VectorValues.ScoreFunction scoreFunction() { + return subs.get(0).values.scoreFunction(); + } + + class MergerRandomAccess implements VectorValues.RandomAccess { + + private final List<VectorValues.RandomAccess> raSubs; + + MergerRandomAccess() { + raSubs = new ArrayList<>(subs.size()); + for (VectorValuesSub sub : subs) { + raSubs.add(sub.values.randomAccess()); + } + } + + @Override + public int size() { + return size; + } + + @Override + public int dimension() { + return VectorValuesMerger.this.dimension(); + } + + @Override + public ScoreFunction scoreFunction() { + return VectorValuesMerger.this.scoreFunction(); + } + + @Override + public float[] vectorValue(int target) throws IOException { + int unmappedOrd = ordMap[target]; + int segmentOrd = Arrays.binarySearch(ordBase, unmappedOrd); + if (segmentOrd < 0) { + // get the index of the greatest lower bound + segmentOrd = -2 - segmentOrd; + } + while(segmentOrd < ordBase.length - 1 && ordBase[segmentOrd + 1] == ordBase[segmentOrd]) { + // forward over empty segments which will share the same ordBase + segmentOrd++; + } + return raSubs.get(segmentOrd).vectorValue(unmappedOrd - ordBase[segmentOrd]); + } + + @Override + public BytesRef binaryValue(int targetOrd) throws IOException { + throw new UnsupportedOperationException(); + } + + @Override + public TopDocs search(float[] target, int k, int fanout) throws IOException { + throw new UnsupportedOperationException(); + } + + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java index f8368bc5e148..0f0fe8c642fd 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/Lucene50FieldInfosFormat.java @@ -31,6 +31,7 @@ import org.apache.lucene.index.IndexFileNames; import
org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.VectorValues; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.Directory; @@ -148,7 +149,8 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm lastAttributes = attributes; try { infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, - indexOptions, docValuesType, dvGen, attributes, 0, 0, 0, false); + indexOptions, docValuesType, dvGen, attributes, 0, 0, 0, + 0, VectorValues.ScoreFunction.NONE, false); } catch (IllegalStateException e) { throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java index e63873a63c66..0c869e9a6dc1 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene50/package-info.java @@ -17,7 +17,7 @@ /** * Components from the Lucene 5.0 index format - * See {@link org.apache.lucene.codecs.lucene80} for an overview + * See {@link org.apache.lucene.codecs.lucene90} for an overview * of the index format. 
*/ package org.apache.lucene.codecs.lucene50; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java index 3b97c2670192..4f6f42a2f99d 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/Lucene60FieldInfosFormat.java @@ -31,6 +31,7 @@ import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.VectorValues; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataOutput; import org.apache.lucene.store.Directory; @@ -164,7 +165,8 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm try { infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValuesType, dvGen, attributes, - pointDataDimensionCount, pointIndexDimensionCount, pointNumBytes, isSoftDeletesField); + pointDataDimensionCount, pointIndexDimensionCount, pointNumBytes, + 0, VectorValues.ScoreFunction.NONE, isSoftDeletesField); } catch (IllegalStateException e) { throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e); } diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java index d807058f6468..9c82b1273145 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene60/package-info.java @@ -16,7 +16,7 @@ */ /** - * Components from the Lucene 6.0 index format. See {@link org.apache.lucene.codecs.lucene86} + * Components from the Lucene 6.0 index format. 
See {@link org.apache.lucene.codecs.lucene90} * for an overview of the current index format. */ package org.apache.lucene.codecs.lucene60; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/package-info.java index c2c31534a79c..f654630c097f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene80/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene80/package-info.java @@ -17,7 +17,7 @@ /** * Components from the Lucene 8.0 index format - * See {@link org.apache.lucene.codecs.lucene84} for an overview + * See {@link org.apache.lucene.codecs.lucene90} for an overview * of the index format. */ package org.apache.lucene.codecs.lucene80; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/package-info.java index 5940a47dca83..00d7edd04d5c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene84/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene84/package-info.java @@ -16,7 +16,7 @@ */ /** - * Components from the Lucene 8.4 index format. See {@link org.apache.lucene.codecs.lucene86} + * Components from the Lucene 8.4 index format. See {@link org.apache.lucene.codecs.lucene90} * for an overview of the current index format. */ package org.apache.lucene.codecs.lucene84; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java index 13f35a189e6d..d486d3796a8f 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene86/package-info.java @@ -16,401 +16,7 @@ */ /** - * Lucene 8.6 file format. - * - *

Apache Lucene - Index File Formats

- *
- * - *
- * - *

Introduction

- *
- *

This document defines the index file formats used in this version of Lucene. - * If you are using a different version of Lucene, please consult the copy of - * docs/ that was distributed with - * the version you are using.

- *

This document attempts to provide a high-level definition of the Apache - * Lucene file formats.

- *
- * - *

Definitions

- *
- *

The fundamental concepts in Lucene are index, document, field and term.

- *

An index contains a sequence of documents.

- * - *

The same sequence of bytes in two different fields is considered a different - * term. Thus terms are represented as a pair: the string naming the field, and the - * bytes within the field.

- * - *

Inverted Indexing

- *

The index stores statistics about terms in order to make term-based search - * more efficient. Lucene's index falls into the family of indexes known as an - * inverted index. This is because it can list, for a term, the documents - * that contain it. This is the inverse of the natural relationship, in which - * documents list terms.

- * - *

Types of Fields

- *

In Lucene, fields may be stored, in which case their text is stored - * in the index literally, in a non-inverted manner. Fields that are inverted are - * called indexed. A field may be both stored and indexed.

- *

The text of a field may be tokenized into terms to be indexed, or the - * text of a field may be used literally as a term to be indexed. Most fields are - * tokenized, but sometimes it is useful for certain identifier fields to be - * indexed literally.

- *

See the {@link org.apache.lucene.document.Field Field} - * java docs for more information on Fields.

- * - *

Segments

- *

Lucene indexes may be composed of multiple sub-indexes, or segments. - * Each segment is a fully independent index, which could be searched separately. - * Indexes evolve by:

- *
    - *
  1. Creating new segments for newly added documents.
  2. - *
  3. Merging existing segments.
  4. - *
- *

Searches may involve multiple segments and/or multiple indexes, each index - * potentially composed of a set of segments.

- * - *

Document Numbers

- *

Internally, Lucene refers to documents by an integer document number. - * The first document added to an index is numbered zero, and each subsequent - * document added gets a number one greater than the previous.

- *

Note that a document's number may change, so caution should be taken when - * storing these numbers outside of Lucene. In particular, numbers may change in - * the following situations:

- * - *
- * - *

Index Structure Overview

- *
- *

Each segment index maintains the following:

- * - *

Details on each of these are provided in their linked pages.

- *
- * - *

File Naming

- *
- *

All files belonging to a segment have the same name with varying extensions. - * The extensions correspond to the different file formats described below. When - * using the Compound File format (default for small segments) these files (except - * for the Segment info file, the Lock file, and Deleted documents file) are collapsed - * into a single .cfs file (see below for details)

- *

Typically, all segments in an index are stored in a single directory, - * although this is not required.

- *

File names are never re-used. That is, when any file is saved - * to the Directory it is given a never before used filename. This is achieved - * using a simple generations approach. For example, the first segments file is - * segments_1, then segments_2, etc. The generation is a sequential long integer - * represented in alpha-numeric (base 36) form.

- *
- * - *

Summary of File Extensions

- *
- *

The following table summarizes the names and extensions of the files in - * Lucene:

- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
lucene filenames by extension
NameExtensionBrief Description
{@link org.apache.lucene.index.SegmentInfos Segments File}segments_NStores information about a commit point
Lock Filewrite.lockThe Write lock prevents multiple IndexWriters from writing to the same - * file.
{@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment Info}.siStores metadata about a segment
{@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File}.cfs, .cfeAn optional "virtual" file consisting of all the other index files for - * systems that frequently run out of file handles.
{@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Fields}.fnmStores information about the fields
Field Index.fdxContains pointers to field data
Field Data.fdtThe stored fields for documents
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Dictionary}.timThe term dictionary, stores term info
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Index}.tipThe index into the Term Dictionary
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Frequencies}.docContains the list of docs which contain each term along with frequency
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Positions}.posStores position information about where a term occurs in the index
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Payloads}.payStores additional per-position metadata information such as character offsets and user payloads
{@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Norms}.nvd, .nvmEncodes length and boost factors for docs and fields
{@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values}.dvd, .dvmEncodes additional scoring factors or other per-document information.
{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index}.tvxStores offset into the document data file
{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data}.tvdContains term vector data.
{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}.livInfo about what documents are live
{@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}.dii, .dimHolds indexed points, if any
- *
- * - *

Lock File

- * The write lock, which is stored in the index directory by default, is named - * "write.lock". If the lock directory is different from the index directory then - * the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix - * derived from the full path to the index directory. When this file is present, a - * writer is currently modifying the index (adding or removing documents). This - * lock file ensures that only one writer is modifying the index at a time. - * - *

History

- *

Compatibility notes are provided in this document, describing how file - * formats have changed from prior versions:

- * - * - *

Limitations

- *
- *

Lucene uses a Java int to refer to - * document numbers, and the index file format uses an Int32 - * on-disk to store document numbers. This is a limitation - * of both the index file format and the current implementation. Eventually these - * should be replaced with either UInt64 values, or - * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.

- *
+ * Components from the Lucene 8.6 index format. See {@link org.apache.lucene.codecs.lucene90} + * for an overview of the current index format. */ package org.apache.lucene.codecs.lucene86; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene87/package-info.java index 75facdb2fb41..0df8615a21e9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene87/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene87/package-info.java @@ -16,401 +16,7 @@ */ /** - * Lucene 8.7 file format. - * - *

Apache Lucene - Index File Formats

- *
- * - *
- * - *

Introduction

- *
- *

This document defines the index file formats used in this version of Lucene. - * If you are using a different version of Lucene, please consult the copy of - * docs/ that was distributed with - * the version you are using.

- *

This document attempts to provide a high-level definition of the Apache - * Lucene file formats.

- *
- * - *

Definitions

- *
- *

The fundamental concepts in Lucene are index, document, field and term.

- *

An index contains a sequence of documents.

- * - *

The same sequence of bytes in two different fields is considered a different - * term. Thus terms are represented as a pair: the string naming the field, and the - * bytes within the field.

- * - *

Inverted Indexing

- *

The index stores statistics about terms in order to make term-based search - * more efficient. Lucene's index falls into the family of indexes known as an - * inverted index. This is because it can list, for a term, the documents - * that contain it. This is the inverse of the natural relationship, in which - * documents list terms.

- * - *

Types of Fields

- *

In Lucene, fields may be stored, in which case their text is stored - * in the index literally, in a non-inverted manner. Fields that are inverted are - * called indexed. A field may be both stored and indexed.

- *

The text of a field may be tokenized into terms to be indexed, or the - * text of a field may be used literally as a term to be indexed. Most fields are - * tokenized, but sometimes it is useful for certain identifier fields to be - * indexed literally.

- *

See the {@link org.apache.lucene.document.Field Field} - * java docs for more information on Fields.

- * - *

Segments

- *

Lucene indexes may be composed of multiple sub-indexes, or segments. - * Each segment is a fully independent index, which could be searched separately. - * Indexes evolve by:

- *
    - *
  1. Creating new segments for newly added documents.
  2. - *
  3. Merging existing segments.
  4. - *
- *

Searches may involve multiple segments and/or multiple indexes, each index - * potentially composed of a set of segments.

- * - *

Document Numbers

- *

Internally, Lucene refers to documents by an integer document number. - * The first document added to an index is numbered zero, and each subsequent - * document added gets a number one greater than the previous.

- *

Note that a document's number may change, so caution should be taken when - * storing these numbers outside of Lucene. In particular, numbers may change in - * the following situations:

- * - *
- * - *

Index Structure Overview

- *
- *

Each segment index maintains the following:

- * - *

Details on each of these are provided in their linked pages.

- *
- * - *

File Naming

- *
- *

All files belonging to a segment have the same name with varying extensions. - * The extensions correspond to the different file formats described below. When - * using the Compound File format (default for small segments) these files (except - * for the Segment info file, the Lock file, and Deleted documents file) are collapsed - * into a single .cfs file (see below for details)

- *

Typically, all segments in an index are stored in a single directory, - * although this is not required.

- *

File names are never re-used. That is, when any file is saved - * to the Directory it is given a never before used filename. This is achieved - * using a simple generations approach. For example, the first segments file is - * segments_1, then segments_2, etc. The generation is a sequential long integer - * represented in alpha-numeric (base 36) form.

- *
- * - *

Summary of File Extensions

- *
- *

The following table summarizes the names and extensions of the files in - * Lucene:

- * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - * - *
lucene filenames by extension
NameExtensionBrief Description
{@link org.apache.lucene.index.SegmentInfos Segments File}segments_NStores information about a commit point
Lock Filewrite.lockThe Write lock prevents multiple IndexWriters from writing to the same - * file.
{@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment Info}.siStores metadata about a segment
{@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File}.cfs, .cfeAn optional "virtual" file consisting of all the other index files for - * systems that frequently run out of file handles.
{@link org.apache.lucene.codecs.lucene60.Lucene60FieldInfosFormat Fields}.fnmStores information about the fields
{@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Index}.fdxContains pointers to field data
{@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Data}.fdtThe stored fields for documents
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Dictionary}.timThe term dictionary, stores term info
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Index}.tipThe index into the Term Dictionary
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Frequencies}.docContains the list of docs which contain each term along with frequency
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Positions}.posStores position information about where a term occurs in the index
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Payloads}.payStores additional per-position metadata information such as character offsets and user payloads
{@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Norms}.nvd, .nvmEncodes length and boost factors for docs and fields
{@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values}.dvd, .dvmEncodes additional scoring factors or other per-document information.
{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index}.tvxStores offset into the document data file
{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data}.tvdContains term vector data.
{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}.livInfo about what documents are live
{@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}.dii, .dimHolds indexed points, if any
- *
- * - *

Lock File

- * The write lock, which is stored in the index directory by default, is named - * "write.lock". If the lock directory is different from the index directory then - * the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix - * derived from the full path to the index directory. When this file is present, a - * writer is currently modifying the index (adding or removing documents). This - * lock file ensures that only one writer is modifying the index at a time. - * - *

History

- *

Compatibility notes are provided in this document, describing how file - * formats have changed from prior versions:

- * - * - *

Limitations

- *
- *

Lucene uses a Java int to refer to - * document numbers, and the index file format uses an Int32 - * on-disk to store document numbers. This is a limitation - * of both the index file format and the current implementation. Eventually these - * should be replaced with either UInt64 values, or - * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.

- *
+ * Components from the Lucene 8.7 index format. See {@link org.apache.lucene.codecs.lucene90} + * for an overview of the current index format. */ package org.apache.lucene.codecs.lucene87; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java new file mode 100644 index 000000000000..fa8f7afae2db --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90Codec.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene90; + +import java.util.Objects; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CompoundFormat; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.VectorFormat; +import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PointsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat; +import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat; +import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat; +import org.apache.lucene.codecs.lucene80.Lucene80NormsFormat; +import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat; +import org.apache.lucene.codecs.lucene86.Lucene86PointsFormat; +import org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat; +import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +/** + * Implements the Lucene 9.0 index format + *

+ * If you want to reuse functionality of this codec in another codec, extend + * {@link FilterCodec}. + * + * @see org.apache.lucene.codecs.lucene90 package documentation for file format details. + * + * @lucene.experimental + */ +public class Lucene90Codec extends Codec { + private final TermVectorsFormat vectorsFormat = new Lucene50TermVectorsFormat(); + private final FieldInfosFormat fieldInfosFormat = new Lucene90FieldInfosFormat(); + private final SegmentInfoFormat segmentInfosFormat = new Lucene86SegmentInfoFormat(); + private final LiveDocsFormat liveDocsFormat = new Lucene50LiveDocsFormat(); + private final CompoundFormat compoundFormat = new Lucene50CompoundFormat(); + private final PostingsFormat defaultFormat; + + private final PostingsFormat postingsFormat = new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Lucene90Codec.this.getPostingsFormatForField(field); + } + }; + + private final DocValuesFormat docValuesFormat = new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Lucene90Codec.this.getDocValuesFormatForField(field); + } + }; + + private final VectorFormat vectorFormat = new Lucene90VectorFormat(); + + private final StoredFieldsFormat storedFieldsFormat; + + /** + * Instantiates a new codec. + */ + public Lucene90Codec() { + this(Lucene87StoredFieldsFormat.Mode.BEST_SPEED); + } + + /** + * Instantiates a new codec, specifying the stored fields compression + * mode to use. + * @param mode stored fields compression mode to use for newly + * flushed/merged segments. 
+ */ + public Lucene90Codec(Lucene87StoredFieldsFormat.Mode mode) { + super("Lucene90"); + this.storedFieldsFormat = new Lucene87StoredFieldsFormat(Objects.requireNonNull(mode)); + this.defaultFormat = new Lucene84PostingsFormat(); + } + + @Override + public final StoredFieldsFormat storedFieldsFormat() { + return storedFieldsFormat; + } + + @Override + public final TermVectorsFormat termVectorsFormat() { + return vectorsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + @Override + public final FieldInfosFormat fieldInfosFormat() { + return fieldInfosFormat; + } + + @Override + public final SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } + + @Override + public final LiveDocsFormat liveDocsFormat() { + return liveDocsFormat; + } + + @Override + public final CompoundFormat compoundFormat() { + return compoundFormat; + } + + @Override + public final PointsFormat pointsFormat() { + return new Lucene86PointsFormat(); + } + + @Override + public final VectorFormat vectorFormat() { + return vectorFormat; + } + + /** Returns the postings format that should be used for writing + * new segments of field. + * + * The default implementation always returns "Lucene84". + *

+ * WARNING: if you subclass, you are responsible for index + * backwards compatibility: future versions of Lucene are only + * guaranteed to be able to read the default implementation. + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultFormat; + } + + /** Returns the docvalues format that should be used for writing + * new segments of field. + * + * The default implementation always returns "Lucene80". + *

+ * WARNING: if you subclass, you are responsible for index + * backwards compatibility: future versions of Lucene are only + * guaranteed to be able to read the default implementation. + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + private final DocValuesFormat defaultDVFormat = DocValuesFormat.forName("Lucene80"); + + private final NormsFormat normsFormat = new Lucene80NormsFormat(); + + @Override + public final NormsFormat normsFormat() { + return normsFormat; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90FieldInfosFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90FieldInfosFormat.java new file mode 100644 index 000000000000..cab87e9102e3 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90FieldInfosFormat.java @@ -0,0 +1,339 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.lucene.codecs.lucene90; + + +import java.io.IOException; +import java.util.Collections; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.VectorValues; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; + +/** + * Lucene 9.0 Field Infos format. + *

Field names are stored in the field info file, with suffix .fnm. + *

FieldInfos (.fnm) --> Header,FieldsCount, <FieldName,FieldNumber, + * FieldBits,DocValuesBits,DocValuesGen,Attributes,DimensionCount,DimensionNumBytes> FieldsCount,Footer + *

Data types: + *

+ * Field Descriptions: + * + * + * @lucene.experimental + */ +public final class Lucene90FieldInfosFormat extends FieldInfosFormat { + + /** Sole constructor. */ + public Lucene90FieldInfosFormat() { + } + + @Override + public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, IOContext context) throws IOException { + final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, EXTENSION); + try (ChecksumIndexInput input = directory.openChecksumInput(fileName, context)) { + Throwable priorE = null; + FieldInfo infos[] = null; + try { + int version = CodecUtil.checkIndexHeader(input, + Lucene90FieldInfosFormat.CODEC_NAME, + Lucene90FieldInfosFormat.FORMAT_START, + Lucene90FieldInfosFormat.FORMAT_CURRENT, + segmentInfo.getId(), segmentSuffix); + + final int size = input.readVInt(); //read in the size + infos = new FieldInfo[size]; + + // previous field's attribute map, we share when possible: + Map lastAttributes = Collections.emptyMap(); + + for (int i = 0; i < size; i++) { + String name = input.readString(); + final int fieldNumber = input.readVInt(); + if (fieldNumber < 0) { + throw new CorruptIndexException("invalid field number for field: " + name + ", fieldNumber=" + fieldNumber, input); + } + byte bits = input.readByte(); + boolean storeTermVector = (bits & STORE_TERMVECTOR) != 0; + boolean omitNorms = (bits & OMIT_NORMS) != 0; + boolean storePayloads = (bits & STORE_PAYLOADS) != 0; + boolean isSoftDeletesField = (bits & SOFT_DELETES_FIELD) != 0; + + final IndexOptions indexOptions = getIndexOptions(input, input.readByte()); + + // DV Types are packed in one byte + final DocValuesType docValuesType = getDocValuesType(input, input.readByte()); + final long dvGen = input.readLong(); + Map attributes = input.readMapOfStrings(); + // just use the last field's map if its the same + if (attributes.equals(lastAttributes)) { + attributes = lastAttributes; + } + lastAttributes = attributes; + int 
pointDataDimensionCount = input.readVInt(); + int pointNumBytes; + int pointIndexDimensionCount = pointDataDimensionCount; + if (pointDataDimensionCount != 0) { + if (version >= Lucene90FieldInfosFormat.FORMAT_SELECTIVE_INDEXING) { + pointIndexDimensionCount = input.readVInt(); + } + pointNumBytes = input.readVInt(); + } else { + pointNumBytes = 0; + } + final int vectorDimension = input.readVInt(); + final VectorValues.ScoreFunction vectorDistFunc = getDistFunc(input, input.readByte()); + + try { + infos[i] = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, + indexOptions, docValuesType, dvGen, attributes, + pointDataDimensionCount, pointIndexDimensionCount, pointNumBytes, vectorDimension, vectorDistFunc, isSoftDeletesField); + infos[i].checkConsistency(); + } catch (IllegalStateException e) { + throw new CorruptIndexException("invalid fieldinfo for field: " + name + ", fieldNumber=" + fieldNumber, input, e); + } + } + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(input, priorE); + } + return new FieldInfos(infos); + } + } + + static { + // We "mirror" DocValues enum values with the constants below; let's try to ensure if we add a new DocValuesType while this format is + // still used for writing, we remember to fix this encoding: + assert DocValuesType.values().length == 6; + } + + private static byte docValuesByte(DocValuesType type) { + switch(type) { + case NONE: + return 0; + case NUMERIC: + return 1; + case BINARY: + return 2; + case SORTED: + return 3; + case SORTED_SET: + return 4; + case SORTED_NUMERIC: + return 5; + default: + // BUG + throw new AssertionError("unhandled DocValuesType: " + type); + } + } + + private static DocValuesType getDocValuesType(IndexInput input, byte b) throws IOException { + switch(b) { + case 0: + return DocValuesType.NONE; + case 1: + return DocValuesType.NUMERIC; + case 2: + return DocValuesType.BINARY; + case 3: + return DocValuesType.SORTED; + case 4: 
+ return DocValuesType.SORTED_SET; + case 5: + return DocValuesType.SORTED_NUMERIC; + default: + throw new CorruptIndexException("invalid docvalues byte: " + b, input); + } + } + + private static VectorValues.ScoreFunction getDistFunc(IndexInput input, byte b) throws IOException { + if (b < 0 || b >= VectorValues.ScoreFunction.values().length) { + throw new CorruptIndexException("invalid distance function: " + b, input); + } + return VectorValues.ScoreFunction.values()[b]; + } + + static { + // We "mirror" IndexOptions enum values with the constants below; let's try to ensure if we add a new IndexOption while this format is + // still used for writing, we remember to fix this encoding: + assert IndexOptions.values().length == 5; + } + + private static byte indexOptionsByte(IndexOptions indexOptions) { + switch (indexOptions) { + case NONE: + return 0; + case DOCS: + return 1; + case DOCS_AND_FREQS: + return 2; + case DOCS_AND_FREQS_AND_POSITIONS: + return 3; + case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS: + return 4; + default: + // BUG: + throw new AssertionError("unhandled IndexOptions: " + indexOptions); + } + } + + private static IndexOptions getIndexOptions(IndexInput input, byte b) throws IOException { + switch (b) { + case 0: + return IndexOptions.NONE; + case 1: + return IndexOptions.DOCS; + case 2: + return IndexOptions.DOCS_AND_FREQS; + case 3: + return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; + case 4: + return IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; + default: + // BUG + throw new CorruptIndexException("invalid IndexOptions byte: " + b, input); + } + } + + @Override + public void write(Directory directory, SegmentInfo segmentInfo, String segmentSuffix, FieldInfos infos, IOContext context) throws IOException { + final String fileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, EXTENSION); + try (IndexOutput output = directory.createOutput(fileName, context)) { + CodecUtil.writeIndexHeader(output, 
Lucene90FieldInfosFormat.CODEC_NAME, Lucene90FieldInfosFormat.FORMAT_CURRENT, segmentInfo.getId(), segmentSuffix); + output.writeVInt(infos.size()); + for (FieldInfo fi : infos) { + fi.checkConsistency(); + + output.writeString(fi.name); + output.writeVInt(fi.number); + + byte bits = 0x0; + if (fi.hasVectors()) bits |= STORE_TERMVECTOR; + if (fi.omitsNorms()) bits |= OMIT_NORMS; + if (fi.hasPayloads()) bits |= STORE_PAYLOADS; + if (fi.isSoftDeletesField()) bits |= SOFT_DELETES_FIELD; + output.writeByte(bits); + + output.writeByte(indexOptionsByte(fi.getIndexOptions())); + + // pack the DV type and hasNorms in one byte + output.writeByte(docValuesByte(fi.getDocValuesType())); + output.writeLong(fi.getDocValuesGen()); + output.writeMapOfStrings(fi.attributes()); + output.writeVInt(fi.getPointDimensionCount()); + if (fi.getPointDimensionCount() != 0) { + output.writeVInt(fi.getPointIndexDimensionCount()); + output.writeVInt(fi.getPointNumBytes()); + } + output.writeVInt(fi.getVectorDimension()); + output.writeByte((byte) fi.getVectorScoreFunction().ordinal()); + } + CodecUtil.writeFooter(output); + } + } + + /** Extension of field infos */ + static final String EXTENSION = "fnm"; + + // Codec header + static final String CODEC_NAME = "Lucene90FieldInfos"; + static final int FORMAT_START = 0; + static final int FORMAT_SOFT_DELETES = 1; + static final int FORMAT_SELECTIVE_INDEXING = 2; + static final int FORMAT_CURRENT = FORMAT_SELECTIVE_INDEXING; + + // Field flags + static final byte STORE_TERMVECTOR = 0x1; + static final byte OMIT_NORMS = 0x2; + static final byte STORE_PAYLOADS = 0x4; + static final byte SOFT_DELETES_FIELD = 0x8; +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorFormat.java new file mode 100644 index 000000000000..632bc8154d8a --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorFormat.java @@ -0,0 
+1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; + +import org.apache.lucene.codecs.VectorFormat; +import org.apache.lucene.codecs.VectorReader; +import org.apache.lucene.codecs.VectorWriter; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; + +/** + * Lucene 9.0 vector format, which encodes dense numeric vector values. + * TODO: add support for approximate KNN search. 
+ */ +public final class Lucene90VectorFormat extends VectorFormat { + + static final String META_CODEC_NAME = "Lucene90VectorFormatMeta"; + static final String VECTOR_DATA_CODEC_NAME = "Lucene90VectorFormatData"; + + static final String META_EXTENSION = "vem"; + static final String VECTOR_DATA_EXTENSION = "vec"; + + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + /** Sole constructor */ + public Lucene90VectorFormat() { + } + + @Override + public VectorWriter fieldsWriter(SegmentWriteState state) throws IOException { + return new Lucene90VectorWriter(state); + } + + @Override + public VectorReader fieldsReader(SegmentReadState state) throws IOException { + return new Lucene90VectorReader(state); + } + +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorReader.java new file mode 100644 index 000000000000..4a32a361fb7d --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorReader.java @@ -0,0 +1,345 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.FloatBuffer; +import java.util.HashMap; +import java.util.Map; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.VectorReader; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.VectorValues; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.RamUsageEstimator; + +/** + * Reads vectors from the index segments. + * @lucene.experimental + */ +public final class Lucene90VectorReader extends VectorReader { + + private final FieldInfos fieldInfos; + private final Map fields = new HashMap<>(); + private final IndexInput vectorData; + private final int maxDoc; + + Lucene90VectorReader(SegmentReadState state) throws IOException { + this.fieldInfos = state.fieldInfos; + this.maxDoc = state.segmentInfo.maxDoc(); + + String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene90VectorFormat.META_EXTENSION); + int versionMeta = -1; + try (ChecksumIndexInput meta = state.directory.openChecksumInput(metaFileName, state.context)) { + Throwable priorE = null; + try { + versionMeta = CodecUtil.checkIndexHeader(meta, + Lucene90VectorFormat.META_CODEC_NAME, + Lucene90VectorFormat.VERSION_START, + Lucene90VectorFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + readFields(meta, state.fieldInfos); + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(meta, priorE); + } + } + + boolean success = false; + + 
String vectorDataFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene90VectorFormat.VECTOR_DATA_EXTENSION); + this.vectorData = state.directory.openInput(vectorDataFileName, state.context); + try { + int versionVectorData = CodecUtil.checkIndexHeader(vectorData, + Lucene90VectorFormat.VECTOR_DATA_CODEC_NAME, + Lucene90VectorFormat.VERSION_START, + Lucene90VectorFormat.VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + if (versionMeta != versionVectorData) { + throw new CorruptIndexException("Format versions mismatch: meta=" + versionMeta + ", vector data=" + versionVectorData, vectorData); + } + CodecUtil.retrieveChecksum(vectorData); + + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(this.vectorData); + } + } + } + + private void readFields(ChecksumIndexInput meta, FieldInfos infos) throws IOException { + for (int fieldNumber = meta.readInt(); fieldNumber != -1; fieldNumber = meta.readInt()) { + FieldInfo info = infos.fieldInfo(fieldNumber); + if (info == null) { + throw new CorruptIndexException("Invalid field number: " + fieldNumber, meta); + } + int scoreFunctionId = meta.readInt(); + if (scoreFunctionId < 0 || scoreFunctionId >= VectorValues.ScoreFunction.values().length) { + throw new CorruptIndexException("Invalid score function id: " + scoreFunctionId, meta); + } + VectorValues.ScoreFunction scoreFunction = VectorValues.ScoreFunction.values()[scoreFunctionId]; + long vectorDataOffset = meta.readVLong(); + long vectorDataLength = meta.readVLong(); + int dimension = meta.readInt(); + int size = meta.readInt(); + int[] ordToDoc = new int[size]; + for (int i = 0; i < size; i++) { + int doc = meta.readVInt(); + ordToDoc[i] = doc; + } + FieldEntry fieldEntry = new FieldEntry(dimension, scoreFunction, maxDoc, vectorDataOffset, vectorDataLength, + ordToDoc); + fields.put(info.name, fieldEntry); + } + } + + @Override + public long ramBytesUsed() { + long totalBytes = 
RamUsageEstimator.shallowSizeOfInstance(Lucene90VectorReader.class); + totalBytes += RamUsageEstimator.sizeOfMap(fields, RamUsageEstimator.shallowSizeOfInstance(FieldEntry.class)); + for (FieldEntry entry : fields.values()) { + totalBytes += RamUsageEstimator.sizeOf(entry.ordToDoc); + } + return totalBytes; + } + + @Override + public void checkIntegrity() throws IOException { + CodecUtil.checksumEntireFile(vectorData); + } + + @Override + public VectorValues getVectorValues(String field) throws IOException { + FieldInfo info = fieldInfos.fieldInfo(field); + if (info == null) { + return null; + } + int dimension = info.getVectorDimension(); + if (dimension == 0) { + return VectorValues.EMPTY; + } + FieldEntry fieldEntry = fields.get(field); + if (fieldEntry == null) { + // There is a FieldInfo, but no vectors. Should we have deleted the FieldInfo? + return null; + } + if (dimension != fieldEntry.dimension) { + throw new IllegalStateException("Inconsistent vector dimension for field=\"" + field + "\"; " + dimension + " != " + fieldEntry.dimension); + } + long numBytes = (long) fieldEntry.size() * dimension * Float.BYTES; + if (numBytes != fieldEntry.vectorDataLength) { + throw new IllegalStateException("Vector data length " + fieldEntry.vectorDataLength + + " not matching size=" + fieldEntry.size() + " * dim=" + dimension + " * 4 = " + + numBytes); + } + IndexInput bytesSlice = vectorData.slice("vector-data", fieldEntry.vectorDataOffset, fieldEntry.vectorDataLength); + return new OffHeapVectorValues(fieldEntry, bytesSlice); + } + + @Override + public void close() throws IOException { + vectorData.close(); + } + + private static class FieldEntry { + + final int dimension; + final VectorValues.ScoreFunction scoreFunction; + final int maxDoc; + + final long vectorDataOffset; + final long vectorDataLength; + final int[] ordToDoc; + + FieldEntry(int dimension, VectorValues.ScoreFunction scoreFunction, int maxDoc, + long vectorDataOffset, long vectorDataLength, int[] 
ordToDoc) { + this.dimension = dimension; + this.scoreFunction = scoreFunction; + this.maxDoc = maxDoc; + this.vectorDataOffset = vectorDataOffset; + this.vectorDataLength = vectorDataLength; + this.ordToDoc = ordToDoc; + } + + int size() { + return ordToDoc.length; + } + } + + /** Read the vector values from the index input. This supports both iterated and random access. */ + private final static class OffHeapVectorValues extends VectorValues { + + final FieldEntry fieldEntry; + final IndexInput dataIn; + + final BytesRef binaryValue; + final ByteBuffer byteBuffer; + final FloatBuffer floatBuffer; + final int byteSize; + final float[] value; + + int ord = -1; + int doc = -1; + + OffHeapVectorValues(FieldEntry fieldEntry, IndexInput dataIn) { + this.fieldEntry = fieldEntry; + this.dataIn = dataIn; + byteSize = Float.BYTES * fieldEntry.dimension; + byteBuffer = ByteBuffer.allocate(byteSize); + floatBuffer = byteBuffer.asFloatBuffer(); + value = new float[fieldEntry.dimension]; + binaryValue = new BytesRef(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + } + + @Override + public int dimension() { + return fieldEntry.dimension; + } + + @Override + public int size() { + return fieldEntry.size(); + } + + @Override + public ScoreFunction scoreFunction() { + return fieldEntry.scoreFunction; + } + + @Override + public float[] vectorValue() throws IOException { + binaryValue(); + floatBuffer.position(0); + floatBuffer.get(value, 0, fieldEntry.dimension); + return value; + } + + @Override + public BytesRef binaryValue() throws IOException { + dataIn.seek((long) ord * byteSize); // widen to long first: the int product ord * byteSize overflows once the slice exceeds 2 GB + dataIn.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + return binaryValue; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int nextDoc() { + if (++ord >= size()) { + doc = NO_MORE_DOCS; + } else { + doc = fieldEntry.ordToDoc[ord]; + } + return doc; + } + + @Override + public int advance(int target) throws IOException { + // We could do better
by log-binary search in ordToDoc, but this is never used + return slowAdvance(target); + } + + @Override + public long cost() { + return fieldEntry.size(); + } + + @Override + public RandomAccess randomAccess() { + return new OffHeapRandomAccess(dataIn.clone()); + } + + + class OffHeapRandomAccess implements VectorValues.RandomAccess { + + final IndexInput dataIn; + + final BytesRef binaryValue; + final ByteBuffer byteBuffer; + final FloatBuffer floatBuffer; + final int byteSize; + final float[] value; + + OffHeapRandomAccess(IndexInput dataIn) { + this.dataIn = dataIn; + byteSize = Float.BYTES * dimension(); + byteBuffer = ByteBuffer.allocate(byteSize); + floatBuffer = byteBuffer.asFloatBuffer(); + value = new float[dimension()]; + binaryValue = new BytesRef(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + } + + @Override + public int size() { + return fieldEntry.size(); + } + + @Override + public int dimension() { + return fieldEntry.dimension; + } + + @Override + public VectorValues.ScoreFunction scoreFunction() { + return fieldEntry.scoreFunction; + } + + @Override + public float[] vectorValue(int targetOrd) throws IOException { + readValue(targetOrd); + floatBuffer.position(0); + floatBuffer.get(value); + return value; + } + + @Override + public BytesRef binaryValue(int targetOrd) throws IOException { + readValue(targetOrd); + return binaryValue; + } + + private void readValue(int targetOrd) throws IOException { + long offset = targetOrd * byteSize; + dataIn.seek(offset); + dataIn.readBytes(byteBuffer.array(), byteBuffer.arrayOffset(), byteSize); + } + + @Override + public TopDocs search(float[] vector, int topK, int fanout) throws IOException { + throw new UnsupportedOperationException(); + } + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorWriter.java new file mode 100644 index 000000000000..cdafb665251e --- /dev/null +++ 
b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90VectorWriter.java @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.codecs.lucene90; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.VectorWriter; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.VectorValues; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +/** + * Writes vector values and knn graphs to index segments. 
+ * @lucene.experimental
+ */
+public final class Lucene90VectorWriter extends VectorWriter {
+
+  private final IndexOutput meta, vectorData;
+
+  private boolean finished;
+
+  /**
+   * Creates the metadata and vector data outputs for the segment and writes their index headers.
+   *
+   * @param state supplies the directory, segment name, suffix and id, and the IO context
+   * @throws IOException if an output cannot be created or a header cannot be written; outputs
+   *         opened so far are closed before the exception propagates
+   */
+  Lucene90VectorWriter(SegmentWriteState state) throws IOException {
+    assert state.fieldInfos.hasVectorValues();
+
+    // Both outputs are created inside the guarded region so that a failure at any step,
+    // including creating the second file, closes whatever was already opened. The exception is
+    // left to propagate: swallowing it would hand the caller a writer whose outputs are closed.
+    boolean success = false;
+    try {
+      String metaFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene90VectorFormat.META_EXTENSION);
+      meta = state.directory.createOutput(metaFileName, state.context);
+
+      String vectorDataFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, Lucene90VectorFormat.VECTOR_DATA_EXTENSION);
+      vectorData = state.directory.createOutput(vectorDataFileName, state.context);
+
+      CodecUtil.writeIndexHeader(meta,
+                                 Lucene90VectorFormat.META_CODEC_NAME,
+                                 Lucene90VectorFormat.VERSION_CURRENT,
+                                 state.segmentInfo.getId(), state.segmentSuffix);
+      CodecUtil.writeIndexHeader(vectorData,
+                                 Lucene90VectorFormat.VECTOR_DATA_CODEC_NAME,
+                                 Lucene90VectorFormat.VERSION_CURRENT,
+                                 state.segmentInfo.getId(), state.segmentSuffix);
+      success = true;
+    } finally {
+      if (success == false) {
+        IOUtils.closeWhileHandlingException(this);
+      }
+    }
+  }
+
+  /**
+   * Writes every vector of one field to the vector data file, then the field's metadata entry.
+   * A field that produced no vector bytes gets no metadata entry.
+   *
+   * @param fieldInfo the field being written
+   * @param vectors iterator over the field's vectors; it is consumed to NO_MORE_DOCS
+   */
+  @Override
+  public void writeField(FieldInfo fieldInfo, VectorValues vectors) throws IOException {
+    long vectorDataOffset = vectorData.getFilePointer();
+    // TODO - use a better data structure; a bitset? DocsWithFieldSet is p.p. in o.a.l.index
+    List<Integer> docIds = new ArrayList<>();
+    for (int docV = vectors.nextDoc(); docV != NO_MORE_DOCS; docV = vectors.nextDoc()) {
+      writeVectorValue(vectors);
+      docIds.add(docV);
+      // TODO: write knn graph value
+    }
+    long vectorDataLength = vectorData.getFilePointer() - vectorDataOffset;
+    if (vectorDataLength > 0) {
+      writeMeta(fieldInfo, vectorDataOffset, vectorDataLength, docIds);
+    }
+  }
+
+  /** Appends the current vector's bytes, as returned by binaryValue(), to the data file. */
+  private void writeVectorValue(VectorValues vectors) throws IOException {
+    // write vector value
+    BytesRef binaryValue = vectors.binaryValue();
+    assert binaryValue.length == vectors.dimension() * Float.BYTES;
+    vectorData.writeBytes(binaryValue.bytes, binaryValue.offset, binaryValue.length);
+  }
+
+  /**
+   * Writes one field's metadata entry: field number (int), score function ordinal (int), vector
+   * data offset and length (vlong each), dimension (int), number of docs (int), and then one
+   * vint per docid in the order the vectors were written.
+   */
+  private void writeMeta(FieldInfo field, long vectorDataOffset, long vectorDataLength, List<Integer> docIds) throws IOException {
+    meta.writeInt(field.number);
+    // stored by enum ordinal, so the order of the ScoreFunction constants is part of the format
+    meta.writeInt(field.getVectorScoreFunction().ordinal());
+    meta.writeVLong(vectorDataOffset);
+    meta.writeVLong(vectorDataLength);
+    meta.writeInt(field.getVectorDimension());
+    meta.writeInt(docIds.size());
+    for (Integer docId : docIds) {
+      // TODO: delta-encode, or write as bitset
+      meta.writeVInt(docId);
+    }
+  }
+
+  /**
+   * Writes the end-of-fields marker to the metadata file and the footer to both files.
+   *
+   * @throws IllegalStateException if called more than once
+   */
+  @Override
+  public void finish() throws IOException {
+    if (finished) {
+      throw new IllegalStateException("already finished");
+    }
+    finished = true;
+
+    if (meta != null) {
+      // write end of fields marker
+      meta.writeInt(-1);
+      CodecUtil.writeFooter(meta);
+    }
+    if (vectorData != null) {
+      CodecUtil.writeFooter(vectorData);
+    }
+  }
+
+  @Override
+  public void close() throws IOException {
+    IOUtils.close(meta, vectorData);
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java
new file mode 100644
index 000000000000..c6722c854a5f
--- /dev/null
+++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/package-info.java
@@ -0,0 +1,429 @@
+/*
+ * Licensed to the Apache
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Lucene 9.0 file format. + * + *

Apache Lucene - Index File Formats

+ * + * + *

Introduction

+ *
+ *

This document defines the index file formats used in this version of Lucene. + * If you are using a different version of Lucene, please consult the copy of + * docs/ that was distributed with + * the version you are using.

+ *

This document attempts to provide a high-level definition of the Apache + * Lucene file formats.

+ *
+ * + *

Definitions

+ *
+ *

The fundamental concepts in Lucene are index, document, field and term.

+ *

An index contains a sequence of documents.

+ *
    + *
  • A document is a sequence of fields.
  • + *
  • A field is a named sequence of terms.
  • + *
  • A term is a sequence of bytes.
  • + *
+ *

The same sequence of bytes in two different fields is considered a different + * term. Thus terms are represented as a pair: the string naming the field, and the + * bytes within the field.

+ * + *

Inverted Indexing

+ *

Lucene's index stores terms and statistics about those terms in order to make + * term-based search more efficient. Lucene's terms index falls into the family of indexes known as + * an inverted index. This is because it can list, for a term, the documents that contain + * it. This is the inverse of the natural relationship, in which documents list terms.

+ * + *

Types of Fields

+ *

In Lucene, fields may be stored, in which case their text is stored + * in the index literally, in a non-inverted manner. Fields that are inverted are + * called indexed. A field may be both stored and indexed.

+ *

The text of a field may be tokenized into terms to be indexed, or the + * text of a field may be used literally as a term to be indexed. Most fields are + * tokenized, but sometimes it is useful for certain identifier fields to be + * indexed literally.

+ *

See the {@link org.apache.lucene.document.Field Field} + * java docs for more information on Fields.

+ * + *

Segments

+ *

Lucene indexes may be composed of multiple sub-indexes, or segments. + * Each segment is a fully independent index, which could be searched separately. + * Indexes evolve by:

+ *
    + *
  1. Creating new segments for newly added documents.
  2. + *
  3. Merging existing segments.
  4. + *
+ *

Searches may involve multiple segments and/or multiple indexes, each index + * potentially composed of a set of segments.

+ * + *

Document Numbers

+ *

Internally, Lucene refers to documents by an integer document number. + * The first document added to an index is numbered zero, and each subsequent + * document added gets a number one greater than the previous.

+ *

Note that a document's number may change, so caution should be taken when + * storing these numbers outside of Lucene. In particular, numbers may change in + * the following situations:

+ *
    + *
  • + *

    The numbers stored in each segment are unique only within the segment, and + * must be converted before they can be used in a larger context. The standard + * technique is to allocate each segment a range of values, based on the range of + * numbers used in that segment. To convert a document number from a segment to an + * external value, the segment's base document number is added. To convert + * an external value back to a segment-specific value, the segment is identified + * by the range that the external value is in, and the segment's base value is + * subtracted. For example two five document segments might be combined, so that + * the first segment has a base value of zero, and the second of five. Document + * three from the second segment would have an external value of eight.

    + *
  • + *
  • + *

    When documents are deleted, gaps are created in the numbering. These are + * eventually removed as the index evolves through merging. Deleted documents are + * dropped when segments are merged. A freshly-merged segment thus has no gaps in + * its numbering.

    + *
  • + *
+ *
+ * + *

Index Structure Overview

+ *
+ *

Each segment index maintains the following:

+ *
    + *
  • + * {@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment info}. + * This contains metadata about a segment, such as the number of documents, + * what files it uses, and information about how the segment is sorted + *
  • + *
  • + * {@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Field names}. + * This contains metadata about the set of named fields used in the index. + *
  • + *
  • + * {@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Stored Field values}. + * This contains, for each document, a list of attribute-value pairs, where the attributes + * are field names. These are used to store auxiliary information about the document, such as + * its title, url, or an identifier to access a database. The set of stored fields are what is + * returned for each hit when searching. This is keyed by document number. + *
  • + *
  • + * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term dictionary}. + * A dictionary containing all of the terms used in all of the + * indexed fields of all of the documents. The dictionary also contains the number + * of documents which contain the term, and pointers to the term's frequency and + * proximity data. + *
  • + *
  • + * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Frequency data}. + * For each term in the dictionary, the numbers of all the + * documents that contain that term, and the frequency of the term in that + * document, unless frequencies are omitted ({@link org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS}) + *
  • + *
  • + * {@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Proximity data}. + * For each term in the dictionary, the positions that the + * term occurs in each document. Note that this will not exist if all fields in + * all documents omit position data. + *
  • + *
  • + * {@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Normalization factors}. + * For each field in each document, a value is stored + * that is multiplied into the score for hits on that field. + *
  • + *
  • + * {@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vectors}. + * For each field in each document, the term vector (sometimes + * called document vector) may be stored. A term vector consists of term text and + * term frequency. To add Term Vectors to your index see the + * {@link org.apache.lucene.document.Field Field} constructors + *
  • + *
  • + * {@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-document values}. + * Like stored values, these are also keyed by document + * number, but are generally intended to be loaded into main memory for fast + * access. Whereas stored values are generally intended for summary results from + * searches, per-document values are useful for things like scoring factors. + *
  • + *
  • + * {@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live documents}. + * An optional file indicating which documents are live. + *
  • + *
  • + * {@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}. + * Optional pair of files, recording dimensionally indexed fields, to enable fast + * numeric range filtering and large numeric values like BigInteger and BigDecimal (1D) + * and geographic shape intersection (2D, 3D). + *
  • + *
  • + * {@link org.apache.lucene.codecs.lucene90.Lucene90VectorFormat Vector values}. + * The vector format stores numeric vectors in a format optimized for random access and computation, + * supporting high-dimensional nearest-neighbor search. + *
  • + *
+ *

Details on each of these are provided in their linked pages.

+ *
+ * + *

File Naming

+ *
+ *

All files belonging to a segment have the same name with varying extensions. + * The extensions correspond to the different file formats described below. When + * using the Compound File format (default for small segments) these files (except + * for the Segment info file, the Lock file, and Deleted documents file) are collapsed + * into a single .cfs file (see below for details)

+ *

Typically, all segments in an index are stored in a single directory, + * although this is not required.

+ *

File names are never re-used. That is, when any file is saved + * to the Directory it is given a never before used filename. This is achieved + * using a simple generations approach. For example, the first segments file is + * segments_1, then segments_2, etc. The generation is a sequential long integer + * represented in alpha-numeric (base 36) form.

+ *
+ * + *

Summary of File Extensions

+ *
+ *

The following table summarizes the names and extensions of the files in + * Lucene:

+ * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
lucene filenames by extension
NameExtensionBrief Description
{@link org.apache.lucene.index.SegmentInfos Segments File}segments_NStores information about a commit point
Lock Filewrite.lockThe Write lock prevents multiple IndexWriters from writing to the same + * file.
{@link org.apache.lucene.codecs.lucene86.Lucene86SegmentInfoFormat Segment Info}.siStores metadata about a segment
{@link org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat Compound File}.cfs, .cfeAn optional "virtual" file consisting of all the other index files for + * systems that frequently run out of file handles.
{@link org.apache.lucene.codecs.lucene90.Lucene90FieldInfosFormat Fields}.fnmStores information about the fields
{@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Index}.fdxContains pointers to field data
{@link org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat Field Data}.fdtThe stored fields for documents
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Dictionary}.timThe term dictionary, stores term info
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Term Index}.tipThe index into the Term Dictionary
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Frequencies}.docContains the list of docs which contain each term along with frequency
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Positions}.posStores position information about where a term occurs in the index
{@link org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat Payloads}.payStores additional per-position metadata information such as character offsets and user payloads
{@link org.apache.lucene.codecs.lucene80.Lucene80NormsFormat Norms}.nvd, .nvmEncodes length and boost factors for docs and fields
{@link org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat Per-Document Values}.dvd, .dvmEncodes additional scoring factors or other per-document information.
{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Index}.tvxStores offset into the document data file
{@link org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat Term Vector Data}.tvdContains term vector data.
{@link org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat Live Documents}.livInfo about what documents are live
{@link org.apache.lucene.codecs.lucene86.Lucene86PointsFormat Point values}.dii, .dimHolds indexed points
{@link org.apache.lucene.codecs.lucene90.Lucene90VectorFormat Vector values}.vec, .vemHolds indexed vectors; .vec files contain the raw vector data, and + * .vem the vector metadata
+ *
+ * + *

Lock File

+ * The write lock, which is stored in the index directory by default, is named + * "write.lock". If the lock directory is different from the index directory then + * the write lock will be named "XXXX-write.lock" where XXXX is a unique prefix + * derived from the full path to the index directory. When this file is present, a + * writer is currently modifying the index (adding or removing documents). This + * lock file ensures that only one writer is modifying the index at a time. + * + *

History

+ *

Compatibility notes are provided in this document, describing how file + * formats have changed from prior versions:

+ *
    + *
  • In version 2.1, the file format was changed to allow lock-less commits (ie, + * no more commit lock). The change is fully backwards compatible: you can open a + * pre-2.1 index for searching or adding/deleting of docs. When the new segments + * file is saved (committed), it will be written in the new file format (meaning + * no specific "upgrade" process is needed). But note that once a commit has + * occurred, pre-2.1 Lucene will not be able to read the index.
  • + *
  • In version 2.3, the file format was changed to allow segments to share a + * single set of doc store (vectors & stored fields) files. This allows for + * faster indexing in certain cases. The change is fully backwards compatible (in + * the same way as the lock-less commits change in 2.1).
  • + *
  • In version 2.4, Strings are now written as true UTF-8 byte sequence, not + * Java's modified UTF-8. See + * LUCENE-510 for details.
  • + *
  • In version 2.9, an optional opaque Map<String,String> CommitUserData + * may be passed to IndexWriter's commit methods (and later retrieved), which is + * recorded in the segments_N file. See + * LUCENE-1382 for details. Also, + * diagnostics were added to each segment written recording details about why it + * was written (due to flush, merge; which OS/JRE was used; etc.). See issue + * LUCENE-1654 for details.
  • + *
  • In version 3.0, compressed fields are no longer written to the index (they + * can still be read, but on merge the new segment will write them, uncompressed). + * See issue LUCENE-1960 + * for details.
  • + *
  • In version 3.1, segments record the code version that created them. See + * LUCENE-2720 for details. + * Additionally, segments track explicitly whether or not they have term vectors. + * See LUCENE-2811 + * for details.
  • + *
  • In version 3.2, numeric fields are written natively to the stored fields + * file; previously they were stored in text format only.
  • + *
  • In version 3.4, fields can omit position data while still indexing term + * frequencies.
  • + *
  • In version 4.0, the format of the inverted index became extensible via + * the {@link org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage + * ({@code DocValues}) was introduced. Normalization factors need no longer be a + * single byte, they can be any {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. + * Terms need not be unicode strings, they can be any byte sequence. Term offsets + * can optionally be indexed into the postings lists. Payloads can be stored in the + * term vectors.
  • + *
  • In version 4.1, the format of the postings list changed to use either + * of FOR compression or variable-byte encoding, depending upon the frequency + * of the term. Terms appearing only once were changed to inline directly into + * the term dictionary. Stored fields are compressed by default.
  • + *
  • In version 4.2, term vectors are compressed by default. DocValues has + * a new multi-valued type (SortedSet), that can be used for faceting/grouping/joining + * on multi-valued fields.
  • + *
  • In version 4.5, DocValues were extended to explicitly represent missing values.
  • + *
  • In version 4.6, FieldInfos were extended to support per-field DocValues generation, to + * allow updating NumericDocValues fields.
  • + *
  • In version 4.8, checksum footers were added to the end of each index file + * for improved data integrity. Specifically, the last 8 bytes of every index file + * contain the zlib-crc32 checksum of the file.
  • + *
  • In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) + * that is suitable for faceting/sorting/analytics.
  • + *
  • In version 5.4, DocValues have been improved to store more information on disk: + * addresses for binary fields and ord indexes for multi-valued fields.
  • + *
  • In version 6.0, Points were added, for multi-dimensional range/distance search.
  • + *
  • In version 6.2, a new Segment info format was added that reads/writes the index sort, to support index sorting. + *
  • In version 7.0, DocValues have been improved to better support sparse doc values + * thanks to an iterator API.
  • + *
  • In version 8.0, postings have been enhanced to record, for each block of + * doc ids, the (term freq, normalization factor) pairs that may trigger the + * maximum score of the block. This information is recorded alongside skip data + * in order to be able to skip blocks of doc ids if they may not produce high + * enough scores. + * Additionally, doc values and norms have been extended with jump-tables to make access O(1) + * instead of O(n), where n is the number of elements to skip when advancing in the data.
  • + *
  • In version 8.4, postings, positions, offsets and payload lengths have moved to a more + * performant encoding that is vectorized.
  • + *
  • In version 8.6, index sort serialization is delegated to the sorts themselves, to + * allow user-defined sorts to be used
  • + *
  • In version 8.7, stored fields compression became adaptive to better handle documents with + * smaller stored fields.
  • + *
  • In version 9.0, vector-valued fields were added.
  • + *
+ * + *

Limitations

+ *
+ *

Lucene uses a Java int to refer to + * document numbers, and the index file format uses an Int32 + * on-disk to store document numbers. This is a limitation + * of both the index file format and the current implementation. Eventually these + * should be replaced with either UInt64 values, or + * better yet, {@link org.apache.lucene.store.DataOutput#writeVInt VInt} values which have no limit.

+ *
+ */ +package org.apache.lucene.codecs.lucene90; diff --git a/lucene/core/src/java/org/apache/lucene/document/FieldType.java b/lucene/core/src/java/org/apache/lucene/document/FieldType.java index 82a6454a525a..21d9edeb77ae 100644 --- a/lucene/core/src/java/org/apache/lucene/document/FieldType.java +++ b/lucene/core/src/java/org/apache/lucene/document/FieldType.java @@ -25,6 +25,7 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableFieldType; import org.apache.lucene.index.PointValues; +import org.apache.lucene.index.VectorValues; /** * Describes the properties of a field. @@ -44,6 +45,8 @@ public class FieldType implements IndexableFieldType { private int dimensionCount; private int indexDimensionCount; private int dimensionNumBytes; + private int vectorDimension; + private VectorValues.ScoreFunction vectorScoreFunction = VectorValues.ScoreFunction.NONE; private Map attributes; /** @@ -62,6 +65,8 @@ public FieldType(IndexableFieldType ref) { this.dimensionCount = ref.pointDimensionCount(); this.indexDimensionCount = ref.pointIndexDimensionCount(); this.dimensionNumBytes = ref.pointNumBytes(); + this.vectorDimension = ref.vectorDimension(); + this.vectorScoreFunction = ref.vectorScoreFunction(); if (ref.getAttributes() != null) { this.attributes = new HashMap<>(ref.getAttributes()); } @@ -295,6 +300,7 @@ public void setDimensions(int dimensionCount, int dimensionNumBytes) { * Enables points indexing with selectable dimension indexing. 
*/ public void setDimensions(int dimensionCount, int indexDimensionCount, int dimensionNumBytes) { + checkIfFrozen(); if (dimensionCount < 0) { throw new IllegalArgumentException("dimensionCount must be >= 0; got " + dimensionCount); } @@ -351,6 +357,28 @@ public int pointNumBytes() { return dimensionNumBytes; } + /** Sets the vector dimension and score function of this field type. Calls checkIfFrozen() before changing anything, throws IllegalArgumentException when numDimensions is not positive or is greater than VectorValues.MAX_DIMENSIONS, and stores distFunc as given (it is not null-checked here). */ void setVectorDimensionsAndScoreFunction(int numDimensions, VectorValues.ScoreFunction distFunc) { + checkIfFrozen(); + if (numDimensions <= 0) { + throw new IllegalArgumentException("vector numDimensions must be > 0; got " + numDimensions); + } + if (numDimensions > VectorValues.MAX_DIMENSIONS) { + throw new IllegalArgumentException("vector numDimensions must be <= VectorValues.MAX_DIMENSIONS (=" + VectorValues.MAX_DIMENSIONS + "); got " + numDimensions); + } + this.vectorDimension = numDimensions; + this.vectorScoreFunction = distFunc; + } + + @Override + public int vectorDimension() { + return vectorDimension; + } + + @Override + public VectorValues.ScoreFunction vectorScoreFunction() { + return vectorScoreFunction; + } + /** * Puts an attribute value. *

diff --git a/lucene/core/src/java/org/apache/lucene/document/VectorField.java b/lucene/core/src/java/org/apache/lucene/document/VectorField.java new file mode 100644 index 000000000000..24d9bd8ddafc --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/document/VectorField.java @@ -0,0 +1,99 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.document; + +import org.apache.lucene.index.VectorValues; + +/** A field that contains a single floating-point numeric vector (or none) for each document. + * Vectors are dense - that is, every dimension of a vector contains an explicit value, stored + * packed into an array (of type float[]) whose length is the vector dimension. Values can be + * retrieved using {@link VectorValues}, which is a forward-only docID-based iterator and also + * offers random-access by dense ordinal (not docId). VectorValues.ScoreFunctions may be + * used to compare vectors at query time (for example as part of result ranking). 
A VectorField may + * be associated with a score function that defines the metric used for nearest-neighbor search + * among vectors of that field, but at the moment this association is purely nominal: it is intended + * for future use by the to-be-implemented nearest neighbors search. + */ +public class VectorField extends Field { + + private static FieldType getType(float[] v, VectorValues.ScoreFunction scoreFunction) { + if (v == null) { + throw new IllegalArgumentException("vector value must not be null"); + } + int dimension = v.length; + if (dimension == 0) { + throw new IllegalArgumentException("cannot index an empty vector"); + } + if (dimension > VectorValues.MAX_DIMENSIONS) { + throw new IllegalArgumentException("cannot index vectors with dimension greater than " + VectorValues.MAX_DIMENSIONS); + } + if (scoreFunction == null) { + throw new IllegalArgumentException("score function must not be null"); + } + FieldType type = new FieldType(); + type.setVectorDimensionsAndScoreFunction(dimension, scoreFunction); + type.freeze(); + return type; + } + + /** Creates a numeric vector field. Fields are single-valued: each document has either one value + * or no value. Vectors of a single field share the same dimension and score function. + * + * @param name field name + * @param vector value + * @param scoreFunction a function defining vector proximity. + * @throws IllegalArgumentException if any parameter is null, or the vector is empty or has dimension > 1024. + */ + public VectorField(String name, float[] vector, VectorValues.ScoreFunction scoreFunction) { + super(name, getType(vector, scoreFunction)); + fieldsData = vector; + } + + /** Creates a numeric vector field with the default EUCLIDEAN (L2) score function. Fields are + * single-valued: each document has either one value or no value. Vectors of a single field share + * the same dimension and score function. 
+ * + * @param name field name + * @param vector value + * @throws IllegalArgumentException if any parameter is null, or the vector is empty or has dimension &gt; 1024. + */ + public VectorField(String name, float[] vector) { + this(name, vector, VectorValues.ScoreFunction.EUCLIDEAN); + } + + /** + * Return the vector value of this field + */ + public float[] vectorValue() { + return (float[]) fieldsData; + } + + /** + * Set the vector value of this field + * @param value the value to set; must not be null, and length must match the field type + */ + public void setVectorValue(float[] value) { + if (value == null) { + throw new IllegalArgumentException("value must not be null"); + } + if (value.length != type.vectorDimension()) { + throw new IllegalArgumentException("value length " + value.length + " must match field dimension " + type.vectorDimension()); + } + fieldsData = value; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java index 3b1f533bc477..aff5a1fc46fb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java +++ b/lucene/core/src/java/org/apache/lucene/index/CheckIndex.java @@ -220,6 +220,9 @@ public static class SegmentInfoStatus { /** Status of index sort */ public IndexSortStatus indexSortStatus; + + /** Status of vectors */ + public VectorValuesStatus vectorValuesStatus; } /** @@ -374,7 +377,25 @@ public static final class PointsStatus { /** Total number of fields with points. */ public int totalValueFields; - /** Exception thrown during doc values test (null on success) */ + /** Exception thrown during point values test (null on success) */ + public Throwable error = null; + } + + /** + * Status from testing VectorValues + */ + public static final class VectorValuesStatus { + + VectorValuesStatus() { + } + + /** Total number of vector values tested. */ + public long totalVectorValues; + + /** Total number of fields with vectors. 
*/ + public int totalVectorFields; + + /** Exception thrown during vector values test (null on success) */ public Throwable error = null; } @@ -731,6 +752,9 @@ public Status checkIndex(List onlySegments) throws IOException { // Test PointValues segInfoStat.pointsStatus = testPoints(reader, infoStream, failFast); + // Test VectorValues + segInfoStat.vectorValuesStatus = testVectors(reader, infoStream, failFast); + // Test index sort segInfoStat.indexSortStatus = testSort(reader, indexSort, infoStream, failFast); @@ -1955,6 +1979,65 @@ public static Status.PointsStatus testPoints(CodecReader reader, PrintStream inf return status; } + /** + * Test the vectors index + * @lucene.experimental + */ + public static Status.VectorValuesStatus testVectors(CodecReader reader, PrintStream infoStream, boolean failFast) throws IOException { + if (infoStream != null) { + infoStream.print(" test: vectors.............."); + } + long startNS = System.nanoTime(); + FieldInfos fieldInfos = reader.getFieldInfos(); + Status.VectorValuesStatus status = new Status.VectorValuesStatus(); + try { + + if (fieldInfos.hasVectorValues()) { + for (FieldInfo fieldInfo : fieldInfos) { + if (fieldInfo.hasVectorValues()) { + int dimension = fieldInfo.getVectorDimension(); + if (dimension <= 0) { + throw new RuntimeException("Field \"" + fieldInfo.name + "\" has vector values but dimension is " + dimension); + } + VectorValues values = reader.getVectorValues(fieldInfo.name); + if (values == null) { + continue; + } + + status.totalVectorFields++; + + int docCount = 0; + while (values.nextDoc() != NO_MORE_DOCS) { + int valueLength = values.vectorValue().length; + if (valueLength != dimension) { + throw new RuntimeException("Field \"" + fieldInfo.name + "\" has a value whose dimension=" + valueLength + " not matching the field's dimension=" + dimension); + } + ++docCount; + } + if (docCount != values.size()) { + throw new RuntimeException("Field \"" + fieldInfo.name + "\" has size=" + values.size() + " 
but when iterated, returns " + docCount + " docs with values"); + } + status.totalVectorValues += docCount; + } + } + } + + msg(infoStream, String.format(Locale.ROOT, "OK [%d fields, %d vectors] [took %.3f sec]", status.totalVectorFields, status.totalVectorValues, nsToSec(System.nanoTime()-startNS))); + + } catch (Throwable e) { + if (failFast) { + throw IOUtils.rethrowAlways(e); + } + msg(infoStream, "ERROR: " + e); + status.error = e; + if (infoStream != null) { + e.printStackTrace(infoStream); + } + } + + return status; + } + /** Walks the entire N-dimensional points space, verifying that all points fall within the last cell's boundaries. * * @lucene.internal */ diff --git a/lucene/core/src/java/org/apache/lucene/index/CodecReader.java b/lucene/core/src/java/org/apache/lucene/index/CodecReader.java index 4459ab11edd6..ab3b93f80a9d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/CodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/CodecReader.java @@ -26,6 +26,7 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.VectorReader; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; @@ -77,6 +78,12 @@ protected CodecReader() {} * @lucene.internal */ public abstract PointsReader getPointsReader(); + + /** + * Expert: retrieve underlying VectorReader + * @lucene.internal + */ + public abstract VectorReader getVectorReader(); @Override public final void document(int docID, StoredFieldVisitor visitor) throws IOException { @@ -202,6 +209,18 @@ public final PointValues getPointValues(String field) throws IOException { return getPointsReader().getValues(field); } + @Override + public final VectorValues getVectorValues(String field) throws IOException { + ensureOpen(); + FieldInfo fi = getFieldInfos().fieldInfo(field); + if (fi == null || fi.getVectorDimension() == 0) { + // 
Field does not exist or does not index vectors + return null; + } + + return getVectorReader().getVectorValues(field); + } + @Override protected void doClose() throws IOException { } diff --git a/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java index 93b7f4988d68..f7f79e0b9e7d 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/DocValuesLeafReader.java @@ -47,6 +47,11 @@ public final PointValues getPointValues(String field) throws IOException { throw new UnsupportedOperationException(); } + @Override + public final VectorValues getVectorValues(String field) throws IOException { + throw new UnsupportedOperationException(); + } + @Override public final void checkIntegrity() throws IOException { throw new UnsupportedOperationException(); diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java index b8fe341fc6bb..4a3463c58fcc 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfo.java @@ -54,6 +54,9 @@ public final class FieldInfo { private int pointIndexDimensionCount; private int pointNumBytes; + private int vectorDimension; // if it is a positive value, it means this field indexes vectors + private VectorValues.ScoreFunction vectorScoreFunction = VectorValues.ScoreFunction.NONE; + // whether this field is used as the soft-deletes field private final boolean softDeletesField; @@ -64,7 +67,8 @@ public final class FieldInfo { */ public FieldInfo(String name, int number, boolean storeTermVector, boolean omitNorms, boolean storePayloads, IndexOptions indexOptions, DocValuesType docValues, long dvGen, Map<String,String> attributes, - int pointDimensionCount, int pointIndexDimensionCount, int pointNumBytes, boolean softDeletesField) { + int pointDimensionCount, 
int pointIndexDimensionCount, int pointNumBytes, + int vectorDimension, VectorValues.ScoreFunction vectorScoreFunction, boolean softDeletesField) { this.name = Objects.requireNonNull(name); this.number = number; this.docValuesType = Objects.requireNonNull(docValues, "DocValuesType must not be null (field: \"" + name + "\")"); @@ -83,6 +87,8 @@ public FieldInfo(String name, int number, boolean storeTermVector, boolean omitN this.pointDimensionCount = pointDimensionCount; this.pointIndexDimensionCount = pointIndexDimensionCount; this.pointNumBytes = pointNumBytes; + this.vectorDimension = vectorDimension; + this.vectorScoreFunction = vectorScoreFunction; this.softDeletesField = softDeletesField; this.checkConsistency(); } @@ -137,6 +143,14 @@ public boolean checkConsistency() { throw new IllegalStateException("field '" + name + "' cannot have a docvalues update generation without having docvalues"); } + if (vectorDimension < 0) { + throw new IllegalStateException("vectorDimension must be >=0; got " + vectorDimension); + } + + if (vectorDimension == 0 && vectorScoreFunction != VectorValues.ScoreFunction.NONE) { + throw new IllegalStateException("vector score function must be NONE when dimension = 0; got " + vectorScoreFunction); + } + return true; } @@ -232,6 +246,40 @@ public int getPointNumBytes() { return pointNumBytes; } + /** Record that this field is indexed with vectors, with the specified num of dimensions and distance function */ + public void setVectorDimensionAndScoreFunction(int dimension, VectorValues.ScoreFunction scoreFunction) { + if (dimension < 0) { + throw new IllegalArgumentException("vector dimension must be >= 0; got " + dimension); + } + if (dimension > VectorValues.MAX_DIMENSIONS) { + throw new IllegalArgumentException("vector dimension must be <= VectorValues.MAX_DIMENSIONS (=" + VectorValues.MAX_DIMENSIONS + "); got " + dimension); + } + if (dimension == 0 && scoreFunction != VectorValues.ScoreFunction.NONE) { + throw new 
IllegalArgumentException("vector score function must be NONE when the vector dimension = 0; got " + scoreFunction); + } + if (vectorDimension != 0 && vectorDimension != dimension) { + throw new IllegalArgumentException("cannot change vector dimension from " + vectorDimension + " to " + dimension + " for field=\"" + name + "\""); + } + if (vectorScoreFunction != VectorValues.ScoreFunction.NONE && vectorScoreFunction != scoreFunction) { + throw new IllegalArgumentException("cannot change vector score function from " + vectorScoreFunction + " to " + scoreFunction + " for field=\"" + name + "\""); + } + + this.vectorDimension = dimension; + this.vectorScoreFunction = scoreFunction; + + assert checkConsistency(); + } + + /** Returns the number of dimensions of the vector value */ + public int getVectorDimension() { + return vectorDimension; + } + + /** Returns {@link org.apache.lucene.index.VectorValues.ScoreFunction} for the field */ + public VectorValues.ScoreFunction getVectorScoreFunction() { + return vectorScoreFunction; + } + /** Record that this field is indexed with docvalues, with the specified type */ public void setDocValuesType(DocValuesType type) { if (type == null) { @@ -336,6 +384,13 @@ public boolean hasPayloads() { public boolean hasVectors() { return storeTermVector; } + + /** + * Returns whether any (numeric) vector values exist for this field + */ + public boolean hasVectorValues() { + return vectorDimension > 0; + } /** * Get a codec attribute value, or null if it does not exist diff --git a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java index 4b266d008a22..8d8ff15ee5ad 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java +++ b/lucene/core/src/java/org/apache/lucene/index/FieldInfos.java @@ -48,6 +48,7 @@ public class FieldInfos implements Iterable { private final boolean hasNorms; private final boolean hasDocValues; private final boolean 
hasPointValues; + private final boolean hasVectorValues; private final String softDeletesField; // used only by fieldInfo(int) @@ -68,6 +69,7 @@ public FieldInfos(FieldInfo[] infos) { boolean hasNorms = false; boolean hasDocValues = false; boolean hasPointValues = false; + boolean hasVectorValues = false; String softDeletesField = null; int size = 0; // number of elements in byNumberTemp, number of used array slots @@ -99,6 +101,7 @@ public FieldInfos(FieldInfo[] infos) { hasDocValues |= info.getDocValuesType() != DocValuesType.NONE; hasPayloads |= info.hasPayloads(); hasPointValues |= (info.getPointDimensionCount() != 0); + hasVectorValues |= (info.getVectorDimension() != 0); if (info.isSoftDeletesField()) { if (softDeletesField != null && softDeletesField.equals(info.name) == false) { throw new IllegalArgumentException("multiple soft-deletes fields [" + info.name + ", " + softDeletesField + "]"); @@ -115,6 +118,7 @@ public FieldInfos(FieldInfo[] infos) { this.hasNorms = hasNorms; this.hasDocValues = hasDocValues; this.hasPointValues = hasPointValues; + this.hasVectorValues = hasVectorValues; this.softDeletesField = softDeletesField; List<FieldInfo> valuesTemp = new ArrayList<>(); @@ -204,6 +208,11 @@ public boolean hasPointValues() { return hasPointValues; } + /** Returns true if any fields have VectorValues */ + public boolean hasVectorValues() { + return hasVectorValues; + } + /** Returns the soft-deletes field name if exists; otherwise returns null */ public String getSoftDeletesField() { return softDeletesField; @@ -261,6 +270,16 @@ public FieldDimensions(int dimensionCount, int indexDimensionCount, int dimensio this.dimensionNumBytes = dimensionNumBytes; } } + + static final class FieldVectorProperties { + final int numDimensions; + final VectorValues.ScoreFunction scoreFunction; + + FieldVectorProperties(int numDimensions, VectorValues.ScoreFunction scoreFunction) { + this.numDimensions = numDimensions; + this.scoreFunction = scoreFunction; + } + } static final class 
FieldNumbers { @@ -274,6 +293,8 @@ static final class FieldNumbers { private final Map<String,FieldDimensions> dimensions; + private final Map<String,FieldVectorProperties> vectorProps; + // TODO: we should similarly catch an attempt to turn // norms back on after they were already committed; today // we silently discard the norm but this is badly trappy @@ -288,6 +309,7 @@ static final class FieldNumbers { this.indexOptions = new HashMap<>(); this.docValuesType = new HashMap<>(); this.dimensions = new HashMap<>(); + this.vectorProps = new HashMap<>(); this.softDeletesFieldName = softDeletesFieldName; } @@ -297,7 +319,7 @@ static final class FieldNumbers { * number assigned if possible otherwise the first unassigned field number * is used as the field number. */ - synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int indexDimensionCount, int dimensionNumBytes, boolean isSoftDeletesField) { + synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptions indexOptions, DocValuesType dvType, int dimensionCount, int indexDimensionCount, int dimensionNumBytes, int vectorDimension, VectorValues.ScoreFunction scoreFunction, boolean isSoftDeletesField) { if (indexOptions != IndexOptions.NONE) { IndexOptions currentOpts = this.indexOptions.get(fieldName); if (currentOpts == null) { @@ -330,6 +352,19 @@ synchronized int addOrGet(String fieldName, int preferredFieldNumber, IndexOptio dimensions.put(fieldName, new FieldDimensions(dimensionCount, indexDimensionCount, dimensionNumBytes)); } } + if (vectorDimension != 0) { + FieldVectorProperties props = vectorProps.get(fieldName); + if (props != null) { + if (props.numDimensions != vectorDimension) { + throw new IllegalArgumentException("cannot change vector dimension from " + props.numDimensions + " to " + vectorDimension + " for field=\"" + fieldName + "\""); + } + if (props.scoreFunction != scoreFunction) { + throw new IllegalArgumentException("cannot change vector score 
function from " + props.scoreFunction + " to " + scoreFunction + " for field=\"" + fieldName + "\""); + } + } else { + vectorProps.put(fieldName, new FieldVectorProperties(vectorDimension, scoreFunction)); + } + } Integer fieldNumber = nameToNumber.get(fieldName); if (fieldNumber == null) { final Integer preferredBoxed = Integer.valueOf(preferredFieldNumber); @@ -408,6 +443,24 @@ synchronized void verifyConsistentDimensions(Integer number, String name, int da } } + synchronized void verifyConsistentVectorProperties(Integer number, String name, int numDimensions, VectorValues.ScoreFunction scoreFunction) { + if (name.equals(numberToName.get(number)) == false) { + throw new IllegalArgumentException("field number " + number + " is already mapped to field name \"" + numberToName.get(number) + "\", not \"" + name + "\""); + } + if (number.equals(nameToNumber.get(name)) == false) { + throw new IllegalArgumentException("field name \"" + name + "\" is already mapped to field number \"" + nameToNumber.get(name) + "\", not \"" + number + "\""); + } + FieldVectorProperties props = vectorProps.get(name); + if (props != null) { + if (props.numDimensions != numDimensions) { + throw new IllegalArgumentException("cannot change vector dimension from " + props.numDimensions + " to " + numDimensions + " for field=\"" + name + "\""); + } + if (props.scoreFunction != scoreFunction) { + throw new IllegalArgumentException("cannot change vector score function from " + props.scoreFunction + " to " + scoreFunction + " for field=\"" + name + "\""); + } + } + } + /** * Returns true if the {@code fieldName} exists in the map and is of the * same {@code dvType}. 
@@ -456,6 +509,17 @@ synchronized void setDimensions(int number, String name, int dimensionCount, int verifyConsistentDimensions(number, name, dimensionCount, indexDimensionCount, dimensionNumBytes); dimensions.put(name, new FieldDimensions(dimensionCount, indexDimensionCount, dimensionNumBytes)); } + + synchronized void setVectorDimensionsAndScoreFunction(int number, String name, int numDimensions, VectorValues.ScoreFunction scoreFunction) { + if (numDimensions <= 0) { + throw new IllegalArgumentException("vector numDimensions must be > 0; got " + numDimensions); + } + if (numDimensions > VectorValues.MAX_DIMENSIONS) { + throw new IllegalArgumentException("vector numDimensions must be <= VectorValues.MAX_DIMENSIONS (=" + VectorValues.MAX_DIMENSIONS + "); got " + numDimensions); + } + verifyConsistentVectorProperties(number, name, numDimensions, scoreFunction); + vectorProps.put(name, new FieldVectorProperties(numDimensions, scoreFunction)); + } } static final class Builder { @@ -489,8 +553,8 @@ public FieldInfo getOrAdd(String name) { // before then we'll get the same name and number, // else we'll allocate a new one: final boolean isSoftDeletesField = name.equals(globalFieldNumbers.softDeletesFieldName); - final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0, 0, isSoftDeletesField); - fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0, 0, isSoftDeletesField); + final int fieldNumber = globalFieldNumbers.addOrGet(name, -1, IndexOptions.NONE, DocValuesType.NONE, 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, isSoftDeletesField); + fi = new FieldInfo(name, fieldNumber, false, false, false, IndexOptions.NONE, DocValuesType.NONE, -1, new HashMap<>(), 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, isSoftDeletesField); assert !byName.containsKey(fi.name); globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, DocValuesType.NONE); 
byName.put(fi.name, fi); @@ -505,6 +569,7 @@ private FieldInfo addOrUpdateInternal(String name, int preferredFieldNumber, DocValuesType docValues, long dvGen, Map attributes, int dataDimensionCount, int indexDimensionCount, int dimensionNumBytes, + int vectorDimension, VectorValues.ScoreFunction vectorScoreFunction, boolean isSoftDeletesField) { assert assertNotFinished(); if (docValues == null) { @@ -522,8 +587,8 @@ private FieldInfo addOrUpdateInternal(String name, int preferredFieldNumber, // number for this field. If the field was seen // before then we'll get the same name and number, // else we'll allocate a new one: - final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField); - fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes, isSoftDeletesField); + final int fieldNumber = globalFieldNumbers.addOrGet(name, preferredFieldNumber, indexOptions, docValues, dataDimensionCount, indexDimensionCount, dimensionNumBytes, vectorDimension, vectorScoreFunction, isSoftDeletesField); + fi = new FieldInfo(name, fieldNumber, storeTermVector, omitNorms, storePayloads, indexOptions, docValues, dvGen, attributes, dataDimensionCount, indexDimensionCount, dimensionNumBytes, vectorDimension, vectorScoreFunction, isSoftDeletesField); assert !byName.containsKey(fi.name); globalFieldNumbers.verifyConsistent(Integer.valueOf(fi.number), fi.name, fi.getDocValuesType()); byName.put(fi.name, fi); @@ -558,6 +623,7 @@ public FieldInfo add(FieldInfo fi, long dvGen) { fi.getIndexOptions(), fi.getDocValuesType(), dvGen, fi.attributes(), fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), + fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField()); } diff --git 
a/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java b/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java index ff3ea186ac18..2814c117d0a1 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/FilterCodecReader.java @@ -23,6 +23,7 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.VectorReader; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; @@ -101,6 +102,11 @@ public PointsReader getPointsReader() { return in.getPointsReader(); } + @Override + public VectorReader getVectorReader() { + return in.getVectorReader(); + } + @Override public int numDocs() { return in.numDocs(); diff --git a/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java index 39087e4828dd..fc50130d582b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/FilterLeafReader.java @@ -330,6 +330,11 @@ public PointValues getPointValues(String field) throws IOException { return in.getPointValues(field); } + @Override + public VectorValues getVectorValues(String field) throws IOException { + return in.getVectorValues(field); + } + @Override public Fields getTermVectors(int docID) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java index b1a53464aba2..9c557184629b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexWriter.java @@ -1184,7 +1184,8 @@ private FieldNumbers getFieldNumberMap() throws IOException { for(SegmentCommitInfo info : segmentInfos) { FieldInfos fis = 
readFieldInfos(info); for(FieldInfo fi : fis) { - map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); + map.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), + fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField()); } } @@ -1921,7 +1922,7 @@ private DocValuesUpdate[] buildDocValuesUpdate(Term term, Field[] updates) { if (globalFieldNumberMap.contains(f.name(), dvType) == false) { // if this field doesn't exists we try to add it. if it exists and the DV type doesn't match we // get a consistent error message as if you try to do that during an indexing operation. - globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0, 0, f.name().equals(config.softDeletesField)); + globalFieldNumberMap.addOrGet(f.name(), -1, IndexOptions.NONE, dvType, 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, f.name().equals(config.softDeletesField)); assert globalFieldNumberMap.contains(f.name(), dvType); } if (config.getIndexSortFields().contains(f.name())) { @@ -2966,7 +2967,9 @@ public long addIndexes(Directory... 
dirs) throws IOException { FieldInfos fis = readFieldInfos(info); for(FieldInfo fi : fis) { // This will throw exceptions if any of the incoming fields have an illegal schema change: - globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); + globalFieldNumberMap.addOrGet(fi.name, fi.number, fi.getIndexOptions(), fi.getDocValuesType(), + fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), + fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField()); } infos.add(copySegmentAsIs(info, newSegName, context)); } diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java b/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java index 9eb7a1574dd9..9f85d04c7aa5 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexableFieldType.java @@ -114,6 +114,16 @@ public interface IndexableFieldType { */ public int pointNumBytes(); + /** + * The number of dimensions of the field's vector value + */ + public int vectorDimension(); + + /** + * The {@link org.apache.lucene.index.VectorValues.ScoreFunction} of the field's vector value + */ + public VectorValues.ScoreFunction vectorScoreFunction(); + /** * Attributes for the field type. 
* diff --git a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java index f40303b46409..071e6ceeda92 100644 --- a/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java +++ b/lucene/core/src/java/org/apache/lucene/index/IndexingChain.java @@ -32,12 +32,15 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.VectorFormat; +import org.apache.lucene.codecs.VectorWriter; import org.apache.lucene.codecs.NormsConsumer; import org.apache.lucene.codecs.NormsFormat; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsFormat; import org.apache.lucene.codecs.PointsWriter; import org.apache.lucene.document.FieldType; +import org.apache.lucene.document.VectorField; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; @@ -180,7 +183,6 @@ public SortedSetDocValues getSortedSetDocValues(String field) throws IOException public FieldInfos getFieldInfos() { return fieldInfos.finish(); } - }; } @@ -230,6 +232,12 @@ Sorter.DocMap flush(SegmentWriteState state) throws IOException { if (infoStream.isEnabled("IW")) { infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write points"); } + + t0 = System.nanoTime(); + writeVectors(state, sortMap); + if (infoStream.isEnabled("IW")) { + infoStream.message("IW", ((System.nanoTime()-t0)/1000000) + " msec to write vectors"); + } // it's possible all docs hit non-aborting exceptions... t0 = System.nanoTime(); @@ -374,6 +382,50 @@ private void writeDocValues(SegmentWriteState state, Sorter.DocMap sortMap) thro } } + /** Writes all buffered vectors. 
*/ + private void writeVectors(SegmentWriteState state, Sorter.DocMap sortMap) throws IOException { + VectorWriter vectorWriter = null; + boolean success = false; + try { + for (int i = 0; i { // Non-null if this field ever had points in this segment: PointValuesWriter pointValuesWriter; + // Non-null if this field ever had vector values in this segment: + VectorValuesWriter vectorValuesWriter; + /** We use this to know when a PerField is seen for the * first time in the current document. */ long fieldGen = -1; diff --git a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java index e1c31354f007..daab05555bd3 100644 --- a/lucene/core/src/java/org/apache/lucene/index/LeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/LeafReader.java @@ -203,6 +203,10 @@ public final PostingsEnum postings(Term term) throws IOException { * used by a single thread. */ public abstract NumericDocValues getNormValues(String field) throws IOException; + /** Returns {@link VectorValues} for this field, or null if no {@link VectorValues} were indexed. + * The returned instance should only be used by a single thread. */ + public abstract VectorValues getVectorValues(String field) throws IOException; + /** * Get the {@link FieldInfos} describing all fields in * this reader. 
diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java index afa2612b7abe..9b854828f6cd 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeReaderWrapper.java @@ -194,6 +194,11 @@ public PointValues getPointValues(String fieldName) throws IOException { return in.getPointValues(fieldName); } + @Override + public VectorValues getVectorValues(String fieldName) throws IOException { + return in.getVectorValues(fieldName); + } + @Override public int numDocs() { return in.numDocs(); diff --git a/lucene/core/src/java/org/apache/lucene/index/MergeState.java b/lucene/core/src/java/org/apache/lucene/index/MergeState.java index 0b291e72ccb6..a0052e78c381 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MergeState.java +++ b/lucene/core/src/java/org/apache/lucene/index/MergeState.java @@ -24,6 +24,7 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.VectorReader; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; @@ -77,6 +78,9 @@ public class MergeState { /** Point readers to merge */ public final PointsReader[] pointsReaders; + /** Vector readers to merge */ + public final VectorReader[] vectorReaders; + /** Max docs per reader */ public final int[] maxDocs; @@ -103,6 +107,7 @@ public class MergeState { termVectorsReaders = new TermVectorsReader[numReaders]; docValuesProducers = new DocValuesProducer[numReaders]; pointsReaders = new PointsReader[numReaders]; + vectorReaders = new VectorReader[numReaders]; fieldInfos = new FieldInfos[numReaders]; liveDocs = new Bits[numReaders]; @@ -139,6 +144,12 @@ public class MergeState { if (pointsReaders[i] != null) { pointsReaders[i] = pointsReaders[i].getMergeInstance(); } 
+ + vectorReaders[i] = reader.getVectorReader(); + if (vectorReaders[i] != null) { + vectorReaders[i] = vectorReaders[i].getMergeInstance(); + } + numDocs += reader.numDocs(); } diff --git a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java index 25f200a42431..1c61713f99d3 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/ParallelLeafReader.java @@ -369,6 +369,13 @@ public PointValues getPointValues(String fieldName) throws IOException { return reader == null ? null : reader.getPointValues(fieldName); } + @Override + public VectorValues getVectorValues(String fieldName) throws IOException { + ensureOpen(); + LeafReader reader = fieldToReader.get(fieldName); + return reader == null ? null : reader.getVectorValues(fieldName); + } + @Override public void checkIntegrity() throws IOException { ensureOpen(); diff --git a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java index 1c2838f5162c..10127fb2b174 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java @@ -657,7 +657,8 @@ public synchronized boolean writeFieldUpdates(Directory dir, FieldInfos.FieldNum private FieldInfo cloneFieldInfo(FieldInfo fi, int fieldNumber) { return new FieldInfo(fi.name, fieldNumber, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), fi.getDocValuesType(), fi.getDocValuesGen(), new HashMap<>(fi.attributes()), - fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField()); + fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), + fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField()); } private SegmentReader 
createNewReaderWithLatestLiveDocs(SegmentReader reader) throws IOException { diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java index 1f128218fa6e..da5eb34d827c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentCoreReaders.java @@ -30,6 +30,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompoundDirectory; import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.VectorReader; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.PostingsFormat; @@ -61,6 +62,7 @@ final class SegmentCoreReaders { final StoredFieldsReader fieldsReaderOrig; final TermVectorsReader termVectorsReaderOrig; final PointsReader pointsReader; + final VectorReader vectorReader; final CompoundDirectory cfsReader; final String segment; /** @@ -137,6 +139,13 @@ protected TermVectorsReader initialValue() { } else { pointsReader = null; } + + if (coreFieldInfos.hasVectorValues()) { + vectorReader = codec.vectorFormat().fieldsReader(segmentReadState); + } else { + vectorReader = null; + } + success = true; } catch (EOFException | FileNotFoundException e) { throw new CorruptIndexException("Problem reading index from " + dir, dir.toString(), e); @@ -168,7 +177,7 @@ void decRef() throws IOException { if (ref.decrementAndGet() == 0) { try (Closeable finalizer = this::notifyCoreClosedListeners){ IOUtils.close(termVectorsLocal, fieldsReaderLocal, fields, termVectorsReaderOrig, fieldsReaderOrig, - cfsReader, normsProducer, pointsReader); + cfsReader, normsProducer, pointsReader, vectorReader); } } } diff --git a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java index 4f5549734d9b..ed04ae01f47f 100644 --- 
a/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentMerger.java @@ -23,6 +23,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesConsumer; import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.VectorWriter; import org.apache.lucene.codecs.NormsConsumer; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsWriter; @@ -98,15 +99,8 @@ MergeState merge() throws IOException { throw new IllegalStateException("Merge would result in 0 document segment"); } mergeFieldInfos(); - long t0 = 0; - if (mergeState.infoStream.isEnabled("SM")) { - t0 = System.nanoTime(); - } - int numMerged = mergeFields(); - if (mergeState.infoStream.isEnabled("SM")) { - long t1 = System.nanoTime(); - mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge stored fields [" + numMerged + " docs]"); - } + + int numMerged = mergeWithLogging(this::mergeFields, "stored fields"); assert numMerged == mergeState.segmentInfo.maxDoc(): "numMerged=" + numMerged + " vs mergeState.segmentInfo.maxDoc()=" + mergeState.segmentInfo.maxDoc(); final SegmentWriteState segmentWriteState = new SegmentWriteState(mergeState.infoStream, directory, mergeState.segmentInfo, @@ -115,77 +109,29 @@ MergeState merge() throws IOException { IOContext.READ, segmentWriteState.segmentSuffix); if (mergeState.mergeFieldInfos.hasNorms()) { - if (mergeState.infoStream.isEnabled("SM")) { - t0 = System.nanoTime(); - } - mergeNorms(segmentWriteState); - if (mergeState.infoStream.isEnabled("SM")) { - long t1 = System.nanoTime(); - mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge norms [" + numMerged + " docs]"); - } + mergeWithLogging(() -> mergeNorms(segmentWriteState), "norms", numMerged); } - if (mergeState.infoStream.isEnabled("SM")) { - t0 = System.nanoTime(); - } - try (NormsProducer norms = mergeState.mergeFieldInfos.hasNorms() - ? 
codec.normsFormat().normsProducer(segmentReadState) - : null) { - NormsProducer normsMergeInstance = null; - if (norms != null) { - // Use the merge instance in order to reuse the same IndexInput for all terms - normsMergeInstance = norms.getMergeInstance(); - } - mergeTerms(segmentWriteState, normsMergeInstance); - } - if (mergeState.infoStream.isEnabled("SM")) { - long t1 = System.nanoTime(); - mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge postings [" + numMerged + " docs]"); - } + mergeWithLogging(() -> mergeTerms(segmentWriteState, segmentReadState), "postings", numMerged); - if (mergeState.infoStream.isEnabled("SM")) { - t0 = System.nanoTime(); - } if (mergeState.mergeFieldInfos.hasDocValues()) { - mergeDocValues(segmentWriteState); - } - if (mergeState.infoStream.isEnabled("SM")) { - long t1 = System.nanoTime(); - mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge doc values [" + numMerged + " docs]"); + mergeWithLogging(() -> mergeDocValues(segmentWriteState), "doc values", numMerged); } - if (mergeState.infoStream.isEnabled("SM")) { - t0 = System.nanoTime(); - } if (mergeState.mergeFieldInfos.hasPointValues()) { - mergePoints(segmentWriteState); + mergeWithLogging(() -> mergePoints(segmentWriteState), "points", numMerged); } - if (mergeState.infoStream.isEnabled("SM")) { - long t1 = System.nanoTime(); - mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge points [" + numMerged + " docs]"); + + if (mergeState.mergeFieldInfos.hasVectorValues()) { + mergeWithLogging(() -> mergeVectorValues(segmentWriteState), "numeric vectors", numMerged); } if (mergeState.mergeFieldInfos.hasVectors()) { - if (mergeState.infoStream.isEnabled("SM")) { - t0 = System.nanoTime(); - } - numMerged = mergeVectors(); - if (mergeState.infoStream.isEnabled("SM")) { - long t1 = System.nanoTime(); - mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to merge vectors [" + numMerged + " docs]"); - } - assert 
numMerged == mergeState.segmentInfo.maxDoc(); + mergeWithLogging(this::mergeTermVectors, "term vectors"); } - + // write the merged infos - if (mergeState.infoStream.isEnabled("SM")) { - t0 = System.nanoTime(); - } - codec.fieldInfosFormat().write(directory, mergeState.segmentInfo, "", mergeState.mergeFieldInfos, context); - if (mergeState.infoStream.isEnabled("SM")) { - long t1 = System.nanoTime(); - mergeState.infoStream.message("SM", ((t1-t0)/1000000) + " msec to write field infos [" + numMerged + " docs]"); - } + mergeWithLogging(() -> codec.fieldInfosFormat().write(directory, mergeState.segmentInfo, "", mergeState.mergeFieldInfos, context), "field infos", numMerged); return mergeState; } @@ -207,7 +153,22 @@ private void mergeNorms(SegmentWriteState segmentWriteState) throws IOException consumer.merge(mergeState); } } - + + private void mergeTerms(SegmentWriteState segmentWriteState, SegmentReadState segmentReadState) throws IOException { + try (NormsProducer norms = mergeState.mergeFieldInfos.hasNorms() + ? codec.normsFormat().normsProducer(segmentReadState) + : null) { + NormsProducer normsMergeInstance = null; + if (norms != null) { + // Use the merge instance in order to reuse the same IndexInput for all terms + normsMergeInstance = norms.getMergeInstance(); + } + try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState)) { + consumer.merge(mergeState, normsMergeInstance); + } + } + } + public void mergeFieldInfos() { for (FieldInfos readerFieldInfos : mergeState.fieldInfos) { for (FieldInfo fi : readerFieldInfos) { @@ -233,15 +194,51 @@ private int mergeFields() throws IOException { * Merge the TermVectors from each of the segments into the new one. 
* @throws IOException if there is a low-level IO error */ - private int mergeVectors() throws IOException { + private int mergeTermVectors() throws IOException { try (TermVectorsWriter termVectorsWriter = codec.termVectorsFormat().vectorsWriter(directory, mergeState.segmentInfo, context)) { - return termVectorsWriter.merge(mergeState); + int numMerged = termVectorsWriter.merge(mergeState); + assert numMerged == mergeState.segmentInfo.maxDoc(); + return numMerged; } } - private void mergeTerms(SegmentWriteState segmentWriteState, NormsProducer norms) throws IOException { - try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(segmentWriteState)) { - consumer.merge(mergeState, norms); + private void mergeVectorValues(SegmentWriteState segmentWriteState) throws IOException { + try (VectorWriter writer = codec.vectorFormat().fieldsWriter(segmentWriteState)) { + writer.merge(mergeState); } } + + private interface Merger { + int merge() throws IOException; + } + + private interface VoidMerger { + void merge() throws IOException; + } + + private int mergeWithLogging(Merger merger, String formatName) throws IOException { + long t0 = 0; + if (mergeState.infoStream.isEnabled("SM")) { + t0 = System.nanoTime(); + } + int numMerged = merger.merge(); + if (mergeState.infoStream.isEnabled("SM")) { + long t1 = System.nanoTime(); + mergeState.infoStream.message("SM", ((t1 - t0) / 1000000) + " msec to merge " + formatName + " [" + numMerged + " docs]"); + } + return numMerged; + } + + private void mergeWithLogging(VoidMerger merger, String formatName, int numMerged) throws IOException { + long t0 = 0; + if (mergeState.infoStream.isEnabled("SM")) { + t0 = System.nanoTime(); + } + merger.merge(); + if (mergeState.infoStream.isEnabled("SM")) { + long t1 = System.nanoTime(); + mergeState.infoStream.message("SM", ((t1 - t0) / 1000000) + " msec to merge " + formatName + " [" + numMerged + " docs]"); + } + } + } diff --git
a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java index 25145ff180f0..1da063a0b512 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java +++ b/lucene/core/src/java/org/apache/lucene/index/SegmentReader.java @@ -26,6 +26,7 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.FieldInfosFormat; import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.VectorReader; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; @@ -259,6 +260,11 @@ public DocValuesProducer getDocValuesReader() { return docValuesProducer; } + @Override + public VectorReader getVectorReader() { + return core.vectorReader; + } + @Override public FieldsProducer getPostingsReader() { ensureOpen(); diff --git a/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java b/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java index 533255780351..b2ce9aa4d80c 100644 --- a/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java +++ b/lucene/core/src/java/org/apache/lucene/index/SlowCodecReaderWrapper.java @@ -24,6 +24,7 @@ import org.apache.lucene.codecs.DocValuesProducer; import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.VectorReader; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PointsReader; import org.apache.lucene.codecs.StoredFieldsReader; @@ -78,6 +79,12 @@ public DocValuesProducer getDocValuesReader() { return readerToDocValuesProducer(reader); } + @Override + public VectorReader getVectorReader() { + reader.ensureOpen(); + return readerToVectorReader(reader); + } + @Override public FieldsProducer getPostingsReader() { reader.ensureOpen(); @@ -160,6 +167,29 @@ public long ramBytesUsed() { }; } + + private static VectorReader 
readerToVectorReader(LeafReader reader) { + return new VectorReader() { + @Override + public VectorValues getVectorValues(String field) throws IOException { + return reader.getVectorValues(field); + } + + @Override + public void checkIntegrity() { + // We already checkIntegrity the entire reader up front + } + + @Override + public void close() { + } + + @Override + public long ramBytesUsed() { + return 0L; + } + }; + } private static NormsProducer readerToNormsProducer(final LeafReader reader) { return new NormsProducer() { diff --git a/lucene/core/src/java/org/apache/lucene/index/VectorValues.java b/lucene/core/src/java/org/apache/lucene/index/VectorValues.java new file mode 100644 index 000000000000..45c79fc283db --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/index/VectorValues.java @@ -0,0 +1,285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.lucene.index; + +import java.io.IOException; + +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.util.BytesRef; + +/** + * This class provides access to per-document floating point vector values indexed as {@link + * org.apache.lucene.document.VectorField}. 
+ * + * @lucene.experimental + */ +public abstract class VectorValues extends DocIdSetIterator { + + /** The maximum number of dimensions of a vector */ + public static final int MAX_DIMENSIONS = 1024; + + /** Sole constructor */ + protected VectorValues() {} + + /** + * Return the dimension of the vectors + */ + public abstract int dimension(); + + /** + * TODO: should we use cost() for this? We rely on its always being exactly the number + * of documents having a value for this field, which is not guaranteed by the cost() contract, + * but in all the implementations so far they are the same. + * @return the number of vectors returned by this iterator + */ + public abstract int size(); + + /** + * Return the score function used to compare these vectors + */ + public abstract ScoreFunction scoreFunction(); + + /** + * Return the vector value for the current document ID. + * It is illegal to call this method when the iterator is not positioned: before advancing, or after failing to advance. + * The returned array may be shared across calls, re-used, and modified as the iterator advances. + * @return the vector value + */ + public abstract float[] vectorValue() throws IOException; + + /** + * Return the binary encoded vector value for the current document ID. These are the bytes + * corresponding to the float array returned by {@link #vectorValue}. It is illegal to call this + * method when the iterator is not positioned: before advancing, or after failing to advance. The + * returned storage may be shared across calls, re-used and modified as the iterator advances. + * @return the binary value + */ + public BytesRef binaryValue() throws IOException { + throw new UnsupportedOperationException(); + } + + /** + * Return a random access interface over this iterator's vectors. Calling the RandomAccess methods will + * have no effect on the progress of the iteration or the values returned by this iterator.
Successive calls + * will retrieve independent copies that do not overwrite each other's returned values. + */ + public abstract RandomAccess randomAccess(); + + /** + * Provides random access to vectors by dense ordinal. + * + * @lucene.experimental + */ + public interface RandomAccess { + + /** + * Return the number of vector values + */ + int size(); + + /** + * Return the dimension of the returned vector values + */ + int dimension(); + + /** + * Return the score function used to compare these vectors + */ + ScoreFunction scoreFunction(); + + /** + * Return the vector value indexed at the given ordinal. The provided floating point array may + * be shared and overwritten by subsequent calls to this method and {@link #binaryValue(int)}. + * @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}. + */ + float[] vectorValue(int targetOrd) throws IOException; + + /** + * Return the vector indexed at the given ordinal value as an array of bytes in a BytesRef; + * these are the bytes corresponding to the float array in IEEE 754 standard encoding, encoded + * using little-endian byte order. The provided bytes may be shared and overwritten by subsequent + * calls to this method and {@link #vectorValue(int)}. + * @param targetOrd a valid ordinal, &ge; 0 and &lt; {@link #size()}. + */ + BytesRef binaryValue(int targetOrd) throws IOException; + + /** + * Return the dense ordinal of the document if it has a vector. This ordinal ranges from 0 to one less than the number + * of documents having a vector in this iterator, and it is guaranteed to increase with increasing docid. + * @param docId the document whose ordinal is returned + * @return the ordinal of the given document, or -1 if the document has no vector value + */ + //int ordinal(int docId); + + /** + * Return the k nearest neighbor documents as determined by comparison of their vector values + * for this field, to the given vector, by the field's score function.
If the score function is + * reversed, lower values indicate nearer vectors, otherwise higher scores indicate nearer + * vectors. Unlike relevance scores, vector scores may be negative. + * @param target the vector-valued query + * @param k the number of docs to return + * @param fanout control the accuracy/speed tradeoff - larger values give better recall at higher cost + * @return the k nearest neighbor documents, along with their (scoreFunction-specific) scores. + */ + TopDocs search(float[] target, int k, int fanout) throws IOException; + } + + /** + * Score function. This is used during indexing and searching of the vectors to determine the nearest neighbors. + * Score values may be negative. By default high scores indicate nearer documents, unless the function is reversed. + */ + public enum ScoreFunction { + /** No distance function is used. Note: {@link VectorValues.RandomAccess#search(float[], int, int)} + * is not supported for fields specifying this score function. */ + NONE, + + /** Euclidean distance */ + EUCLIDEAN(true) { + @Override + public float score(float[] v1, float[] v2) { + assert v1.length == v2.length; + float squareSum = 0.0f; + int dim = v1.length; + for (int i = 0; i < dim; i++) { + float diff = v1[i] - v2[i]; + squareSum += diff * diff; + } + return squareSum; + } + }, + + /** dot product - note, may be negative; larger values are better */ + DOT_PRODUCT() { + @Override + public float score(float[] a, float[] b) { + float res = 0f; + /* + * If length of vector is larger than 8, we use unrolled dot product to accelerate the + * calculation. 
+ */ + int i; + for (i = 0; i < a.length % 8; i++) { + res += b[i] * a[i]; + } + if (a.length < 8) { + return res; + } + float s0 = 0f; + float s1 = 0f; + float s2 = 0f; + float s3 = 0f; + float s4 = 0f; + float s5 = 0f; + float s6 = 0f; + float s7 = 0f; + for (; i + 7 < a.length; i += 8) { + s0 += b[i] * a[i]; + s1 += b[i + 1] * a[i + 1]; + s2 += b[i + 2] * a[i + 2]; + s3 += b[i + 3] * a[i + 3]; + s4 += b[i + 4] * a[i + 4]; + s5 += b[i + 5] * a[i + 5]; + s6 += b[i + 6] * a[i + 6]; + s7 += b[i + 7] * a[i + 7]; + } + res += s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; + return res; + } + }; + + /** If reversed, smaller values are better */ + final public boolean reversed; + + ScoreFunction(boolean reversed) { + this.reversed = reversed; + } + + ScoreFunction() { + this(false); + } + + /** + * Calculates the score between the specified two vectors. + */ + public float score(float[] v1, float[] v2) { + throw new UnsupportedOperationException(); + } + + } + + /** + * Calculates a similarity score between the two vectors with specified function. + */ + public static float compare(float[] v1, float[] v2, ScoreFunction scoreFunction) { + assert v1.length == v2.length : "attempt to compare vectors of lengths: " + v1.length + " " + v2.length; + return scoreFunction.score(v1, v2); + } + + /** + * Represents the lack of vector values. It is returned by providers that do not + * support VectorValues. 
+ */ + public static final VectorValues EMPTY = new VectorValues() { + + @Override + public int size() { + return 0; + } + + @Override + public int dimension() { + return 0; + } + + @Override + public ScoreFunction scoreFunction() { + return ScoreFunction.NONE; + } + + @Override + public float[] vectorValue() { + throw new IllegalStateException("Attempt to get vectors from EMPTY values (which was not advanced)"); + } + + @Override + public RandomAccess randomAccess() { + throw new UnsupportedOperationException(); + } + + @Override + public int docID() { + throw new IllegalStateException("VectorValues is EMPTY, and not positioned on a doc"); + } + + @Override + public int nextDoc() { + return NO_MORE_DOCS; + } + + @Override + public int advance(int target) { + return NO_MORE_DOCS; + } + + @Override + public long cost() { + return 0; + } + }; +} diff --git a/lucene/core/src/java/org/apache/lucene/index/VectorValuesWriter.java b/lucene/core/src/java/org/apache/lucene/index/VectorValuesWriter.java new file mode 100644 index 000000000000..564b7b61a22d --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/index/VectorValuesWriter.java @@ -0,0 +1,322 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package org.apache.lucene.index;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.codecs.VectorWriter;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.Counter;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/** Buffers up pending vector value(s) per doc, then flushes when segment flushes. */
+class VectorValuesWriter {
+
+  private final FieldInfo fieldInfo;
+  private final Counter iwBytesUsed; // shared RAM counter; may be null, in which case nothing is reported
+  private final List<float[]> vectors = new ArrayList<>(); // one entry per added document, in docID order
+  private final DocsWithFieldSet docsWithField;
+
+  private int lastDocID = -1; // docID of the most recent addValue call; -1 before the first one
+
+  private long bytesUsed; // amount last reported to iwBytesUsed, so only deltas are added later
+
+  VectorValuesWriter(FieldInfo fieldInfo, Counter iwBytesUsed) {
+    this.fieldInfo = fieldInfo;
+    this.iwBytesUsed = iwBytesUsed;
+    this.docsWithField = new DocsWithFieldSet();
+    this.bytesUsed = docsWithField.ramBytesUsed();
+    if (iwBytesUsed != null) {
+      iwBytesUsed.addAndGet(bytesUsed);
+    }
+  }
+
+  /**
+   * Adds a value for the given document. Only a single value may be added.
+   * @param docID the value is added to this document
+   * @param vectorValue the value to add; its length must equal the field's vector dimension
+   * @throws IllegalArgumentException if a value has already been added to the given document, or the length is wrong
+   */
+  public void addValue(int docID, float[] vectorValue) {
+    if (docID == lastDocID) {
+      throw new IllegalArgumentException("VectorValuesField \"" + fieldInfo.name + "\" appears more than once in this document (only one value is allowed per field)");
+    }
+    if (vectorValue.length != fieldInfo.getVectorDimension()) {
+      throw new IllegalArgumentException("Attempt to index a vector of dimension " + vectorValue.length +
+          " but \"" + fieldInfo.name + "\" has dimension " + fieldInfo.getVectorDimension());
+    }
+    assert docID > lastDocID;
+    docsWithField.add(docID);
+    vectors.add(ArrayUtil.copyOfSubArray(vectorValue, 0, vectorValue.length)); // store a copy, not the caller's array
+    updateBytesUsed();
+    lastDocID = docID;
+  }
+
+  private void updateBytesUsed() { // sizes are multiplied as long so that large buffers cannot overflow int
+    final long newBytesUsed = docsWithField.ramBytesUsed()
+            + (long) vectors.size() * (RamUsageEstimator.NUM_BYTES_OBJECT_REF + RamUsageEstimator.NUM_BYTES_ARRAY_HEADER)
+            + (long) vectors.size() * vectors.get(0).length * Float.BYTES;
+    if (iwBytesUsed != null) {
+      iwBytesUsed.addAndGet(newBytesUsed - bytesUsed);
+    }
+    bytesUsed = newBytesUsed;
+  }
+
+  /**
+   * Flush this field's values to storage, sorting the values in accordance with sortMap
+   * @param sortMap specifies the order of documents being flushed, or null if they are to be flushed in docid order
+   * @param vectorWriter the Codec's vector writer that handles the actual encoding and I/O
+   * @throws IOException if there is an error writing the field and its values
+   */
+  public void flush(Sorter.DocMap sortMap, VectorWriter vectorWriter) throws IOException {
+    VectorValues vectorValues = new BufferedVectorValues(docsWithField, vectors, fieldInfo.getVectorDimension(), fieldInfo.getVectorScoreFunction());
+    if (sortMap != null) {
+      vectorWriter.writeField(fieldInfo, new SortingVectorValues(vectorValues, sortMap));
+    } else {
+      vectorWriter.writeField(fieldInfo, vectorValues);
+    }
+  }
+
+  private static class SortingVectorValues extends VectorValues { // iterates the delegate's vectors in sortMap's new docID order
+
+    private final VectorValues delegate;
+    private final VectorValues.RandomAccess randomAccess;
+    private final int[] docIdOffsets; // indexed by new docID: 1 + old ordinal, or 0 if the doc has no vector
+    private final int[] ordMap; // new dense ordinal -> old dense ordinal
+    private int docId = -1;
+
+    SortingVectorValues(VectorValues delegate, Sorter.DocMap sortMap) throws IOException {
+      this.delegate = delegate;
+      randomAccess = delegate.randomAccess();
+      docIdOffsets = new int[sortMap.size()];
+
+      int offset = 1; // 0 means no vector for this (field, document)
+      int docID;
+      while ((docID = delegate.nextDoc()) != NO_MORE_DOCS) {
+        int newDocID = sortMap.oldToNew(docID);
+        docIdOffsets[newDocID] = offset++;
+      }
+
+      // set up ordMap to map from new dense ordinal to old dense ordinal
+      ordMap = new int[offset - 1];
+      int ord = 0;
+      for (int docIdOffset : docIdOffsets) {
+        if (docIdOffset != 0) {
+          ordMap[ord++] = docIdOffset - 1;
+        }
+      }
+      assert ord == ordMap.length;
+    }
+
+    @Override
+    public int docID() {
+      return docId;
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      while (docId < docIdOffsets.length - 1) { // scan forward to the next new docID that has a vector
+        ++docId;
+        if (docIdOffsets[docId] != 0) {
+          return docId;
+        }
+      }
+      docId = NO_MORE_DOCS;
+      return docId;
+    }
+
+    @Override
+    public BytesRef binaryValue() throws IOException {
+      return randomAccess.binaryValue(docIdOffsets[docId] - 1); // offsets are stored 1-based; subtract 1 for the old ordinal
+    }
+
+    @Override
+    public float[] vectorValue() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int dimension() {
+      return delegate.dimension();
+    }
+
+    @Override
+    public int size() {
+      return delegate.size();
+    }
+
+    @Override
+    public ScoreFunction scoreFunction() {
+      return delegate.scoreFunction();
+    }
+
+    @Override
+    public int advance(int target) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long cost() {
+      return size();
+    }
+
+    @Override
+    public RandomAccess randomAccess() {
+      RandomAccess ra = delegate.randomAccess();
+      return new RandomAccess() { // addresses vectors by new (sorted) ordinal; only vectorValue(int) is supported
+
+        @Override
+        public int size() {
+          return delegate.size();
+        }
+
+        @Override
+        public int dimension() {
+          return delegate.dimension();
+        }
+
+        @Override
+        public ScoreFunction scoreFunction() {
+          return delegate.scoreFunction();
+        }
+
+        @Override
+        public float[] vectorValue(int targetOrd) throws IOException {
+          return ra.vectorValue(ordMap[targetOrd]);
+        }
+
+        @Override
+        public BytesRef binaryValue(int targetOrd) {
+          throw new UnsupportedOperationException();
+        }
+
+        @Override
+        public TopDocs search(float[] target, int k, int fanout) {
+          throw new UnsupportedOperationException();
+        }
+      };
+    }
+  }
+
+  private static class BufferedVectorValues extends VectorValues implements VectorValues.RandomAccess { // in-memory view; serves as its own RandomAccess
+
+    final DocsWithFieldSet docsWithField;
+
+    // These are always the vectors of a VectorValuesWriter, which are copied when added to it
+    final List<float[]> vectors;
+    final VectorValues.ScoreFunction scoreFunction;
+    final int dimension;
+
+    final ByteBuffer buffer; // scratch space behind binaryValue(), overwritten on every call
+    final BytesRef binaryValue;
+    final ByteBuffer raBuffer; // separate scratch space behind binaryValue(int)
+    final BytesRef raBinaryValue;
+
+    DocIdSetIterator docsWithFieldIter;
+    int ord = -1; // index into vectors of the current document; -1 before the first nextDoc()
+
+    BufferedVectorValues(DocsWithFieldSet docsWithField, List<float[]> vectors, int dimension, VectorValues.ScoreFunction scoreFunction) {
+      this.docsWithField = docsWithField;
+      this.vectors = vectors;
+      this.dimension = dimension;
+      this.scoreFunction = scoreFunction;
+      buffer = ByteBuffer.allocate(dimension * Float.BYTES); // NOTE(review): default order is big-endian, but RandomAccess#binaryValue documents little-endian - confirm
+      binaryValue = new BytesRef(buffer.array());
+      raBuffer = ByteBuffer.allocate(dimension * Float.BYTES);
+      raBinaryValue = new BytesRef(raBuffer.array());
+      docsWithFieldIter = docsWithField.iterator();
+    }
+
+    @Override
+    public RandomAccess randomAccess() {
+      return this;
+    }
+
+    @Override
+    public int dimension() {
+      return dimension;
+    }
+
+    @Override
+    public int size() {
+      return vectors.size();
+    }
+
+    @Override
+    public VectorValues.ScoreFunction scoreFunction() {
+      return scoreFunction;
+    }
+
+    @Override
+    public BytesRef binaryValue() {
+      buffer.asFloatBuffer().put(vectorValue()); // a fresh float view starts at position 0, so each call overwrites the buffer
+      return binaryValue;
+    }
+
+    @Override
+    public BytesRef binaryValue(int targetOrd) {
+      raBuffer.asFloatBuffer().put(vectors.get(targetOrd));
+      return raBinaryValue;
+    }
+
+    @Override
+    public float[] vectorValue() {
+      return vectors.get(ord);
+    }
+
+    @Override
+    public float[] vectorValue(int targetOrd) {
+      return vectors.get(targetOrd);
+    }
+
+    @Override
+    public int docID() {
+      return docsWithFieldIter.docID();
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      int docID = docsWithFieldIter.nextDoc();
+      if (docID != NO_MORE_DOCS) {
+        ++ord; // keep the vector index in step with the doc iterator
+      }
+      return docID;
+    }
+
+    @Override
+    public int advance(int target) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public long cost() {
+      return docsWithFieldIter.cost();
+    }
+
+    @Override
+    public TopDocs search(float[] target, int k, int fanout) throws IOException {
+      throw new UnsupportedOperationException();
+    }
+  }
+}
diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
index 2be0f71cc28d..7eec415dffa7 100644
--- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
+++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec
@@ -13,4 +13,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-org.apache.lucene.codecs.lucene87.Lucene87Codec +org.apache.lucene.codecs.lucene90.Lucene90Codec diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java index b6dc5a5a3afd..f5dbf405200f 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene87/TestLucene87StoredFieldsFormatHighCompression.java @@ -19,6 +19,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat.Mode; +import org.apache.lucene.codecs.lucene90.Lucene90Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.BaseStoredFieldsFormatTestCase; @@ -32,7 +33,7 @@ public class TestLucene87StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase { @Override protected Codec getCodec() { - return new Lucene87Codec(Mode.BEST_COMPRESSION); + return new Lucene90Codec(Mode.BEST_COMPRESSION); } /** @@ -43,7 +44,7 @@ public void testMixedCompressions() throws Exception { Directory dir = newDirectory(); for (int i = 0; i < 10; i++) { IndexWriterConfig iwc = newIndexWriterConfig(); - iwc.setCodec(new Lucene87Codec(RandomPicks.randomFrom(random(), Mode.values()))); + iwc.setCodec(new Lucene90Codec(RandomPicks.randomFrom(random(), Mode.values()))); IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig()); Document doc = new Document(); doc.add(new StoredField("field1", "value1")); @@ -70,7 +71,7 @@ public void testMixedCompressions() throws Exception { public void testInvalidOptions() { expectThrows(NullPointerException.class, () -> { - new Lucene87Codec(null); + new Lucene90Codec(null); }); expectThrows(NullPointerException.class, () -> { diff --git 
a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java index f71d6a28716a..81e2387d038d 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexableField.java @@ -106,6 +106,16 @@ public int pointNumBytes() { return 0; } + @Override + public int vectorDimension() { + return 0; + } + + @Override + public VectorValues.ScoreFunction vectorScoreFunction() { + return VectorValues.ScoreFunction.NONE; + } + @Override public Map getAttributes() { return null; diff --git a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java index a7c681189042..226f199c9dea 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestPendingSoftDeletes.java @@ -37,6 +37,8 @@ import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.Version; +import static org.apache.lucene.index.VectorValues.ScoreFunction.NONE; + public class TestPendingSoftDeletes extends TestPendingDeletes { @Override @@ -164,7 +166,7 @@ public void testApplyUpdates() throws IOException { deletes.onNewReader(segmentReader, commitInfo); reader.close(); writer.close(); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0, 0, true); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 0, Collections.emptyMap(), 0, 0, 0, 0, NONE, true); List docsDeleted = Arrays.asList(1, 3, 7, 8, DocIdSetIterator.NO_MORE_DOCS); List updates = Arrays.asList(singleUpdate(docsDeleted, 10, true)); for (DocValuesFieldUpdates update : updates) { @@ -185,7 +187,7 @@ public void testApplyUpdates() throws IOException { 
docsDeleted = Arrays.asList(1, 2, DocIdSetIterator.NO_MORE_DOCS); updates = Arrays.asList(singleUpdate(docsDeleted, 10, true)); - fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0, 0, true); + fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, 1, Collections.emptyMap(), 0, 0, 0, 0, NONE, true); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); } @@ -228,7 +230,7 @@ public void testUpdateAppliedOnlyOnce() throws IOException { SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); PendingDeletes deletes = newPendingDeletes(segmentInfo); deletes.onNewReader(segmentReader, segmentInfo); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, true); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, 0, NONE, true); List docsDeleted = Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS); List updates = Arrays.asList(singleUpdate(docsDeleted, 3, true)); for (DocValuesFieldUpdates update : updates) { @@ -276,7 +278,7 @@ public void testResetOnUpdate() throws IOException { SegmentCommitInfo segmentInfo = segmentReader.getSegmentInfo(); PendingDeletes deletes = newPendingDeletes(segmentInfo); deletes.onNewReader(segmentReader, segmentInfo); - FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, true); + FieldInfo fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), 
Collections.emptyMap(), 0, 0, 0, 0, NONE, true); List updates = Arrays.asList(singleUpdate(Arrays.asList(0, 1, DocIdSetIterator.NO_MORE_DOCS), 3, false)); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); @@ -295,7 +297,7 @@ public void testResetOnUpdate() throws IOException { assertEquals(0, deletes.numPendingDeletes()); segmentInfo.advanceDocValuesGen(); - fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, true); + fieldInfo = new FieldInfo("_soft_deletes", 1, false, false, false, IndexOptions.NONE, DocValuesType.NUMERIC, segmentInfo.getNextDocValuesGen(), Collections.emptyMap(), 0, 0, 0, 0, NONE, true); updates = Arrays.asList(singleUpdate(Arrays.asList(1, DocIdSetIterator.NO_MORE_DOCS), 3, true)); for (DocValuesFieldUpdates update : updates) { deletes.onDocValuesUpdate(fieldInfo, update.iterator()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java b/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java index 5ea8f835feeb..2be3536342a8 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestSegmentToThreadMapping.java @@ -106,6 +106,9 @@ public PointValues getPointValues(String field) { return null; } + @Override + public VectorValues getVectorValues(String field) { return null; } + @Override protected void doClose() { } diff --git a/lucene/core/src/test/org/apache/lucene/index/TestVectorValues.java b/lucene/core/src/test/org/apache/lucene/index/TestVectorValues.java new file mode 100644 index 000000000000..c1494c53511b --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/index/TestVectorValues.java @@ -0,0 +1,722 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + + +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.VectorField; +import org.apache.lucene.index.VectorValues.ScoreFunction; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.TestUtil; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +/** Test Indexing/IndexWriter with vectors */ +public class TestVectorValues extends LuceneTestCase { + + private IndexWriterConfig createIndexWriterConfig() { + IndexWriterConfig iwc = newIndexWriterConfig(); + iwc.setCodec(Codec.forName("Lucene90")); + return iwc; + } + + // Suddenly add vectors to an existing field: + public void testUpgradeFieldToVectors() throws Exception { + try (Directory dir = newDirectory()) { + 
try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(newStringField("f", "foo", Store.NO)); + w.addDocument(doc); + } + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + } + } + } + + public void testFieldConstructor() { + float[] v = new float[1]; + VectorField field = new VectorField("f", v); + assertEquals(1, field.fieldType().vectorDimension()); + assertEquals(ScoreFunction.EUCLIDEAN, field.fieldType().vectorScoreFunction()); + assertSame(v, field.vectorValue()); + } + + public void testFieldConstructorExceptions() { + expectThrows(IllegalArgumentException.class, () -> new VectorField(null, new float[1])); + expectThrows(IllegalArgumentException.class, () -> new VectorField("f", null)); + expectThrows(IllegalArgumentException.class, () -> new VectorField("f", new float[1], null)); + expectThrows(IllegalArgumentException.class, () -> new VectorField("f", new float[0])); + expectThrows(IllegalArgumentException.class, () -> new VectorField("f", new float[VectorValues.MAX_DIMENSIONS + 1])); + } + + public void testFieldSetValue() { + VectorField field = new VectorField("f", new float[1]); + float[] v1 = new float[1]; + field.setVectorValue(v1); + assertSame(v1, field.vectorValue()); + expectThrows(IllegalArgumentException.class, () -> field.setVectorValue(new float[2])); + expectThrows(IllegalArgumentException.class, () -> field.setVectorValue(null)); + } + + // Illegal schema change tests: + + public void testIllegalDimChangeTwoDocs() throws Exception { + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + if (random().nextBoolean()) { + // sometimes test with two 
segments + w.commit(); + } + + Document doc2 = new Document(); + doc2.add(new VectorField("f", new float[3], ScoreFunction.DOT_PRODUCT)); + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, + () -> w.addDocument(doc2)); + assertEquals("cannot change vector dimension from 4 to 3 for field=\"f\"", expected.getMessage()); + } + } + + public void testIllegalScoreFunctionChange() throws Exception { + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + if (random().nextBoolean()) { + // sometimes test with two segments + w.commit(); + } + + Document doc2 = new Document(); + doc2.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN)); + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, + () -> w.addDocument(doc2)); + assertEquals("cannot change vector score function from DOT_PRODUCT to EUCLIDEAN for field=\"f\"", expected.getMessage()); + } + } + + public void testIllegalDimChangeTwoWriters() throws Exception { + try (Directory dir = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + } + + try (IndexWriter w2 = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc2 = new Document(); + doc2.add(new VectorField("f", new float[1], ScoreFunction.DOT_PRODUCT)); + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, + () -> w2.addDocument(doc2)); + assertEquals("cannot change vector dimension from 4 to 1 for field=\"f\"", expected.getMessage()); + } + } + } + + public void testIllegalScoreFunctionChangeTwoWriters() throws Exception { + try (Directory dir = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, 
createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + } + + try (IndexWriter w2 = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc2 = new Document(); + doc2.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN)); + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, + () -> w2.addDocument(doc2)); + assertEquals("cannot change vector score function from DOT_PRODUCT to EUCLIDEAN for field=\"f\"", expected.getMessage()); + } + } + } + + public void testAddIndexesDirectory0() throws Exception { + String fieldName = "field"; + Document doc = new Document(); + doc.add(new VectorField(fieldName, new float[4], ScoreFunction.DOT_PRODUCT)); + try (Directory dir = newDirectory(); + Directory dir2 = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + w.addDocument(doc); + } + try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) { + w2.addIndexes(new Directory[]{dir}); + try (IndexReader reader = w2.getReader()) { + LeafReader r = reader.leaves().get(0).reader(); + VectorValues vectorValues = r.getVectorValues(fieldName); + assertEquals(0, vectorValues.nextDoc()); + assertEquals(0, vectorValues.vectorValue()[0], 0); + assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + } + } + } + } + + public void testAddIndexesDirectory1() throws Exception { + String fieldName = "field"; + Document doc = new Document(); + try (Directory dir = newDirectory(); + Directory dir2 = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + w.addDocument(doc); + } + doc.add(new VectorField(fieldName, new float[4], ScoreFunction.DOT_PRODUCT)); + try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) { + w2.addDocument(doc); + w2.addIndexes(new Directory[]{dir}); + try (IndexReader reader = w2.getReader()) { + LeafReader r = 
reader.leaves().get(0).reader(); + VectorValues vectorValues = r.getVectorValues(fieldName); + assertEquals(0, vectorValues.nextDoc()); + assertEquals(0, vectorValues.vectorValue()[0], 0); + assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + } + } + } + } + + public void testAddIndexesDirectory01() throws Exception { + String fieldName = "field"; + float[] vector = new float[1]; + Document doc = new Document(); + doc.add(new VectorField(fieldName, vector, ScoreFunction.DOT_PRODUCT)); + try (Directory dir = newDirectory(); + Directory dir2 = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + w.addDocument(doc); + } + try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) { + vector[0] = 1; + w2.addDocument(doc); + w2.addIndexes(new Directory[]{dir}); + w2.forceMerge(1); + try (IndexReader reader = w2.getReader()) { + LeafReader r = reader.leaves().get(0).reader(); + VectorValues vectorValues = r.getVectorValues(fieldName); + assertEquals(0, vectorValues.nextDoc()); + assertEquals(1, vectorValues.vectorValue()[0], 0); + assertEquals(1, vectorValues.nextDoc()); + assertEquals(0, vectorValues.vectorValue()[0], 0); + } + } + } + } + + public void testIllegalDimChangeViaAddIndexesDirectory() throws Exception { + try (Directory dir = newDirectory(); + Directory dir2 = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + } + try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[5], ScoreFunction.DOT_PRODUCT)); + w2.addDocument(doc); + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, + () -> w2.addIndexes(new Directory[]{dir})); + assertEquals("cannot change vector dimension from 5 to 4 for field=\"f\"", 
expected.getMessage()); + } + } + } + + public void testIllegalScoreFunctionChangeViaAddIndexesDirectory() throws Exception { + try (Directory dir = newDirectory(); + Directory dir2 = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + } + try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN)); + w2.addDocument(doc); + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, + () -> w2.addIndexes(dir)); + assertEquals("cannot change vector score function from EUCLIDEAN to DOT_PRODUCT for field=\"f\"", expected.getMessage()); + } + } + } + + public void testIllegalDimChangeViaAddIndexesCodecReader() throws Exception { + try (Directory dir = newDirectory(); + Directory dir2 = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + } + try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[5], ScoreFunction.DOT_PRODUCT)); + w2.addDocument(doc); + try (DirectoryReader r = DirectoryReader.open(dir)) { + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, + () -> w2.addIndexes(new CodecReader[]{(CodecReader) getOnlyLeafReader(r)})); + assertEquals("cannot change vector dimension from 5 to 4 for field=\"f\"", expected.getMessage()); + } + } + } + } + + public void testIllegalScoreFunctionChangeViaAddIndexesCodecReader() throws Exception { + try (Directory dir = newDirectory(); + Directory dir2 = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, 
createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + } + try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN)); + w2.addDocument(doc); + try (DirectoryReader r = DirectoryReader.open(dir)) { + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, + () -> w2.addIndexes(new CodecReader[]{(CodecReader) getOnlyLeafReader(r)})); + assertEquals("cannot change vector score function from EUCLIDEAN to DOT_PRODUCT for field=\"f\"", expected.getMessage()); + } + } + } + } + + public void testIllegalDimChangeViaAddIndexesSlowCodecReader() throws Exception { + try (Directory dir = newDirectory(); + Directory dir2 = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + } + try (IndexWriter w2 = new IndexWriter(dir2, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[5], ScoreFunction.DOT_PRODUCT)); + w2.addDocument(doc); + try (DirectoryReader r = DirectoryReader.open(dir)) { + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, + () -> TestUtil.addIndexesSlowly(w2, r)); + assertEquals("cannot change vector dimension from 5 to 4 for field=\"f\"", expected.getMessage()); + } + } + } + } + + public void testIllegalScoreFunctionChangeViaAddIndexesSlowCodecReader() throws Exception { + try (Directory dir = newDirectory(); + Directory dir2 = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + } + try (IndexWriter w2 
= new IndexWriter(dir2, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.EUCLIDEAN)); + w2.addDocument(doc); + try (DirectoryReader r = DirectoryReader.open(dir)) { + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, + () -> TestUtil.addIndexesSlowly(w2, r)); + assertEquals("cannot change vector score function from EUCLIDEAN to DOT_PRODUCT for field=\"f\"", expected.getMessage()); + } + } + } + } + + public void testIllegalMultipleValues() throws Exception { + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, + () -> w.addDocument(doc)); + assertEquals("VectorValuesField \"f\" appears more than once in this document (only one value is allowed per field)", + expected.getMessage()); + } + } + + public void testIllegalDimensionTooLarge() throws Exception { + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + expectThrows(IllegalArgumentException.class, + () -> doc.add(new VectorField("f", new float[VectorValues.MAX_DIMENSIONS + 1], ScoreFunction.DOT_PRODUCT))); + + Document doc2 = new Document(); + doc2.add(new VectorField("f", new float[1], ScoreFunction.EUCLIDEAN)); + w.addDocument(doc2); + } + } + + public void testIllegalEmptyVector() throws Exception { + try (Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + Exception e = expectThrows(IllegalArgumentException.class, + () -> doc.add(new VectorField("f", new float[0], ScoreFunction.NONE))); + assertEquals("cannot index an 
empty vector", e.getMessage()); + + Document doc2 = new Document(); + doc2.add(new VectorField("f", new float[1], ScoreFunction.NONE)); + w.addDocument(doc2); + } + } + + // Write vectors, one segment with default codec, another with SimpleText, then forceMerge + public void testDifferentCodecs1() throws Exception { + try (Directory dir = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + } + IndexWriterConfig iwc = newIndexWriterConfig(); + iwc.setCodec(Codec.forName("SimpleText")); + try (IndexWriter w = new IndexWriter(dir, iwc)) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + w.forceMerge(1); + } + } + } + + // Write vectors, one segment with SimpleText, another with default codec, then forceMerge + public void testDifferentCodecs2() throws Exception { + IndexWriterConfig iwc = newIndexWriterConfig(); + iwc.setCodec(Codec.forName("SimpleText")); + try (Directory dir = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, iwc)) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + } + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("f", new float[4], ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + w.forceMerge(1); + } + } + } + + public void testInvalidVectorFieldUsage() { + VectorField field = new VectorField("field", new float[2], ScoreFunction.NONE); + + expectThrows(IllegalArgumentException.class, () -> field.setIntValue(14)); + + expectThrows(IllegalArgumentException.class, () -> field.setVectorValue(new float[1])); + + assertNull(field.numericValue()); + } + + public void testDeleteAllVectorDocs() throws Exception { + try 
(Directory dir = newDirectory(); + IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new StringField("id", "0", Store.NO)); + doc.add(new VectorField("v", new float[]{2, 3, 5}, ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + w.addDocument(new Document()); + w.commit(); + + try (DirectoryReader r = w.getReader()) { + assertNotNull(r.leaves().get(0).reader().getVectorValues("v")); + } + w.deleteDocuments(new Term("id", "0")); + w.forceMerge(1); + try (DirectoryReader r = w.getReader()) { + assertNull(r.leaves().get(0).reader().getVectorValues("v")); + } + } + } + + public void testVectorFieldMissingFromOneSegment() throws Exception { + try (Directory dir = FSDirectory.open(createTempDir()); + IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new StringField("id", "0", Store.NO)); + doc.add(new VectorField("v0", new float[]{2, 3, 5}, ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + w.commit(); + + doc = new Document(); + doc.add(new VectorField("v1", new float[]{2, 3, 5}, ScoreFunction.DOT_PRODUCT)); + w.addDocument(doc); + w.forceMerge(1); + } + } + + public void testSparseVectors() throws Exception { + int numDocs = atLeast(1000); + int numFields = TestUtil.nextInt(random(), 1, 10); + int[] fieldDocCounts = new int[numFields]; + float[] fieldTotals= new float[numFields]; + int[] fieldDims = new int[numFields]; + ScoreFunction[] fieldScoreFunctions = new ScoreFunction[numFields]; + for (int i = 0; i < numFields; i++) { + fieldDims[i] = random().nextInt(20) + 1; + fieldScoreFunctions[i] = ScoreFunction.values()[random().nextInt(ScoreFunction.values().length)]; + } + try (Directory dir = newDirectory(); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, createIndexWriterConfig())) { + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + for (int field = 0; field < numFields; field++) { + String fieldName = "int" + 
field; + if (random().nextInt(100) == 17) { + float[] v = randomVector(fieldDims[field]); + doc.add(new VectorField(fieldName, v, fieldScoreFunctions[field])); + fieldDocCounts[field]++; + fieldTotals[field] += v[0]; + } + } + w.addDocument(doc); + } + + try (IndexReader r = w.getReader()) { + for (int field = 0; field < numFields; field++) { + int docCount = 0; + float checksum = 0; + String fieldName = "int" + field; + for (LeafReaderContext ctx : r.leaves()) { + VectorValues vectors = ctx.reader().getVectorValues(fieldName); + if (vectors != null) { + docCount += vectors.size(); + while (vectors.nextDoc() != NO_MORE_DOCS) { + checksum += vectors.vectorValue()[0]; + } + } + } + assertEquals(fieldDocCounts[field], docCount); + assertEquals(fieldTotals[field], checksum, 1e-5); + } + } + } + } + + public void testIndexedValueNotAliased() throws Exception { + // We copy indexed values (as for BinaryDocValues) so the input float[] can be reused across + // calls to IndexWriter.addDocument. 
+ String fieldName = "field"; + float[] v = { 0 }; + try (Directory dir = newDirectory(); + IndexWriter iw = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc1 = new Document(); + doc1.add(new VectorField(fieldName, v, VectorValues.ScoreFunction.EUCLIDEAN)); + v[0] = 1; + Document doc2 = new Document(); + doc2.add(new VectorField(fieldName, v, VectorValues.ScoreFunction.EUCLIDEAN)); + iw.addDocument(doc1); + iw.addDocument(doc2); + v[0] = 2; + Document doc3 = new Document(); + doc3.add(new VectorField(fieldName, v, VectorValues.ScoreFunction.EUCLIDEAN)); + iw.addDocument(doc3); + try (IndexReader reader = iw.getReader()) { + LeafReader r = reader.leaves().get(0).reader(); + VectorValues vectorValues = r.getVectorValues(fieldName); + vectorValues.nextDoc(); + assertEquals(1, vectorValues.vectorValue()[0], 0); + vectorValues.nextDoc(); + assertEquals(1, vectorValues.vectorValue()[0], 0); + vectorValues.nextDoc(); + assertEquals(2, vectorValues.vectorValue()[0], 0); + } + } + } + + public void testSortedIndex() throws Exception { + IndexWriterConfig iwc = createIndexWriterConfig(); + iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); + String fieldName = "field"; + try (Directory dir = newDirectory(); + IndexWriter iw = new IndexWriter(dir, iwc)) { + add(iw, fieldName, 1, 1, new float[]{1}); + add(iw, fieldName, 4, 4, new float[]{4}); + add(iw, fieldName, 3, 3, null); + add(iw, fieldName, 2, 2, new float[]{2}); + try (IndexReader reader = iw.getReader()) { + LeafReader leaf = reader.leaves().get(0).reader(); + + VectorValues vectorValues = leaf.getVectorValues(fieldName); + assertEquals(1, vectorValues.dimension()); + assertEquals(3, vectorValues.size()); + assertEquals("1", leaf.document(vectorValues.nextDoc()).get("id")); + assertEquals(1f, vectorValues.vectorValue()[0], 0); + assertEquals("2", leaf.document(vectorValues.nextDoc()).get("id")); + assertEquals(2f, vectorValues.vectorValue()[0], 0); + assertEquals("4", 
leaf.document(vectorValues.nextDoc()).get("id")); + assertEquals(4f, vectorValues.vectorValue()[0], 0); + assertEquals(NO_MORE_DOCS, vectorValues.nextDoc()); + + VectorValues.RandomAccess ra = vectorValues.randomAccess(); + assertEquals(1f, ra.vectorValue(0)[0], 0); + assertEquals(2f, ra.vectorValue(1)[0], 0); + assertEquals(4f, ra.vectorValue(2)[0], 0); + } + } + } + + /** + * Index random vectors, sometimes skipping documents, sometimes deleting a document, + * sometimes merging, sometimes sorting the index, + * and verify that the expected values can be read back consistently. + */ + public void testRandom() throws Exception { + IndexWriterConfig iwc = createIndexWriterConfig(); + if (random().nextBoolean()) { + iwc.setIndexSort(new Sort(new SortField("sortkey", SortField.Type.INT))); + } + String fieldName = "field"; + try (Directory dir = newDirectory(); + IndexWriter iw = new IndexWriter(dir, iwc)) { + int numDoc = atLeast(100); + int dimension = atLeast(10); + float[] scratch = new float[dimension]; + int numValues = 0; + float[][] values = new float[numDoc][]; + for (int i = 0; i < numDoc; i++) { + if (random().nextInt(7) != 3) { + // usually index a vector value for a doc + values[i] = randomVector(dimension); + ++numValues; + } + if (random().nextBoolean() && values[i] != null) { + // sometimes use a shared scratch array + System.arraycopy(values[i], 0, scratch, 0, scratch.length); + add(iw, fieldName, i, scratch); + } else { + add(iw, fieldName, i, values[i]); + } + if (random().nextInt(10) == 2) { + // sometimes delete a random document + int idToDelete = random().nextInt(i + 1); + iw.deleteDocuments(new Term("id", Integer.toString(idToDelete))); + // and remember that it was deleted + if (values[idToDelete] != null) { + values[idToDelete] = null; + --numValues; + } + } + if (random().nextInt(10) == 3) { + iw.commit(); + } + } + iw.forceMerge(1); + try (IndexReader reader = iw.getReader()) { + int valueCount = 0, totalSize = 0; + for (LeafReaderContext 
ctx : reader.leaves()) { + VectorValues vectorValues = ctx.reader().getVectorValues(fieldName); + if (vectorValues == null) { + continue; + } + totalSize += vectorValues.size(); + int docId; + while ((docId = vectorValues.nextDoc()) != NO_MORE_DOCS) { + float[] v = vectorValues.vectorValue(); + assertEquals(dimension, v.length); + String idString = ctx.reader().document(docId).getField("id").stringValue(); + int id = Integer.parseInt(idString); + assertArrayEquals(idString, values[id], v, 0); + ++valueCount; + } + } + assertEquals(numValues, valueCount); + assertEquals(numValues, totalSize); + } + } + } + + private void add(IndexWriter iw, String field, int id, float[] vector) throws IOException { + add(iw, field, id, random().nextInt(100), vector); + } + + private void add(IndexWriter iw, String field, int id, int sortkey, float[] vector) throws IOException { + Document doc = new Document(); + if (vector != null) { + doc.add(new VectorField(field, vector)); + } + doc.add(new NumericDocValuesField("sortkey", sortkey)); + doc.add(new StringField("id", Integer.toString(id), Field.Store.YES)); + iw.addDocument(doc); + } + + private float[] randomVector(int dim) { + float[] v = new float[dim]; + for (int i = 0; i < dim; i++) { + v[i] = random().nextFloat(); + } + return v; + } + + public void testCheckIndexIncludesVectors() throws Exception { + try (Directory dir = newDirectory()) { + try (IndexWriter w = new IndexWriter(dir, createIndexWriterConfig())) { + Document doc = new Document(); + doc.add(new VectorField("v1", randomVector(3), ScoreFunction.NONE)); + w.addDocument(doc); + + doc.add(new VectorField("v2", randomVector(3), ScoreFunction.NONE)); + w.addDocument(doc); + } + + ByteArrayOutputStream output = new ByteArrayOutputStream(); + CheckIndex.Status status = TestUtil.checkIndex(dir, false, true, output); + assertEquals(1, status.segmentInfos.size()); + CheckIndex.Status.SegmentInfoStatus segStatus = status.segmentInfos.get(0); + // total 3 vector values were 
indexed: + assertEquals(3, segStatus.vectorValuesStatus.totalVectorValues); + // ... across 2 fields: + assertEquals(2, segStatus.vectorValuesStatus.totalVectorFields); + + // Make sure CheckIndex in fact declares that it is testing vectors! + assertTrue(output.toString(IOUtils.UTF_8).contains("test: vectors...")); + } + } + + public void testScoreFunctionIdentifiers() throws Exception { + // make sure we don't accidentally mess up score function identifiers by re-ordering their enumerators + assertEquals(0, ScoreFunction.NONE.ordinal()); + assertEquals(1, ScoreFunction.EUCLIDEAN.ordinal()); + assertEquals(2, ScoreFunction.DOT_PRODUCT.ordinal()); + assertEquals(3, ScoreFunction.values().length); + } + +} diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index 10319f9eb54e..01570aa01694 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -854,7 +854,7 @@ public void testPrimaryKeys() throws Exception { System.out.println("TEST: cycle=" + cycle); } RandomIndexWriter w = new RandomIndexWriter(random(), dir, - newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(IndexWriterConfig.OpenMode.CREATE)); + newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(IndexWriterConfig.OpenMode.CREATE)); Document doc = new Document(); Field idField = newStringField("id", "", Field.Store.NO); doc.add(idField); diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java index 42f7aec1a38e..f5f8c5c8a606 100644 --- a/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java +++ b/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java @@ -35,6 +35,7 @@ import 
org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Terms; +import org.apache.lucene.index.VectorValues; import org.apache.lucene.util.Bits; import org.apache.lucene.util.Version; @@ -81,7 +82,7 @@ public int size() { } FieldInfo fieldInfo = new FieldInfo(field, 0, true, true, terms.hasPayloads(), - indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, 0, false); + indexOptions, DocValuesType.NONE, -1, Collections.emptyMap(), 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, false); fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo}); } @@ -139,6 +140,11 @@ public PointValues getPointValues(String fieldName) { return null; } + @Override + public VectorValues getVectorValues(String fieldName) { + return null; + } + @Override public void checkIntegrity() throws IOException { } diff --git a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java index 705e0cc744d2..ed00f85f7c34 100644 --- a/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java +++ b/lucene/memory/src/java/org/apache/lucene/index/memory/MemoryIndex.java @@ -501,7 +501,8 @@ private FieldInfo createFieldInfo(String fieldName, int ord, IndexableFieldType IndexOptions indexOptions = storeOffsets ? 
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS; return new FieldInfo(fieldName, ord, fieldType.storeTermVectors(), fieldType.omitNorms(), storePayloads, indexOptions, fieldType.docValuesType(), -1, Collections.emptyMap(), - fieldType.pointDimensionCount(), fieldType.pointIndexDimensionCount(), fieldType.pointNumBytes(), false); + fieldType.pointDimensionCount(), fieldType.pointIndexDimensionCount(), fieldType.pointNumBytes(), + fieldType.vectorDimension(), fieldType.vectorScoreFunction(), false); } private void storePointValues(Info info, BytesRef pointValue) { @@ -521,6 +522,7 @@ private void storeDocValues(Info info, DocValuesType docValuesType, Object docVa info.fieldInfo.name, info.fieldInfo.number, info.fieldInfo.hasVectors(), info.fieldInfo.hasPayloads(), info.fieldInfo.hasPayloads(), info.fieldInfo.getIndexOptions(), docValuesType, -1, info.fieldInfo.attributes(), info.fieldInfo.getPointDimensionCount(), info.fieldInfo.getPointIndexDimensionCount(), info.fieldInfo.getPointNumBytes(), + info.fieldInfo.getVectorDimension(), info.fieldInfo.getVectorScoreFunction(), info.fieldInfo.isSoftDeletesField() ); } else if (existingDocValuesType != docValuesType) { @@ -1241,6 +1243,11 @@ public PointValues getPointValues(String fieldName) { return new MemoryIndexPointValues(info); } + @Override + public VectorValues getVectorValues(String fieldName) { + return VectorValues.EMPTY; + } + @Override public void checkIntegrity() throws IOException { // no-op diff --git a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java index f4a7c9912b86..5fa5072626cb 100644 --- a/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java +++ b/lucene/suggest/src/test/org/apache/lucene/search/suggest/document/TestSuggestField.java @@ -39,7 +39,7 @@ import 
org.apache.lucene.analysis.tokenattributes.TypeAttribute; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene87.Lucene87Codec; +import org.apache.lucene.codecs.lucene90.Lucene90Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.IntPoint; @@ -887,7 +887,7 @@ static IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, String... sugges static IndexWriterConfig iwcWithSuggestField(Analyzer analyzer, final Set suggestFields) { IndexWriterConfig iwc = newIndexWriterConfig(random(), analyzer); iwc.setMergePolicy(newLogMergePolicy()); - Codec filterCodec = new Lucene87Codec() { + Codec filterCodec = new Lucene90Codec() { CompletionPostingsFormat.FSTLoadMode fstLoadMode = RandomPicks.randomFrom(random(), CompletionPostingsFormat.FSTLoadMode.values()); PostingsFormat postingsFormat = new Completion84PostingsFormat(fstLoadMode); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java index ccc7a000846b..6d6b31158089 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/BaseIndexFileFormatTestCase.java @@ -354,7 +354,8 @@ public void testMultiClose() throws IOException { FieldInfo proto = oneDocReader.getFieldInfos().fieldInfo("field"); FieldInfo field = new FieldInfo(proto.name, proto.number, proto.hasVectors(), proto.omitsNorms(), proto.hasPayloads(), proto.getIndexOptions(), proto.getDocValuesType(), proto.getDocValuesGen(), new HashMap<>(), - proto.getPointDimensionCount(), proto.getPointIndexDimensionCount(), proto.getPointNumBytes(), proto.isSoftDeletesField()); + proto.getPointDimensionCount(), proto.getPointIndexDimensionCount(), proto.getPointNumBytes(), + 
proto.getVectorDimension(), proto.getVectorScoreFunction(), proto.isSoftDeletesField()); FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { field } ); diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java index dd74a15dbd8e..ed8a1107c13d 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/MismatchedLeafReader.java @@ -80,6 +80,8 @@ static FieldInfos shuffleInfos(FieldInfos infos, Random random) { oldInfo.getPointDimensionCount(), // data dimension count oldInfo.getPointIndexDimensionCount(), // index dimension count oldInfo.getPointNumBytes(), // dimension numBytes + oldInfo.getVectorDimension(), // number of dimensions of the field's vector + oldInfo.getVectorScoreFunction(), // distance function for calculating similarity of the field's vector oldInfo.isSoftDeletesField()); // used as soft-deletes field shuffled.set(i, newInfo); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java index 2a21bab87444..366a0d9206f9 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java +++ b/lucene/test-framework/src/java/org/apache/lucene/index/RandomPostingsTester.java @@ -130,7 +130,7 @@ public RandomPostingsTester(Random random) throws IOException { fieldInfoArray[fieldUpto] = new FieldInfo(field, fieldUpto, false, false, true, IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS, DocValuesType.NONE, -1, new HashMap<>(), - 0, 0, 0, false); + 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, false); fieldUpto++; SortedMap postings = new TreeMap<>(); @@ -651,7 +651,7 @@ public FieldsProducer buildIndex(Codec codec, Directory dir, IndexOptions maxAll DocValuesType.NONE, -1, new HashMap<>(), - 0, 
0, 0, false); + 0, 0, 0, 0, VectorValues.ScoreFunction.NONE, false); } FieldInfos newFieldInfos = new FieldInfos(newFieldInfoArray); diff --git a/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java b/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java index fdd5fb2f8c23..e4168f3a817c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java +++ b/lucene/test-framework/src/java/org/apache/lucene/search/QueryUtils.java @@ -35,6 +35,7 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Terms; +import org.apache.lucene.index.VectorValues; import org.apache.lucene.util.Bits; import org.apache.lucene.util.LuceneTestCase; import org.apache.lucene.util.Version; @@ -207,6 +208,11 @@ public NumericDocValues getNormValues(String field) throws IOException { return null; } + @Override + public VectorValues getVectorValues(String field) throws IOException { + return null; + } + @Override public FieldInfos getFieldInfos() { return FieldInfos.EMPTY; diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java index 81cb328aada2..e15a59a13c8a 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestRuleSetupAndRestoreClassEnv.java @@ -34,7 +34,7 @@ import org.apache.lucene.codecs.cheapbastard.CheapBastardCodec; import org.apache.lucene.codecs.compressing.CompressingCodec; import org.apache.lucene.codecs.lucene87.Lucene87StoredFieldsFormat; -import org.apache.lucene.codecs.lucene87.Lucene87Codec; +import org.apache.lucene.codecs.lucene90.Lucene90Codec; import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat; import org.apache.lucene.codecs.simpletext.SimpleTextCodec; import 
org.apache.lucene.index.RandomCodec; @@ -187,8 +187,8 @@ public String toString() { codec = new AssertingCodec(); } else if ("Compressing".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 6 && !shouldAvoidCodec("Compressing"))) { codec = CompressingCodec.randomInstance(random); - } else if ("Lucene87".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene87"))) { - codec = new Lucene87Codec(RandomPicks.randomFrom(random, Lucene87StoredFieldsFormat.Mode.values()) + } else if ("Lucene90".equals(TEST_CODEC) || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene90"))) { + codec = new Lucene90Codec(RandomPicks.randomFrom(random, Lucene87StoredFieldsFormat.Mode.values()) ); } else if (!"random".equals(TEST_CODEC)) { codec = Codec.forName(TEST_CODEC); diff --git a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java index 7104a85c2424..be27b0c23c5c 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java +++ b/lucene/test-framework/src/java/org/apache/lucene/util/TestUtil.java @@ -46,6 +46,8 @@ import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import com.carrotsearch.randomizedtesting.generators.RandomPicks; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.DocValuesFormat; import org.apache.lucene.codecs.PostingsFormat; @@ -54,7 +56,7 @@ import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat; import org.apache.lucene.codecs.lucene80.Lucene80DocValuesFormat; import org.apache.lucene.codecs.lucene84.Lucene84PostingsFormat; -import org.apache.lucene.codecs.lucene87.Lucene87Codec; +import org.apache.lucene.codecs.lucene90.Lucene90Codec; import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; import 
org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; import org.apache.lucene.document.BinaryDocValuesField; @@ -101,9 +103,6 @@ import org.apache.lucene.store.NoLockFactory; import org.junit.Assert; -import com.carrotsearch.randomizedtesting.generators.RandomNumbers; -import com.carrotsearch.randomizedtesting.generators.RandomPicks; - /** * General utility methods for Lucene unit tests. */ @@ -919,7 +918,7 @@ public DocValuesFormat getDocValuesFormatForField(String field) { * This may be different than {@link Codec#getDefault()} because that is randomized. */ public static Codec getDefaultCodec() { - return new Lucene87Codec(); + return new Lucene90Codec(); } /** diff --git a/solr/core/src/java/org/apache/solr/index/SlowCompositeReaderWrapper.java b/solr/core/src/java/org/apache/solr/index/SlowCompositeReaderWrapper.java index 4f1b56cecaaa..33c23a17ddd9 100644 --- a/solr/core/src/java/org/apache/solr/index/SlowCompositeReaderWrapper.java +++ b/solr/core/src/java/org/apache/solr/index/SlowCompositeReaderWrapper.java @@ -276,6 +276,12 @@ public PointValues getPointValues(String field) { return null; // because not supported. Throw UOE? } + @Override + public VectorValues getVectorValues(String field) { + ensureOpen(); + return null; // because not supported. Throw UOE? 
+ } + @Override public FieldInfos getFieldInfos() { return fieldInfos; diff --git a/solr/core/src/java/org/apache/solr/schema/SchemaField.java b/solr/core/src/java/org/apache/solr/schema/SchemaField.java index 2f60ccc1441f..cea4536ce4f1 100644 --- a/solr/core/src/java/org/apache/solr/schema/SchemaField.java +++ b/solr/core/src/java/org/apache/solr/schema/SchemaField.java @@ -26,6 +26,7 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.IndexableFieldType; +import org.apache.lucene.index.VectorValues; import org.apache.lucene.search.SortField; import org.apache.solr.common.SolrException; import org.apache.solr.common.util.SimpleOrderedMap; @@ -447,6 +448,16 @@ public int pointNumBytes() { return 0; } + @Override + public int vectorDimension() { + return 0; + } + + @Override + public VectorValues.ScoreFunction vectorScoreFunction() { + return VectorValues.ScoreFunction.NONE; + } + @Override public Map getAttributes() { return null; diff --git a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java index 9a1e740bd8fb..a82a64174eac 100644 --- a/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java +++ b/solr/core/src/java/org/apache/solr/search/CollapsingQParserPlugin.java @@ -493,6 +493,8 @@ private static class ReaderWrapper extends FilterLeafReader { fieldInfo.getPointDimensionCount(), fieldInfo.getPointIndexDimensionCount(), fieldInfo.getPointNumBytes(), + fieldInfo.getVectorDimension(), + fieldInfo.getVectorScoreFunction(), fieldInfo.isSoftDeletesField()); newInfos.add(f); } else { diff --git a/solr/core/src/java/org/apache/solr/search/Insanity.java b/solr/core/src/java/org/apache/solr/search/Insanity.java index 18b760f63ed7..7eae21750a54 100644 --- a/solr/core/src/java/org/apache/solr/search/Insanity.java +++ b/solr/core/src/java/org/apache/solr/search/Insanity.java @@ -67,7 +67,8 @@ 
private static class InsaneReader extends FilterLeafReader { if (fi.name.equals(insaneField)) { filteredInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), DocValuesType.NONE, -1, Collections.emptyMap(), - fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField())); + fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), + fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField())); } else { filteredInfos.add(fi); } diff --git a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java index d83e70ebb5d9..ed62cc4e3fc3 100644 --- a/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java +++ b/solr/core/src/java/org/apache/solr/uninverting/UninvertingReader.java @@ -284,7 +284,8 @@ public static LeafReader wrap(LeafReader in, Function mapping) { wrap = true; newFieldInfos.add(new FieldInfo(fi.name, fi.number, fi.hasVectors(), fi.omitsNorms(), fi.hasPayloads(), fi.getIndexOptions(), type, fi.getDocValuesGen(), fi.attributes(), - fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), fi.isSoftDeletesField())); + fi.getPointDimensionCount(), fi.getPointIndexDimensionCount(), fi.getPointNumBytes(), + fi.getVectorDimension(), fi.getVectorScoreFunction(), fi.isSoftDeletesField())); } else { newFieldInfos.add(fi); } diff --git a/solr/core/src/test/org/apache/solr/search/TestDocSet.java b/solr/core/src/test/org/apache/solr/search/TestDocSet.java index 00ee6ec7aef2..822830f58c41 100644 --- a/solr/core/src/test/org/apache/solr/search/TestDocSet.java +++ b/solr/core/src/test/org/apache/solr/search/TestDocSet.java @@ -37,6 +37,7 @@ import org.apache.lucene.index.SortedSetDocValues; import org.apache.lucene.index.StoredFieldVisitor; import org.apache.lucene.index.Terms; +import 
org.apache.lucene.index.VectorValues; import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.TotalHits; @@ -343,6 +344,11 @@ public PointValues getPointValues(String field) { return null; } + @Override + public VectorValues getVectorValues(String field) { + return null; + } + @Override protected void doClose() { }