diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index f7d5356e8438..315a8311e674 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -33,6 +33,7 @@ exports org.apache.lucene.codecs.lucene94; exports org.apache.lucene.codecs.lucene95; exports org.apache.lucene.codecs.lucene99; + exports org.apache.lucene.codecs.lucene910; exports org.apache.lucene.codecs.lucene90.blocktree; exports org.apache.lucene.codecs.lucene90.compressing; exports org.apache.lucene.codecs.perfield; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910Codec.java new file mode 100644 index 000000000000..87c974b43d26 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910Codec.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene910; + +import java.util.Objects; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CompoundFormat; +import org.apache.lucene.codecs.DataCubesFormat; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PointsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; +import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +/** + * TODO : need to make this default codec and move Lucene99 to backwards codec Implements the Lucene + * 9.10 index format + * + *
If you want to reuse functionality of this codec in another codec, extend {@link
+ * org.apache.lucene.codecs.FilterCodec}.
+ *
+ * @see org.apache.lucene.codecs.lucene910 package documentation for file format details.
+ * @lucene.experimental
+ */
+public class Lucene910Codec extends Codec {
+ /** Configuration option for the codec: selects the stored fields compression mode. */
+ public enum Mode {
+ /** Trade compression ratio for retrieval speed. */
+ BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED),
+ /** Trade retrieval speed for compression ratio. */
+ BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION);
+
+ private final Lucene90StoredFieldsFormat.Mode storedMode; // Lucene90 stored-fields mode this option maps to
+
+ private Mode(Lucene90StoredFieldsFormat.Mode storedMode) {
+ this.storedMode = Objects.requireNonNull(storedMode); // fail fast on a null mapping
+ }
+ }
+
+ private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat();
+ private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat();
+ private final SegmentInfoFormat segmentInfosFormat = new Lucene910SegmentInfoFormat();
+ private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat();
+ private final CompoundFormat compoundFormat = new Lucene90CompoundFormat();
+ private final NormsFormat normsFormat = new Lucene90NormsFormat();
+ private final PostingsFormat defaultPostingsFormat; // assigned in the constructor
+ private final PostingsFormat postingsFormat =
+ new PerFieldPostingsFormat() {
+ @Override
+ public PostingsFormat getPostingsFormatForField(String field) {
+ return Lucene910Codec.this.getPostingsFormatForField(field); // delegate to the codec's overridable hook
+ }
+ };
+
+ private final DocValuesFormat defaultDVFormat; // assigned in the constructor
+ private final DocValuesFormat docValuesFormat =
+ new PerFieldDocValuesFormat() {
+ @Override
+ public DocValuesFormat getDocValuesFormatForField(String field) {
+ return Lucene910Codec.this.getDocValuesFormatForField(field); // delegate to the codec's overridable hook
+ }
+ };
+
+ private final KnnVectorsFormat defaultKnnVectorsFormat; // assigned in the constructor
+ private final KnnVectorsFormat knnVectorsFormat =
+ new PerFieldKnnVectorsFormat() {
+ @Override
+ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+ return Lucene910Codec.this.getKnnVectorsFormatForField(field); // delegate to the codec's overridable hook
+ }
+ };
+
+ private final StoredFieldsFormat storedFieldsFormat; // built in the constructor from the selected Mode
+
+ /** Instantiates a new codec that uses {@link Mode#BEST_SPEED} for stored fields. */
+ public Lucene910Codec() {
+ this(Lucene910Codec.Mode.BEST_SPEED);
+ }
+
+ /**
+ * Instantiates a new codec, specifying the stored fields compression mode to use.
+ *
+ * @param mode stored fields compression mode to use for newly flushed/merged segments.
+ * @throws NullPointerException if {@code mode} is {@code null}
+ */
+ public Lucene910Codec(Lucene910Codec.Mode mode) {
+ super("Lucene910");
+ this.storedFieldsFormat =
+ new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode);
+ this.defaultPostingsFormat = new Lucene99PostingsFormat(); // the defaults from here on do not depend on mode
+ this.defaultDVFormat = new Lucene90DocValuesFormat();
+ this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat();
+ }
+
+ @Override
+ public final StoredFieldsFormat storedFieldsFormat() {
+ return storedFieldsFormat; // Lucene90StoredFieldsFormat configured from the constructor's Mode
+ }
+
+ @Override
+ public final TermVectorsFormat termVectorsFormat() {
+ return vectorsFormat; // the Lucene90TermVectorsFormat held by this codec
+ }
+
+ @Override
+ public final PostingsFormat postingsFormat() {
+ return postingsFormat; // per-field wrapper; the actual format comes from getPostingsFormatForField
+ }
+
+ @Override
+ public final FieldInfosFormat fieldInfosFormat() {
+ return fieldInfosFormat; // the Lucene94FieldInfosFormat held by this codec
+ }
+
+ @Override
+ public final SegmentInfoFormat segmentInfoFormat() {
+ return segmentInfosFormat; // the Lucene910SegmentInfoFormat held by this codec
+ }
+
+ @Override
+ public final LiveDocsFormat liveDocsFormat() {
+ return liveDocsFormat; // the Lucene90LiveDocsFormat held by this codec
+ }
+
+ @Override
+ public final CompoundFormat compoundFormat() {
+ return compoundFormat; // the Lucene90CompoundFormat held by this codec
+ }
+
+ @Override
+ public final PointsFormat pointsFormat() {
+ return new Lucene90PointsFormat(); // allocates a new instance on every call instead of returning a cached field
+ }
+
+ @Override
+ public final KnnVectorsFormat knnVectorsFormat() {
+ return knnVectorsFormat; // per-field wrapper; the actual format comes from getKnnVectorsFormatForField
+ }
+
+ @Override
+ public DataCubesFormat dataCubesFormat() {
+ return DataCubesFormat.EMPTY; // TODO: presumably a placeholder until a concrete data cubes format exists; NOTE(review): unlike the other format getters this one is not final - confirm that is intended
+ }
+
+ /**
+ * Returns the postings format that should be used for writing new segments of <code>field
+ * </code>.
+ *
+ * <p>The default implementation always returns "Lucene99", i.e. the {@link
+ * Lucene99PostingsFormat} instance created in the constructor.
+ *
+ * <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
+ * future versions of Lucene are only guaranteed to be able to read the default implementation.
+ *
+ * @param field name of the field being written
+ * @return the postings format to use for {@code field}
+ */
+ public PostingsFormat getPostingsFormatForField(String field) {
+ return defaultPostingsFormat;
+ }
+
+ /**
+ * Returns the docvalues format that should be used for writing new segments of <code>field
+ * </code>.
+ *
+ * <p>The default implementation always returns the {@link Lucene90DocValuesFormat} instance
+ * created in the constructor.
+ *
+ * <p><b>WARNING:</b> if you subclass, you are responsible for index backwards compatibility:
+ * future versions of Lucene are only guaranteed to be able to read the default implementation.
+ *
+ * @param field name of the field being written
+ * @return the doc values format to use for {@code field}
+ */
+ public DocValuesFormat getDocValuesFormatForField(String field) {
+ return defaultDVFormat;
+ }
+
+ /**
+ * Returns the vectors format that should be used for writing new segments of field
+ *
+ *
The default implementation always returns the {@link Lucene99HnswVectorsFormat} created in the constructor. + * + *
WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation. + */ + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return defaultKnnVectorsFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + @Override + public final NormsFormat normsFormat() { + return normsFormat; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910SegmentInfoFormat.java new file mode 100644 index 000000000000..792159950f95 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910SegmentInfoFormat.java @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene910; + +import java.io.IOException; +import java.util.Map; +import java.util.Set; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DataCubeField; +import org.apache.lucene.index.DataCubeFieldProvider; +import org.apache.lucene.index.DataCubesConfig; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexSorter; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentInfos; +import org.apache.lucene.index.SortFieldProvider; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.Version; + +/** + * TODO: make this default latest segment info format and move lucene99 segmentInfo format to + * backward codec Lucene 9.10 Segment info format. + * + *
Files: + * + *
.si
: Header, SegVersion, SegSize, IsCompoundFile, Diagnostics, Files,
+ * Attributes, IndexSort, Footer
+ * This document defines the index file formats used in this version of Lucene. If you are using
+ * a different version of Lucene, please consult the copy of docs/
that was distributed
+ * with the version you are using.
+ *
+ *
This document attempts to provide a high-level definition of the Apache Lucene file formats. + *
The fundamental concepts in Lucene are index, document, field and term. + * + *
An index contains a sequence of documents. + * + *
The same sequence of bytes in two different fields is considered a different term. Thus terms + * are represented as a pair: the string naming the field, and the bytes within the field. + * + *
Lucene's index stores terms and statistics about those terms in order to make term-based + * search more efficient. Lucene's terms index falls into the family of indexes known as an + * inverted index. This is because it can list, for a term, the documents that contain it. + * This is the inverse of the natural relationship, in which documents list terms. + * + *
In Lucene, fields may be stored, in which case their text is stored in the index + * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field + * may be both stored and indexed. + * + *
The text of a field may be tokenized into terms to be indexed, or the text of a field + * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is + * useful for certain identifier fields to be indexed literally. + * + *
See the {@link org.apache.lucene.document.Field Field} java docs for more information on + * Fields. + * + *
Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a + * fully independent index, which could be searched separately. Indexes evolve by: + * + *
Searches may involve multiple segments and/or multiple indexes, each index potentially + * composed of a set of segments. + * + *
Internally, Lucene refers to documents by an integer document number. The first + * document added to an index is numbered zero, and each subsequent document added gets a number one + * greater than the previous. + * + *
Note that a document's number may change, so caution should be taken when storing these + * numbers outside of Lucene. In particular, numbers may change in the following situations: + * + *
The numbers stored in each segment are unique only within the segment, and must be + * converted before they can be used in a larger context. The standard technique is to + * allocate each segment a range of values, based on the range of numbers used in that + * segment. To convert a document number from a segment to an external value, the segment's + * base document number is added. To convert an external value back to a + * segment-specific value, the segment is identified by the range that the external value is + * in, and the segment's base value is subtracted. For example two five document segments + * might be combined, so that the first segment has a base value of zero, and the second of + * five. Document three from the second segment would have an external value of eight. + *
When documents are deleted, gaps are created in the numbering. These are eventually + * removed as the index evolves through merging. Deleted documents are dropped when segments + * are merged. A freshly-merged segment thus has no gaps in its numbering. + *
Each segment index maintains the following: + * + *
Details on each of these are provided in their linked pages.
All files belonging to a segment have the same name with varying extensions. The extensions + * correspond to the different file formats described below. When using the Compound File format + * (default for small segments) these files (except for the Segment info file, the Lock file, and + * Deleted documents file) are collapsed into a single .cfs file (see below for details) + * + *
Typically, all segments in an index are stored in a single directory, although this is not + * required. + * + *
File names are never re-used. That is, when any file is saved to the Directory it is given a + * never before used filename. This is achieved using a simple generations approach. For example, + * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long + * integer represented in alpha-numeric (base 36) form.
The following table summarizes the names and extensions of the files in Lucene: + * + *
Name | + *Extension | + *Brief Description | + *
---|---|---|
{@link org.apache.lucene.index.SegmentInfos Segments File} | + *segments_N | + *Stores information about a commit point | + *
Lock File | + *write.lock | + *The Write lock prevents multiple IndexWriters from writing to the same + * file. | + *
{@link org.apache.lucene.codecs.lucene910.Lucene910SegmentInfoFormat Segment Info} | + *.si | + *Stores metadata about a segment | + *
{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File} | + *.cfs, .cfe | + *An optional "virtual" file consisting of all the other index files for + * systems that frequently run out of file handles. | + *
{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields} | + *.fnm | + *Stores information about the fields | + *
{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index} | + *.fdx | + *Contains pointers to field data | + *
{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data} | + *.fdt | + *The stored fields for documents | + *
{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Term Dictionary} | + *.tim | + *The term dictionary, stores term info | + *
{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Term Index} | + *.tip | + *The index into the Term Dictionary | + *
{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Frequencies} | + *.doc | + *Contains the list of docs which contain each term along with frequency | + *
{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Positions} | + *.pos | + *Stores position information about where a term occurs in the index | + *
{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Payloads} | + *.pay | + *Stores additional per-position metadata information such as character offsets and user payloads | + *
{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms} | + *.nvd, .nvm | + *Encodes length and boost factors for docs and fields | + *
{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values} | + *.dvd, .dvm | + *Encodes additional scoring factors or other per-document information. | + *
{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index} | + *.tvx | + *Stores offset into the document data file | + *
{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data} | + *.tvd | + *Contains term vector data. | + *
{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents} | + *.liv | + *Info about what documents are live | + *
{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values} | + *.dii, .dim | + *Holds indexed points | + *
{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values} | + *.vec, .vem, .veq, .vex | + *Holds indexed vectors; .vec files contain the raw vector data,
+ * .vem the vector metadata, .veq the quantized vector data, and .vex the
+ * hnsw graph data. |
+ *
Compatibility notes are provided in this document, describing how file formats have changed + * from prior versions: + * + *
Lucene uses a Java int
to refer to document numbers, and the index file format
+ * uses an Int32
on-disk to store document numbers. This is a limitation of both the
+ * index file format and the current implementation. Eventually these should be replaced with either
+ * UInt64
values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt
+ * VInt} values which have no limit.