From 464f772be4fd3f82b6926d6522d6a41863fe0cb5 Mon Sep 17 00:00:00 2001 From: Bharathwaj G Date: Fri, 3 May 2024 16:27:19 +0530 Subject: [PATCH] Incrementing codec and segmentinfo, datacubefield changes Signed-off-by: Bharathwaj G --- lucene/core/src/java/module-info.java | 1 + .../codecs/lucene910/Lucene910Codec.java | 222 +++++++++ .../lucene910/Lucene910SegmentInfoFormat.java | 279 ++++++++++++ .../lucene/codecs/lucene910/package-info.java | 430 ++++++++++++++++++ .../lucene/codecs/lucene99/Lucene99Codec.java | 2 +- .../apache/lucene/index/DataCubeField.java | 53 ++- 6 files changed, 972 insertions(+), 15 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910Codec.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910SegmentInfoFormat.java create mode 100644 lucene/core/src/java/org/apache/lucene/codecs/lucene910/package-info.java diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index f7d5356e8438..315a8311e674 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -33,6 +33,7 @@ exports org.apache.lucene.codecs.lucene94; exports org.apache.lucene.codecs.lucene95; exports org.apache.lucene.codecs.lucene99; + exports org.apache.lucene.codecs.lucene910; exports org.apache.lucene.codecs.lucene90.blocktree; exports org.apache.lucene.codecs.lucene90.compressing; exports org.apache.lucene.codecs.perfield; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910Codec.java new file mode 100644 index 000000000000..87c974b43d26 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910Codec.java @@ -0,0 +1,222 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene910; + +import java.util.Objects; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CompoundFormat; +import org.apache.lucene.codecs.DataCubesFormat; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PointsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; +import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; +import 
org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +/** + * TODO : need to make this default codec and move Lucene99 to backwards codec Implements the Lucene + * 9.10 index format + * + *

If you want to reuse functionality of this codec in another codec, extend {@link + * org.apache.lucene.codecs.FilterCodec}. + * + * @see org.apache.lucene.codecs.lucene99 package documentation for file format details. + * @lucene.experimental + */ +public class Lucene910Codec extends Codec { + /** Configuration option for the codec */ + public enum Mode { + /** Trade compression ratio for retrieval speed. */ + BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED), + /** Trade retrieval speed for compression ratio. */ + BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION); + + private final Lucene90StoredFieldsFormat.Mode storedMode; + + private Mode(Lucene90StoredFieldsFormat.Mode storedMode) { + this.storedMode = Objects.requireNonNull(storedMode); + } + } + + private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat(); + private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat(); + private final SegmentInfoFormat segmentInfosFormat = new Lucene910SegmentInfoFormat(); + private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat(); + private final CompoundFormat compoundFormat = new Lucene90CompoundFormat(); + private final NormsFormat normsFormat = new Lucene90NormsFormat(); + private final PostingsFormat defaultPostingsFormat; + private final PostingsFormat postingsFormat = + new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Lucene910Codec.this.getPostingsFormatForField(field); + } + }; + + private final DocValuesFormat defaultDVFormat; + private final DocValuesFormat docValuesFormat = + new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Lucene910Codec.this.getDocValuesFormatForField(field); + } + }; + + private final KnnVectorsFormat defaultKnnVectorsFormat; + private final KnnVectorsFormat knnVectorsFormat = + new PerFieldKnnVectorsFormat() { + @Override + 
public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return Lucene910Codec.this.getKnnVectorsFormatForField(field); + } + }; + + private final StoredFieldsFormat storedFieldsFormat; + + /** Instantiates a new codec. */ + public Lucene910Codec() { + this(Lucene910Codec.Mode.BEST_SPEED); + } + + /** + * Instantiates a new codec, specifying the stored fields compression mode to use. + * + * @param mode stored fields compression mode to use for newly flushed/merged segments. + */ + public Lucene910Codec(Lucene910Codec.Mode mode) { + super("Lucene910"); + this.storedFieldsFormat = + new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode); + this.defaultPostingsFormat = new Lucene99PostingsFormat(); + this.defaultDVFormat = new Lucene90DocValuesFormat(); + this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat(); + } + + @Override + public final StoredFieldsFormat storedFieldsFormat() { + return storedFieldsFormat; + } + + @Override + public final TermVectorsFormat termVectorsFormat() { + return vectorsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final FieldInfosFormat fieldInfosFormat() { + return fieldInfosFormat; + } + + @Override + public final SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } + + @Override + public final LiveDocsFormat liveDocsFormat() { + return liveDocsFormat; + } + + @Override + public final CompoundFormat compoundFormat() { + return compoundFormat; + } + + @Override + public final PointsFormat pointsFormat() { + return new Lucene90PointsFormat(); + } + + @Override + public final KnnVectorsFormat knnVectorsFormat() { + return knnVectorsFormat; + } + + @Override + public DataCubesFormat dataCubesFormat() { + return DataCubesFormat.EMPTY; // TODO + } + + /** + * Returns the postings format that should be used for writing new segments of field. + * + *

The default implementation always returns "Lucene99". + * + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultPostingsFormat; + } + + /** + * Returns the docvalues format that should be used for writing new segments of field + * . + * + *

The default implementation always returns "Lucene90". + * + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation. + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + /** + * Returns the vectors format that should be used for writing new segments of field + * + *

The default implementation always returns "Lucene99HnswVectorsFormat". + * + *

WARNING: if you subclass, you are responsible for index backwards compatibility: + * future version of Lucene are only guaranteed to be able to read the default implementation. + */ + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return defaultKnnVectorsFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + @Override + public final NormsFormat normsFormat() { + return normsFormat; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910SegmentInfoFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910SegmentInfoFormat.java new file mode 100644 index 000000000000..792159950f95 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene910/Lucene910SegmentInfoFormat.java @@ -0,0 +1,279 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene910; + +import java.io.IOException; +import java.util.Map; +import java.util.Set; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.DataCubeField; +import org.apache.lucene.index.DataCubeFieldProvider; +import org.apache.lucene.index.DataCubesConfig; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexSorter; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.SegmentInfos; +import org.apache.lucene.index.SortFieldProvider; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.Version; + +/** + * TODO: make this default latest segment info format and move lucene99 segmentInfo format to + * backward codec Lucene 9.10 Segment info format. + * + *

Files: + * + *

+ * + * Data types: + * + * + * + * Field Descriptions: + * + * + * + * @see SegmentInfos + * @lucene.experimental + */ +public class Lucene910SegmentInfoFormat extends SegmentInfoFormat { + + /** File extension used to store {@link SegmentInfo}. */ + public static final String SI_EXTENSION = "si"; + + static final String CODEC_NAME = "Lucene90SegmentInfo"; + static final int VERSION_START = 0; + static final int VERSION_CURRENT = VERSION_START; + + /** Sole constructor. */ + public Lucene910SegmentInfoFormat() {} + + @Override + public SegmentInfo read(Directory dir, String segment, byte[] segmentID, IOContext context) + throws IOException { + final String fileName = IndexFileNames.segmentFileName(segment, "", SI_EXTENSION); + try (ChecksumIndexInput input = dir.openChecksumInput(fileName)) { + Throwable priorE = null; + SegmentInfo si = null; + try { + CodecUtil.checkIndexHeader( + input, CODEC_NAME, VERSION_START, VERSION_CURRENT, segmentID, ""); + si = parseSegmentInfo(dir, input, segment, segmentID); + } catch (Throwable exception) { + priorE = exception; + } finally { + CodecUtil.checkFooter(input, priorE); + } + return si; + } + } + + private SegmentInfo parseSegmentInfo( + Directory dir, DataInput input, String segment, byte[] segmentID) throws IOException { + final Version version = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); + byte hasMinVersion = input.readByte(); + final Version minVersion; + switch (hasMinVersion) { + case 0: + minVersion = null; + break; + case 1: + minVersion = Version.fromBits(input.readInt(), input.readInt(), input.readInt()); + break; + default: + throw new CorruptIndexException("Illegal boolean value " + hasMinVersion, input); + } + + final int docCount = input.readInt(); + if (docCount < 0) { + throw new CorruptIndexException("invalid docCount: " + docCount, input); + } + final boolean isCompoundFile = input.readByte() == SegmentInfo.YES; + final boolean hasBlocks = input.readByte() == SegmentInfo.YES; + 
+ final Map diagnostics = input.readMapOfStrings(); + final Set files = input.readSetOfStrings(); + final Map attributes = input.readMapOfStrings(); + + int numSortFields = input.readVInt(); + Sort indexSort; + if (numSortFields > 0) { + SortField[] sortFields = new SortField[numSortFields]; + for (int i = 0; i < numSortFields; i++) { + String name = input.readString(); + sortFields[i] = SortFieldProvider.forName(name).readSortField(input); + } + indexSort = new Sort(sortFields); + } else if (numSortFields < 0) { + throw new CorruptIndexException("invalid index sort field count: " + numSortFields, input); + } else { + indexSort = null; + } + + DataCubesConfig dataCubesConfig; + int numDataCubeFields = input.readVInt(); + if (numDataCubeFields > 0) { + DataCubeField[] compositeIndexFields = new DataCubeField[numDataCubeFields]; + for (int i = 0; i < numDataCubeFields; i++) { + String name = input.readString(); + compositeIndexFields[i] = DataCubeFieldProvider.forName(name).readDataCubeField(input); + } + dataCubesConfig = new DataCubesConfig(compositeIndexFields); + } else if (numDataCubeFields < 0) { + throw new CorruptIndexException("invalid datacube field count: " + numDataCubeFields, input); + } else { + dataCubesConfig = null; + } + + SegmentInfo si = + new SegmentInfo( + dir, + version, + minVersion, + segment, + docCount, + isCompoundFile, + hasBlocks, + null, + diagnostics, + segmentID, + attributes, + indexSort, + dataCubesConfig); + si.setFiles(files); + return si; + } + + @Override + public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOException { + final String fileName = IndexFileNames.segmentFileName(si.name, "", SI_EXTENSION); + + try (IndexOutput output = dir.createOutput(fileName, ioContext)) { + // Only add the file once we've successfully created it, else IFD assert can trip: + si.addFile(fileName); + CodecUtil.writeIndexHeader(output, CODEC_NAME, VERSION_CURRENT, si.getId(), ""); + + writeSegmentInfo(output, si); + + 
CodecUtil.writeFooter(output); + } + } + + private void writeSegmentInfo(DataOutput output, SegmentInfo si) throws IOException { + Version version = si.getVersion(); + if (version.major < 7) { + throw new IllegalArgumentException( + "invalid major version: should be >= 7 but got: " + version.major + " segment=" + si); + } + // Write the Lucene version that created this segment, since 3.1 + output.writeInt(version.major); + output.writeInt(version.minor); + output.writeInt(version.bugfix); + + // Write the min Lucene version that contributed docs to the segment, since 7.0 + if (si.getMinVersion() != null) { + output.writeByte((byte) 1); + Version minVersion = si.getMinVersion(); + output.writeInt(minVersion.major); + output.writeInt(minVersion.minor); + output.writeInt(minVersion.bugfix); + } else { + output.writeByte((byte) 0); + } + + assert version.prerelease == 0; + output.writeInt(si.maxDoc()); + + output.writeByte((byte) (si.getUseCompoundFile() ? SegmentInfo.YES : SegmentInfo.NO)); + output.writeByte((byte) (si.getHasBlocks() ? SegmentInfo.YES : SegmentInfo.NO)); + output.writeMapOfStrings(si.getDiagnostics()); + Set files = si.files(); + for (String file : files) { + if (!IndexFileNames.parseSegmentName(file).equals(si.name)) { + throw new IllegalArgumentException( + "invalid files: expected segment=" + si.name + ", got=" + files); + } + } + output.writeSetOfStrings(files); + output.writeMapOfStrings(si.getAttributes()); + + Sort indexSort = si.getIndexSort(); + int numSortFields = indexSort == null ? 
0 : indexSort.getSort().length; + output.writeVInt(numSortFields); + for (int i = 0; i < numSortFields; ++i) { + SortField sortField = indexSort.getSort()[i]; + IndexSorter sorter = sortField.getIndexSorter(); + if (sorter == null) { + throw new IllegalArgumentException("cannot serialize SortField " + sortField); + } + output.writeString(sorter.getProviderName()); + SortFieldProvider.write(sortField, output); + } + + DataCubesConfig dataCubesConfig = si.getDataCubesConfig(); + int numCompositeFields = dataCubesConfig == null ? 0 : dataCubesConfig.getFields().length; + output.writeVInt(numCompositeFields); + for (int i = 0; i < numCompositeFields; ++i) { + DataCubeField dataCubeField = dataCubesConfig.getFields()[i]; + output.writeString(dataCubeField.getProviderName()); + DataCubeFieldProvider.write(dataCubeField, output); + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene910/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene910/package-info.java new file mode 100644 index 000000000000..62d3355bb23f --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene910/package-info.java @@ -0,0 +1,430 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +/** + * Lucene 9.10 file format. + * + *

Apache Lucene - Index File Formats

+ * + * + * + *

Introduction

+ * + *
+ * + *

This document defines the index file formats used in this version of Lucene. If you are using + * a different version of Lucene, please consult the copy of docs/ that was distributed + * with the version you are using. + * + *

This document attempts to provide a high-level definition of the Apache Lucene file formats. + *

+ * + *

Definitions

+ * + *
+ * + *

The fundamental concepts in Lucene are index, document, field and term. + * + *

An index contains a sequence of documents. + * + *

    + *
  • A document is a sequence of fields. + *
  • A field is a named sequence of terms. + *
  • A term is a sequence of bytes. + *
+ * + *

The same sequence of bytes in two different fields is considered a different term. Thus terms + * are represented as a pair: the string naming the field, and the bytes within the field. + * + *

Inverted Indexing

+ * + *

Lucene's index stores terms and statistics about those terms in order to make term-based + * search more efficient. Lucene's terms index falls into the family of indexes known as an + * inverted index. This is because it can list, for a term, the documents that contain it. + * This is the inverse of the natural relationship, in which documents list terms. + * + *

Types of Fields

+ * + *

In Lucene, fields may be stored, in which case their text is stored in the index + * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field + * may be both stored and indexed. + * + *

The text of a field may be tokenized into terms to be indexed, or the text of a field + * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is + * useful for certain identifier fields to be indexed literally. + * + *

See the {@link org.apache.lucene.document.Field Field} java docs for more information on + * Fields. + * + *

Segments

+ * + *

Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a + * fully independent index, which could be searched separately. Indexes evolve by: + * + *

    + *
  1. Creating new segments for newly added documents. + *
  2. Merging existing segments. + *
+ * + *

Searches may involve multiple segments and/or multiple indexes, each index potentially + * composed of a set of segments. + * + *

Document Numbers

+ * + *

Internally, Lucene refers to documents by an integer document number. The first + * document added to an index is numbered zero, and each subsequent document added gets a number one + * greater than the previous. + * + *

Note that a document's number may change, so caution should be taken when storing these + * numbers outside of Lucene. In particular, numbers may change in the following situations: + * + *

    + *
  • + *

    The numbers stored in each segment are unique only within the segment, and must be + * converted before they can be used in a larger context. The standard technique is to + * allocate each segment a range of values, based on the range of numbers used in that + * segment. To convert a document number from a segment to an external value, the segment's + * base document number is added. To convert an external value back to a + * segment-specific value, the segment is identified by the range that the external value is + * in, and the segment's base value is subtracted. For example two five document segments + * might be combined, so that the first segment has a base value of zero, and the second of + * five. Document three from the second segment would have an external value of eight. + *

  • + *

    When documents are deleted, gaps are created in the numbering. These are eventually + * removed as the index evolves through merging. Deleted documents are dropped when segments + * are merged. A freshly-merged segment thus has no gaps in its numbering. + *

+ * + *
+ * + *

Index Structure Overview

+ * + *
+ * + *

Each segment index maintains the following: + * + *

    + *
  • {@link org.apache.lucene.codecs.lucene910.Lucene910SegmentInfoFormat Segment info}. This + * contains metadata about a segment, such as the number of documents, what files it uses, and + * information about how the segment is sorted
  • {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This + * contains metadata about the set of named fields used in the index. + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}. + * This contains, for each document, a list of attribute-value pairs, where the attributes are + * field names. These are used to store auxiliary information about the document, such as its + * title, url, or an identifier to access a database. The set of stored fields are what is + * returned for each hit when searching. This is keyed by document number. + *
  • {@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Term dictionary}. A + * dictionary containing all of the terms used in all of the indexed fields of all of the + * documents. The dictionary also contains the number of documents which contain the term, and + * pointers to the term's frequency and proximity data. + *
  • {@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Term Frequency data}. For + * each term in the dictionary, the numbers of all the documents that contain that term, and + * the frequency of the term in that document, unless frequencies are omitted ({@link + * org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS}) + *
  • {@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Term Proximity data}. For + * each term in the dictionary, the positions that the term occurs in each document. Note that + * this will not exist if all fields in all documents omit position data. + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For + * each field in each document, a value is stored that is multiplied into the score for hits + * on that field. + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each + * field in each document, the term vector (sometimes called document vector) may be stored. A + * term vector consists of term text and term frequency. To add Term Vectors to your index see + * the {@link org.apache.lucene.document.Field Field} constructors + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like + * stored values, these are also keyed by document number, but are generally intended to be + * loaded into main memory for fast access. Whereas stored values are generally intended for + * summary results from searches, per-document values are useful for things like scoring + * factors. + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An + * optional file indicating which documents are live. + *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair + * of files, recording dimensionally indexed fields, to enable fast numeric range filtering + * and large numeric values like BigInteger and BigDecimal (1D) and geographic shape + * intersection (2D, 3D). + *
  • {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The + * vector format stores numeric vectors in a format optimized for random access and + * computation, supporting high-dimensional nearest-neighbor search. + *
+ * + *

Details on each of these are provided in their linked pages.

+ * + *

File Naming

+ * + *
+ * + *

All files belonging to a segment have the same name with varying extensions. The extensions + * correspond to the different file formats described below. When using the Compound File format + * (default for small segments) these files (except for the Segment info file, the Lock file, and + * Deleted documents file) are collapsed into a single .cfs file (see below for details) + * + *

Typically, all segments in an index are stored in a single directory, although this is not + * required. + * + *

File names are never re-used. That is, when any file is saved to the Directory it is given a + * never before used filename. This is achieved using a simple generations approach. For example, + * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long + * integer represented in alpha-numeric (base 36) form.

+ * + *

Summary of File Extensions

+ * + *
+ * + *

The following table summarizes the names and extensions of the files in Lucene: + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
lucene filenames by extension
NameExtensionBrief Description
{@link org.apache.lucene.index.SegmentInfos Segments File}segments_NStores information about a commit point
Lock Filewrite.lockThe Write lock prevents multiple IndexWriters from writing to the same + * file.
{@link org.apache.lucene.codecs.lucene910.Lucene910SegmentInfoFormat Segment Info}.siStores metadata about a segment
{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File}.cfs, .cfeAn optional "virtual" file consisting of all the other index files for + * systems that frequently run out of file handles.
{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields}.fnmStores information about the fields
{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index}.fdxContains pointers to field data
{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data}.fdtThe stored fields for documents
{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Term Dictionary}.timThe term dictionary, stores term info
{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Term Index}.tipThe index into the Term Dictionary
{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Frequencies}.docContains the list of docs which contain each term along with frequency
{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Positions}.posStores position information about where a term occurs in the index
{@link org.apache.lucene.codecs.lucene99.Lucene99PostingsFormat Payloads}.payStores additional per-position metadata information such as character offsets and user payloads
{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms}.nvd, .nvmEncodes length and boost factors for docs and fields
{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values}.dvd, .dvmEncodes additional scoring factors or other per-document information.
{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index}.tvxStores offset into the document data file
{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data}.tvdContains term vector data.
{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents}.livInfo about what documents are live
{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}.dii, .dimHolds indexed points
{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}.vec, .vem, .veq, vexHolds indexed vectors; .vec files contain the raw vector data, + * .vem the vector metadata, .veq the quantized vector data, and .vex the + * hnsw graph data.
+ * + *

+ * + *

Lock File

+ * + * The write lock, which is stored in the index directory by default, is named "write.lock". If the + * lock directory is different from the index directory then the write lock will be named + * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index + * directory. When this file is present, a writer is currently modifying the index (adding or + * removing documents). This lock file ensures that only one writer is modifying the index at a + * time. + * + *

History

+ * + *

Compatibility notes are provided in this document, describing how file formats have changed + * from prior versions: + * + *

    + *
  • In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit + * lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching + * or adding/deleting of docs. When the new segments file is saved (committed), it will be + * written in the new file format (meaning no specific "upgrade" process is needed). But note + * that once a commit has occurred, pre-2.1 Lucene will not be able to read the index. + *
  • In version 2.3, the file format was changed to allow segments to share a single set of doc + * store (vectors & stored fields) files. This allows for faster indexing in certain + * cases. The change is fully backwards compatible (in the same way as the lock-less commits + * change in 2.1). + *
  • In version 2.4, Strings are now written as true UTF-8 byte sequence, not Java's modified + * UTF-8. See LUCENE-510 for + * details. + *
  • In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to + * IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N + * file. See LUCENE-1382 for + * details. Also, diagnostics were added to each segment written recording details about why + * it was written (due to flush, merge; which OS/JRE was used; etc.). See issue LUCENE-1654 for details. + *
  • In version 3.0, compressed fields are no longer written to the index (they can still be + * read, but on merge the new segment will write them, uncompressed). See issue LUCENE-1960 for details. + *
  • In version 3.1, segments records the code version that created them. See LUCENE-2720 for details. + * Additionally segments track explicitly whether or not they have term vectors. See LUCENE-2811 for details. + *
  • In version 3.2, numeric fields are written natively to the stored fields file, previously + * they were stored in text format only. + *
  • In version 3.4, fields can omit position data while still indexing term frequencies. + *
  • In version 4.0, the format of the inverted index became extensible via the {@link + * org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues}) + * was introduced. Normalization factors need no longer be a single byte, they can be any + * {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be + * unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into + * the postings lists. Payloads can be stored in the term vectors. + *
  • In version 4.1, the format of the postings list changed to use either of FOR compression or + * variable-byte encoding, depending upon the frequency of the term. Terms appearing only once + * were changed to inline directly into the term dictionary. Stored fields are compressed by + * default. + *
  • In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued + * type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields. + *
  • In version 4.5, DocValues were extended to explicitly represent missing values. + *
  • In version 4.6, FieldInfos were extended to support per-field DocValues generation, to + * allow updating NumericDocValues fields. + *
  • In version 4.8, checksum footers were added to the end of each index file for improved data + * integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32 + * checksum of the file. + *
  • In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is + * suitable for faceting/sorting/analytics. + *
  • In version 5.4, DocValues have been improved to store more information on disk: addresses + * for binary fields and ord indexes for multi-valued fields. + *
  • In version 6.0, Points were added, for multi-dimensional range/distance search. + *
  • In version 6.2, new Segment info format that reads/writes the index sort, to support index + * sorting. + *
  • In version 7.0, DocValues have been improved to better support sparse doc values thanks to + * an iterator API. + *
  • In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term + * freq, normalization factor) pairs that may trigger the maximum score of the block. This + * information is recorded alongside skip data in order to be able to skip blocks of doc ids + * if they may not produce high enough scores. Additionally doc values and norms have been + * extended with jump-tables to make access O(1) instead of O(n), where n is the number of + * elements to skip when advancing in the data. + *
  • In version 8.4, postings, positions, offsets and payload lengths have moved to a more + * performant encoding that is vectorized. + *
  • In version 8.6, index sort serialization is delegated to the sorts themselves, to allow + * user-defined sorts to be used. + *
  • In version 8.7, stored fields compression became adaptive to better handle documents with + * smaller stored fields. + *
  • In version 9.0, vector-valued fields were added. + *
  • In version 9.1, vector-valued fields were modified to add a graph hierarchy. + *
  • In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by + * IndexDISI. ordToDoc mappings was added to .vem. + *
  • In version 9.5, HNSW graph connections were changed to be delta-encoded with vints. + * Additionally, metadata file size improvements were made by delta-encoding nodes by graph + * layer and not writing the node ids for the zeroth layer. + *
  • In version 9.9, Vector scalar quantization support was added. Allowing the HNSW vector + * format to utilize int8 quantized vectors for float32 vector search. + *
  • In version 9.10, new Segment info format that reads/writes the data cubes config, to + * support building data cube indices. TODO: further update this + *
+ * + * + * + *

Limitations

+ * + *
+ * + *

Lucene uses a Java int to refer to document numbers, and the index file format + * uses an Int32 on-disk to store document numbers. This is a limitation of both the + * index file format and the current implementation. Eventually these should be replaced with either + * UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt + * VInt} values which have no limit.

+ */ +package org.apache.lucene.codecs.lucene910; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99Codec.java index 10f1431352c1..daa0aef2dc6e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene99/Lucene99Codec.java @@ -152,7 +152,7 @@ public final KnnVectorsFormat knnVectorsFormat() { @Override public DataCubesFormat dataCubesFormat() { - return DataCubesFormat.EMPTY; // TODO + return DataCubesFormat.EMPTY; } /** diff --git a/lucene/core/src/java/org/apache/lucene/index/DataCubeField.java b/lucene/core/src/java/org/apache/lucene/index/DataCubeField.java index 29e9d06a15fb..436f01d14069 100644 --- a/lucene/core/src/java/org/apache/lucene/index/DataCubeField.java +++ b/lucene/core/src/java/org/apache/lucene/index/DataCubeField.java @@ -17,6 +17,8 @@ package org.apache.lucene.index; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; import java.util.Set; import org.apache.lucene.store.DataInput; import org.apache.lucene.store.DataOutput; @@ -28,24 +30,24 @@ public class DataCubeField { private final String name; - private final Set dims; - private final Set metrics; + private final Set dimensionFields; + private final List metricFields; /** Sole constructor */ - public DataCubeField(String name, Set dims, Set metrics) { + public DataCubeField(String name, Set dimensionFields, List metricFields) { this.name = name; - this.dims = dims; - this.metrics = metrics; + this.dimensionFields = dimensionFields; + this.metricFields = metricFields; } /** Returns set of dimensions associated with this DataCubeField */ - public Set getDims() { - return dims; + public Set getDimensionFields() { + return dimensionFields; } /** Returns set of metrics associated with this DataCubeField */ - public Set getMetrics() { - return metrics; + public List 
getMetricFields() { + return metricFields; } /** Returns name of this DataCubeField */ @@ -58,6 +60,20 @@ public String getProviderName() { return Provider.NAME; } + /** Metric field which encapsulates name and associated aggregation function */ + public static class MetricField { + private final String name; + // TODO : if we make this enum, how to enable it for extensions + // in custom formats - so keeping it as string for now + private final String metricFunction; + + /** sole constructor */ + public MetricField(String name, String metricFunction) { + this.name = name; + this.metricFunction = metricFunction; + } + } + /** Provider for DataCubeField */ public static final class Provider extends DataCubeFieldProvider { @@ -72,9 +88,14 @@ public Provider() { /** Reads DataCubeField from DataInput */ @Override public DataCubeField readDataCubeField(DataInput in) throws IOException { - DataCubeField cf = - new DataCubeField(in.readString(), in.readSetOfStrings(), in.readSetOfStrings()); - return cf; + String name = in.readString(); + Set dimensionFields = in.readSetOfStrings(); + List metricFields = new ArrayList<>(); + int metricFieldsCount = in.readVInt(); + for (int i = 0; i < metricFieldsCount; i++) { + metricFields.add(new MetricField(in.readString(), in.readString())); + } + return new DataCubeField(name, dimensionFields, metricFields); } /** Writes DataCubeField to DataOutput */ @@ -87,7 +108,11 @@ public void writeDataCubeField(DataCubeField cf, DataOutput out) throws IOExcept /** Serializes DataCubeField to DataOutput */ private void serialize(DataOutput out) throws IOException { out.writeString(name); - out.writeSetOfStrings(dims); - out.writeSetOfStrings(metrics); + out.writeSetOfStrings(dimensionFields); + out.writeVInt(metricFields.size()); + for (MetricField metricfield : metricFields) { + out.writeString(metricfield.name); + out.writeString(metricfield.metricFunction); + } } }