Skip to content

Commit

Permalink
LUCENE-9322: Add Lucene90 codec, including VectorFormat
Browse files Browse the repository at this point in the history
This commit adds support for dense floating point VectorFields.
The new VectorValues class provides access to the indexed vectors.
  • Loading branch information
Michael Sokolov committed Oct 18, 2020
1 parent 85b58c2 commit c02f07f
Show file tree
Hide file tree
Showing 81 changed files with 4,567 additions and 918 deletions.
3 changes: 3 additions & 0 deletions .dir-locals.el
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
;; set up Lucene style for emacs
((java-mode . ((c-basic-offset . 2))))

3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@ build
dist
lib
test-lib
/*~
*~
.#*
/build.properties
/.idea
lucene/**/*.iml
Expand Down
3 changes: 2 additions & 1 deletion gradle/documentation/render-javadoc.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,8 @@ configure(project(":lucene:backward-codecs")) {
"org.apache.lucene.codecs.lucene60",
"org.apache.lucene.codecs.lucene80",
"org.apache.lucene.codecs.lucene84",
"org.apache.lucene.codecs.lucene86"
"org.apache.lucene.codecs.lucene86",
"org.apache.lucene.codecs.lucene87"
]
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
Expand Down Expand Up @@ -128,4 +129,9 @@ public final DocValuesFormat docValuesFormat() {
public final NormsFormat normsFormat() {
return normsFormat;
}

/**
 * Returns the vector format exposed by this class, which is always the shared
 * {@code VectorFormat.EMPTY} instance.
 *
 * <p>NOTE(review): presumably {@code EMPTY} stands for "no vector data" because this
 * format version predates vector support -- confirm against VectorFormat.
 */
@Override
public final VectorFormat vectorFormat() {
return VectorFormat.EMPTY;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
Expand Down Expand Up @@ -136,6 +137,11 @@ public PointsFormat pointsFormat() {
return new Lucene60PointsFormat();
}

/**
 * Returns the vector format exposed by this class, which is always the shared
 * {@code VectorFormat.EMPTY} instance.
 *
 * <p>NOTE(review): presumably {@code EMPTY} stands for "no vector data" because this
 * format version predates vector support -- confirm against VectorFormat.
 */
@Override
public VectorFormat vectorFormat() {
return VectorFormat.EMPTY;
}

/** Returns the postings format that should be used for writing
* new segments of <code>field</code>.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.FilterCodec;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
Expand Down Expand Up @@ -136,6 +137,11 @@ public final PointsFormat pointsFormat() {
return pointsFormat;
}

/**
 * Returns the vector format exposed by this class, which is always the shared
 * {@code VectorFormat.EMPTY} instance.
 *
 * <p>NOTE(review): presumably {@code EMPTY} stands for "no vector data" because this
 * format version predates vector support -- confirm against VectorFormat.
 */
@Override
public final VectorFormat vectorFormat() {
return VectorFormat.EMPTY;
}

/** Returns the postings format that should be used for writing
* new segments of <code>field</code>.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import org.apache.lucene.codecs.SegmentInfoFormat;
import org.apache.lucene.codecs.StoredFieldsFormat;
import org.apache.lucene.codecs.TermVectorsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.lucene50.Lucene50CompoundFormat;
import org.apache.lucene.codecs.lucene50.Lucene50LiveDocsFormat;
import org.apache.lucene.codecs.lucene50.Lucene50TermVectorsFormat;
Expand Down Expand Up @@ -137,6 +138,9 @@ public final PointsFormat pointsFormat() {
return pointsFormat;
}

/**
 * Supplies the vector format for this class: always the shared {@code VectorFormat.EMPTY}
 * instance.
 */
@Override
public final VectorFormat vectorFormat() {
  return VectorFormat.EMPTY;
}

/** Returns the postings format that should be used for writing
* new segments of <code>field</code>.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
</head>
<body>
Lucene 8.7 file format.
</body>
</html>
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,4 @@
org.apache.lucene.codecs.lucene80.Lucene80Codec
org.apache.lucene.codecs.lucene84.Lucene84Codec
org.apache.lucene.codecs.lucene86.Lucene86Codec
org.apache.lucene.codecs.lucene87.Lucene87Codec
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.lucene87.Lucene87Codec;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.index.IndexDeletionPolicy;
Expand Down Expand Up @@ -138,7 +138,7 @@ public static IndexWriterConfig createWriterConfig(Config config, PerfRunData ru
if (defaultCodec == null && postingsFormat != null) {
try {
final PostingsFormat postingsFormatChosen = PostingsFormat.forName(postingsFormat);
iwConf.setCodec(new Lucene87Codec() {
iwConf.setCodec(new Lucene90Codec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return postingsFormatChosen;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.apache.lucene.codecs.CompoundFormat;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.FieldInfosFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.LiveDocsFormat;
import org.apache.lucene.codecs.NormsFormat;
import org.apache.lucene.codecs.PointsFormat;
Expand All @@ -46,6 +47,7 @@ public final class SimpleTextCodec extends Codec {
private final DocValuesFormat dvFormat = new SimpleTextDocValuesFormat();
private final CompoundFormat compoundFormat = new SimpleTextCompoundFormat();
private final PointsFormat pointsFormat = new SimpleTextPointsFormat();
private final VectorFormat vectorFormat = new SimpleTextVectorFormat();

public SimpleTextCodec() {
super("SimpleText");
Expand Down Expand Up @@ -100,4 +102,9 @@ public CompoundFormat compoundFormat() {
public PointsFormat pointsFormat() {
return pointsFormat;
}

/**
 * Returns the vector format held in the {@code vectorFormat} field, which this class
 * initializes to a {@code SimpleTextVectorFormat}.
 */
@Override
public VectorFormat vectorFormat() {
return vectorFormat;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.VectorValues;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
Expand Down Expand Up @@ -67,6 +68,8 @@ public class SimpleTextFieldInfosFormat extends FieldInfosFormat {
static final BytesRef DATA_DIM_COUNT = new BytesRef(" data dimensional count ");
static final BytesRef INDEX_DIM_COUNT = new BytesRef(" index dimensional count ");
static final BytesRef DIM_NUM_BYTES = new BytesRef(" dimensional num bytes ");
// Line prefix for a field's vector dimension count; the value follows as a decimal integer string.
static final BytesRef VECTOR_NUM_DIMS = new BytesRef(" vector number of dimensions ");
// Line prefix for a field's vector score function; the value follows as the function's name().
static final BytesRef VECTOR_SCORE_FUNC = new BytesRef(" vector score function ");
static final BytesRef SOFT_DELETES = new BytesRef(" soft-deletes ");

@Override
Expand Down Expand Up @@ -146,13 +149,23 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm
assert StringHelper.startsWith(scratch.get(), DIM_NUM_BYTES);
int dimensionalNumBytes = Integer.parseInt(readString(DIM_NUM_BYTES.length, scratch));

SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), VECTOR_NUM_DIMS);
int vectorNumDimensions = Integer.parseInt(readString(VECTOR_NUM_DIMS.length, scratch));

SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), VECTOR_SCORE_FUNC);
String scoreFunction = readString(VECTOR_SCORE_FUNC.length, scratch);
VectorValues.ScoreFunction vectorDistFunc = distanceFunction(scoreFunction);

SimpleTextUtil.readLine(input, scratch);
assert StringHelper.startsWith(scratch.get(), SOFT_DELETES);
boolean isSoftDeletesField = Boolean.parseBoolean(readString(SOFT_DELETES.length, scratch));

infos[i] = new FieldInfo(name, fieldNumber, storeTermVector,
omitNorms, storePayloads, indexOptions, docValuesType, dvGen, Collections.unmodifiableMap(atts),
dimensionalCount, indexDimensionalCount, dimensionalNumBytes, isSoftDeletesField);
dimensionalCount, indexDimensionalCount, dimensionalNumBytes,
vectorNumDimensions, vectorDistFunc, isSoftDeletesField);
}

SimpleTextUtil.checkFooter(input);
Expand All @@ -172,6 +185,10 @@ public FieldInfos read(Directory directory, SegmentInfo segmentInfo, String segm
public DocValuesType docValuesType(String dvType) {
return DocValuesType.valueOf(dvType);
}

/**
 * Resolves the textual name of a vector score function to the matching
 * {@code VectorValues.ScoreFunction} value by delegating to its {@code valueOf(String)}.
 *
 * <p>NOTE(review): {@code ScoreFunction} is presumably an enum, in which case an unknown
 * name is rejected with {@code IllegalArgumentException} -- confirm against VectorValues.
 *
 * @param scoreFunction the name of the score function, as read from the field infos file
 * @return the score function with that name
 */
public VectorValues.ScoreFunction distanceFunction(String scoreFunction) {
  final VectorValues.ScoreFunction resolved = VectorValues.ScoreFunction.valueOf(scoreFunction);
  return resolved;
}

private String readString(int offset, BytesRefBuilder scratch) {
return new String(scratch.bytes(), offset, scratch.length()-offset, StandardCharsets.UTF_8);
Expand Down Expand Up @@ -253,6 +270,14 @@ public void write(Directory directory, SegmentInfo segmentInfo, String segmentSu
SimpleTextUtil.write(out, Integer.toString(fi.getPointNumBytes()), scratch);
SimpleTextUtil.writeNewline(out);

SimpleTextUtil.write(out, VECTOR_NUM_DIMS);
SimpleTextUtil.write(out, Integer.toString(fi.getVectorDimension()), scratch);
SimpleTextUtil.writeNewline(out);

SimpleTextUtil.write(out, VECTOR_SCORE_FUNC);
SimpleTextUtil.write(out, fi.getVectorScoreFunction().name(), scratch);
SimpleTextUtil.writeNewline(out);

SimpleTextUtil.write(out, SOFT_DELETES);
SimpleTextUtil.write(out, Boolean.toString(fi.isSoftDeletesField()), scratch);
SimpleTextUtil.writeNewline(out);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.simpletext;


import java.io.IOException;

import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.VectorReader;
import org.apache.lucene.codecs.VectorWriter;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;

/** For debugging, curiosity, transparency only!! Do not use this format in production.
 *
 * <p>This format stores vector data in a human-readable text file (_N.vec). You can view this in
 * any text editor, and even edit it to alter your index.
 *
 * <p>NOTE(review): a second extension, {@code gri}, is declared below as {@code META_EXTENSION};
 * presumably a companion file is written alongside the .vec file -- confirm against
 * SimpleTextVectorWriter.
 *
 * @lucene.experimental */
public final class SimpleTextVectorFormat extends VectorFormat {

/** Returns a new {@code SimpleTextVectorWriter} constructed from the given write state. */
@Override
public VectorWriter fieldsWriter(SegmentWriteState state) throws IOException {
return new SimpleTextVectorWriter(state);
}

/** Returns a new {@code SimpleTextVectorReader} constructed from the given read state. */
@Override
public VectorReader fieldsReader(SegmentReadState state) throws IOException {
return new SimpleTextVectorReader(state);
}

/** Extension of vectors data file */
static final String VECTOR_EXTENSION = "vec";

/** Extension of vectors index file */
static final String META_EXTENSION = "gri";
}
Loading

0 comments on commit c02f07f

Please sign in to comment.