From 4a7225d6fb7ba747d02a6286e482c17fad2b2ae9 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Sat, 9 Dec 2023 01:07:35 +0000 Subject: [PATCH 1/2] Output well-formed UTF-8 bytes in SimpleTextCodec's segmentinfos The SimpleTextSegmentInfoFormat was writing the random byte array used as a segment's ID directly -- not converting to a simple text representation of the byte array. As a result, the segment infos were often malformed. --- .../SimpleTextSegmentInfoFormat.java | 5 +-- .../TestSimpleTextSegmentInfoFormat.java | 45 +++++++++++++++++++ 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java index accdb184df80..5480c0fec0c0 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java @@ -36,7 +36,6 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.util.ArrayUtil; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefBuilder; import org.apache.lucene.util.CollectionUtil; @@ -164,7 +163,7 @@ public SegmentInfo read( SimpleTextUtil.readLine(input, scratch); assert StringHelper.startsWith(scratch.get(), SI_ID); - final byte[] id = ArrayUtil.copyOfSubArray(scratch.bytes(), SI_ID.length, scratch.length()); + final byte[] id = SimpleTextUtil.fromBytesRefString(readString(SI_ID.length, scratch)).bytes; if (!Arrays.equals(segmentID, id)) { throw new CorruptIndexException( @@ -307,7 +306,7 @@ public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOE } SimpleTextUtil.write(output, SI_ID); - SimpleTextUtil.write(output, new BytesRef(si.getId())); + SimpleTextUtil.write(output, new 
BytesRef(si.getId()).toString(), scratch); SimpleTextUtil.writeNewline(output); Sort indexSort = si.getIndexSort(); diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java index 610f5a2d7564..e68a22016271 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java @@ -16,8 +16,18 @@ */ package org.apache.lucene.codecs.simpletext; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.Collections; import org.apache.lucene.codecs.Codec; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase; +import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.Version; /** Tests SimpleTextSegmentInfoFormat */ @@ -33,4 +43,39 @@ protected Version[] getVersions() { protected Codec getCodec() { return codec; } + + public void testFileIsUTF8() throws IOException { + Directory dir = newDirectory(); + Codec codec = getCodec(); + byte[] id = StringHelper.randomId(); + SegmentInfo info = + new SegmentInfo( + dir, + getVersions()[0], + getVersions()[0], + "_123", + 1, + false, + false, + codec, + Collections.emptyMap(), + id, + Collections.emptyMap(), + null); + info.setFiles(Collections.emptySet()); + codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT); + String segFileName = + IndexFileNames.segmentFileName("_123", "", SimpleTextSegmentInfoFormat.SI_EXTENSION); + try (ChecksumIndexInput input = dir.openChecksumInput(segFileName)) { + long length = 
input.length(); + if (length > 5_000) { + // Avoid allocating a huge array if the length is wrong + fail("SegmentInfos should not be this large"); + } + byte[] bytes = new byte[(int) length]; + input.readBytes(bytes, 0, bytes.length); + StandardCharsets.UTF_8.newDecoder().decode(ByteBuffer.wrap(bytes)); + } + dir.close(); + } } From 59c3b8c6ec01900065fc2a3d5502d8baee673174 Mon Sep 17 00:00:00 2001 From: Michael Froh Date: Fri, 29 Dec 2023 17:44:38 +0000 Subject: [PATCH 2/2] Use Term.toString in test --- .../simpletext/TestSimpleTextSegmentInfoFormat.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java index e68a22016271..ee55977a8969 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java @@ -17,16 +17,16 @@ package org.apache.lucene.codecs.simpletext; import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; import java.util.Collections; import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfo; +import org.apache.lucene.index.Term; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase; +import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.StringHelper; import org.apache.lucene.util.Version; @@ -73,8 +73,9 @@ public void testFileIsUTF8() throws IOException { fail("SegmentInfos should not be this large"); } byte[] bytes = new byte[(int) length]; - input.readBytes(bytes, 0, bytes.length); - 
StandardCharsets.UTF_8.newDecoder().decode(ByteBuffer.wrap(bytes)); + BytesRef bytesRef = new BytesRef(bytes); + // If the two strings below are equal, it means the bytes were not well-formed UTF-8 (presumably Term.toString falls back to the BytesRef form when decoding fails; verify). NOTE(review): this hunk also removes the input.readBytes(bytes, 0, bytes.length) call and nothing else visible here fills the array, so bytes looks like it is still all zeros at this point and the assertion may pass regardless of what was written to the file; confirm and restore the read if so. + assertNotEquals(bytesRef.toString(), Term.toString(bytesRef)); } dir.close(); }