From 4a7225d6fb7ba747d02a6286e482c17fad2b2ae9 Mon Sep 17 00:00:00 2001
From: Michael Froh <msfroh@gmail.com>
Date: Sat, 9 Dec 2023 01:07:35 +0000
Subject: [PATCH 1/2] Output well-formed UTF-8 bytes in SimpleTextCodec's
 segmentinfos

The SimpleTextSegmentInfoFormat was writing the random byte array used
as a segment's ID directly, without converting it to a simple text
representation of the byte array. As a result, the segment info files
often contained bytes that were not well-formed UTF-8.
---
 .../SimpleTextSegmentInfoFormat.java          |  5 +--
 .../TestSimpleTextSegmentInfoFormat.java      | 45 +++++++++++++++++++
 2 files changed, 47 insertions(+), 3 deletions(-)
diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
index accdb184df80..5480c0fec0c0 100644
--- a/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
+++ b/lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextSegmentInfoFormat.java
@@ -36,7 +36,6 @@
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.IndexOutput;
-import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefBuilder;
 import org.apache.lucene.util.CollectionUtil;
@@ -164,7 +163,7 @@ public SegmentInfo read(
 
       SimpleTextUtil.readLine(input, scratch);
       assert StringHelper.startsWith(scratch.get(), SI_ID);
-      final byte[] id = ArrayUtil.copyOfSubArray(scratch.bytes(), SI_ID.length, scratch.length());
+      final byte[] id = SimpleTextUtil.fromBytesRefString(readString(SI_ID.length, scratch)).bytes;
 
       if (!Arrays.equals(segmentID, id)) {
         throw new CorruptIndexException(
@@ -307,7 +306,7 @@ public void write(Directory dir, SegmentInfo si, IOContext ioContext) throws IOE
       }
 
       SimpleTextUtil.write(output, SI_ID);
-      SimpleTextUtil.write(output, new BytesRef(si.getId()));
+      SimpleTextUtil.write(output, new BytesRef(si.getId()).toString(), scratch);
       SimpleTextUtil.writeNewline(output);
 
       Sort indexSort = si.getIndexSort();
diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java
index 610f5a2d7564..e68a22016271 100644
--- a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java
@@ -16,8 +16,18 @@
  */
 package org.apache.lucene.codecs.simpletext;
 
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.charset.StandardCharsets;
+import java.util.Collections;
 import org.apache.lucene.codecs.Codec;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.store.ChecksumIndexInput;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
 import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase;
+import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.Version;
 
 /** Tests SimpleTextSegmentInfoFormat */
@@ -33,4 +43,39 @@ protected Version[] getVersions() {
   protected Codec getCodec() {
     return codec;
   }
+
+  public void testFileIsUTF8() throws IOException {
+    Directory dir = newDirectory();
+    Codec codec = getCodec();
+    byte[] id = StringHelper.randomId();
+    SegmentInfo info =
+        new SegmentInfo(
+            dir,
+            getVersions()[0],
+            getVersions()[0],
+            "_123",
+            1,
+            false,
+            false,
+            codec,
+            Collections.<String, String>emptyMap(),
+            id,
+            Collections.emptyMap(),
+            null);
+    info.setFiles(Collections.<String>emptySet());
+    codec.segmentInfoFormat().write(dir, info, IOContext.DEFAULT);
+    String segFileName =
+        IndexFileNames.segmentFileName("_123", "", SimpleTextSegmentInfoFormat.SI_EXTENSION);
+    try (ChecksumIndexInput input = dir.openChecksumInput(segFileName)) {
+      long length = input.length();
+      if (length > 5_000) {
+        // Avoid allocating a huge array if the length is wrong
+        fail("SegmentInfos should not be this large");
+      }
+      byte[] bytes = new byte[(int) length];
+      input.readBytes(bytes, 0, bytes.length);
+      StandardCharsets.UTF_8.newDecoder().decode(ByteBuffer.wrap(bytes));
+    }
+    dir.close();
+  }
 }

From 59c3b8c6ec01900065fc2a3d5502d8baee673174 Mon Sep 17 00:00:00 2001
From: Michael Froh <msfroh@gmail.com>
Date: Fri, 29 Dec 2023 17:44:38 +0000
Subject: [PATCH 2/2] Use Term.toString in test

---
 .../simpletext/TestSimpleTextSegmentInfoFormat.java      | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java
index e68a22016271..ee55977a8969 100644
--- a/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java
+++ b/lucene/codecs/src/test/org/apache/lucene/codecs/simpletext/TestSimpleTextSegmentInfoFormat.java
@@ -17,16 +17,16 @@
 package org.apache.lucene.codecs.simpletext;
 
 import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.nio.charset.StandardCharsets;
 import java.util.Collections;
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentInfo;
+import org.apache.lucene.index.Term;
 import org.apache.lucene.store.ChecksumIndexInput;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.tests.index.BaseSegmentInfoFormatTestCase;
+import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.StringHelper;
 import org.apache.lucene.util.Version;
 
@@ -73,8 +73,11 @@ public void testFileIsUTF8() throws IOException {
         fail("SegmentInfos should not be this large");
       }
       byte[] bytes = new byte[(int) length];
+      // Read the whole file; without this the assertion would only see a zero-filled array.
       input.readBytes(bytes, 0, bytes.length);
-      StandardCharsets.UTF_8.newDecoder().decode(ByteBuffer.wrap(bytes));
+      BytesRef bytesRef = new BytesRef(bytes);
+      // If the following are equal, it means the bytes were not well-formed UTF8.
+      assertNotEquals(bytesRef.toString(), Term.toString(bytesRef));
     }
     dir.close();
   }