Merge branch 'main' into fstpostingformat-off-heap

apache · Dec 28, 2023 · 78ff079
2 parents cc249e1 + 948970b
commit 78ff079
Show file tree

Hide file tree

Showing 102 changed files with 1,036 additions and 541 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -25,15 +25,21 @@ First of all, you need the Lucene source code.
 
 Get the source code using: `git clone https://github.com/apache/lucene`
 
-### Notes for required Java version
+Please note that it is important to preserve the files' original line breaks, because some files have their checksums verified during the build.
+If you are using Windows you might want to override the default Git configuration when cloning the repository:
+`git clone --config core.autocrlf=false https://github.com/apache/lucene`
+
+### Pre-requisites
 
 Be sure that you are using an appropriate version of the JDK. Please check [README](./README.md) for the required JDK version for current main branch.
 
+Some build tasks (in particular `./gradlew check`) require Perl and Python 3.
+
 ### Building with Gradle
 
 Lucene uses [Gradle](https://gradle.org/) for build control. Gradle is itself Java-based and may be incompatible with newer Java versions; you can still build and test Lucene with these Java releases, see [jvms.txt](./help/jvms.txt) for more information.
 
-NOTE: DO NOT use the `gradle` command that is perhaps installed on your machine. This may result in using a different gradle version than the project requires and this is known to lead to very cryptic errors. The "gradle wrapper" (gradlew script) does everything required to build the project from scratch: it downloads the correct version of gradle, sets up sane local configurations and is tested on multiple environments.
+NOTE: DO NOT use the `gradle` command that is perhaps installed on your machine. This may result in using a different gradle version than the project requires and this is known to lead to very cryptic errors. The "gradle wrapper" (`gradlew` script) does everything required to build the project from scratch: it downloads the correct version of gradle, sets up sane local configurations and is tested on multiple environments.
 
 The first time you run gradlew, it will create a file "gradle.properties" that contains machine-specific settings. Normally you can use this file as-is, but it can be modified if necessary.
 

diff --git a/dev-tools/README.txt b/dev-tools/README.txt
@@ -5,8 +5,8 @@ as to the usefulness of the tools.
 
 Description of dev-tools/ contents:
 
-./size-estimator-lucene-solr.xls -- Spreadsheet for estimating memory and disk usage in Lucene/Solr
+./aws-jmh/       -- Scripts for running microbenchmarks across different ec2 instance types.
+./doap/          -- Lucene project descriptors in DOAP RDF format.
 ./missing-doclet -- JavaDoc validation doclet subproject
-./doap/       -- Lucene and Solr project descriptors in DOAP RDF format.
-./scripts/    -- Odds and ends for building releases, etc.
-./test-patch/ -- Scripts for automatically validating patches
+./scripts/       -- Odds and ends for building releases, etc.
+./test-patch/    -- Scripts for automatically validating patches
diff --git a/dev-tools/doap/lucene.rdf b/dev-tools/doap/lucene.rdf
@@ -67,6 +67,13 @@
     </maintainer>
 
     <!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
+    <release>
+       <Version>
+         <name>lucene-9.9.1</name>
+         <created>2023-12-16</created>
+         <revision>9.9.1</revision>
+       </Version>
+    </release>
     <release>
        <Version>
          <name>lucene-9.9.0</name>

diff --git a/gradle/help.gradle b/gradle/help.gradle
@@ -46,7 +46,7 @@ configure(rootProject) {
   help {
     doLast {
       println ""
-      println "This is an experimental Lucene/Solr gradle build. See some"
+      println "This is Lucene's gradle build. See some"
       println "guidelines, ant-equivalent commands etc. under help/*; or type:"
       println ""
       helpFiles.each { section, path, sectionInfo ->

diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle
@@ -144,7 +144,6 @@ allprojects {
          "tests.leaveTemporary",
          "tests.leavetemporary",
          "tests.leavetmpdir",
-         "solr.test.leavetmpdir",
       ].find { prop ->
         def v = Boolean.parseBoolean(propertyOrDefault(prop, "false"))
         if (v) {

diff --git a/gradle/testing/randomization/policies/tests.policy b/gradle/testing/randomization/policies/tests.policy
@@ -60,9 +60,6 @@ grant {
   permission java.lang.RuntimePermission "getFileStoreAttributes";
   permission java.lang.RuntimePermission "writeFileDescriptor";
 
-  // needed to check if C2 (implied by the presence of the CI env) is enabled
-  permission java.lang.RuntimePermission "getenv.CI";
-
   // TestLockFactoriesMultiJVM opens a random port on 127.0.0.1 (port 0 = ephemeral port range):
   permission java.net.SocketPermission "127.0.0.1:0", "accept,listen,resolve";
 
@@ -80,11 +77,6 @@ grant {
   // used by nested tests? (e.g. TestLeaveFilesIfTestFails). TODO: look into this
   permission java.util.PropertyPermission "tests.runnested", "write";
 
-  // solr properties. TODO: move these out to SolrTestCase
-  permission java.util.PropertyPermission "solr.data.dir", "write";
-  permission java.util.PropertyPermission "solr.solr.home", "write";
-  permission java.util.PropertyPermission "solr.directoryFactory", "write";
-
   // allows LuceneTestCase#runWithRestrictedPermissions to execute with lower (or no) permission
   permission java.security.SecurityPermission "createAccessControlContext";
 

diff --git a/help/formatting.txt b/help/formatting.txt
@@ -16,5 +16,5 @@ IMPORTANT: There is *no* way to mark sections of the code as excluded
 from formatting. This is by design and cannot be altered. In vast
 majority of cases the formatter will do a great job of cleaning up the
 code. Occasionally you may want to rewrite the code (introduce a local
-variable orreshape code paths) so that it's easier to read after
+variable or reshape code paths) so that it's easier to read after
 automatic formatting.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -80,6 +80,9 @@ API Changes
 * GITHUB#12980: Make FSTPostingsFormat to build FST off-heap. This PostingsFormat will now
   create 2 FST files (tfp.meta and tfp.data) instead of a single one. (Anh Dung Bui)
 
+* GITHUB#12875: Ensure token position is always increased in PathHierarchyTokenizer and ReversePathHierarchyTokenizer
+  and resulting tokens do not overlap. (Michael Froh, Lukáš Vlček)
+
 New Features
 ---------------------
 
@@ -120,6 +123,8 @@ Optimizations
 
 * GITHUB#12552: Make FSTPostingsFormat load FSTs off-heap. (Tony X)
 
+* GITHUB#12841: Move group-varint encoding/decoding logic to DataOutput/DataInput. (Adrien Grand, Zhang Chao, Uwe Schindler)
+
 Bug Fixes
 ---------------------
 
@@ -191,7 +196,9 @@ Improvements
 
 Optimizations
 ---------------------
-(No changes)
+
+* GITHUB#12839: Introduce method to grow arrays up to a given upper limit and use it to reduce overallocation for
+  DirectoryTaxonomyReader#getBulkOrdinals. (Stefan Vodita)
 
 Bug Fixes
 ---------------------
@@ -216,6 +223,8 @@ Other
 
 * GITHUB#11023: Removing @lucene.experimental tags in testXXX methods in CheckIndex. (Jakub Slowinski)
 
+* GITHUB#12934: Cleaning up old references to Lucene/Solr. (Jakub Slowinski)
+
 ======================== Lucene 9.9.1 =======================
 
 Bug Fixes

diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md
@@ -134,6 +134,11 @@ It now declares that it may throw `IOException`. This was an oversight because
 compiled expressions call `DoubleValues#doubleValue` behind the scenes, which
 may throw `IOException` on index problems, bubbling up unexpectedly to the caller.
 
+### PathHierarchyTokenizer and ReversePathHierarchyTokenizer do not produce overlapping tokens
+
+`(Reverse)PathHierarchyTokenizer` now produces sequential (instead of overlapping) tokens with accurate
+offsets, making positional queries and highlighters possible for fields tokenized with this tokenizer.
+
 ## Migration from Lucene 9.0 to Lucene 9.1
 
 ### Test framework package migration and module (LUCENE-10301)

diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/path/PathHierarchyTokenizer.java
@@ -100,7 +100,8 @@ public PathHierarchyTokenizer(
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionIncrementAttribute posIncAtt =
+      addAttribute(PositionIncrementAttribute.class);
   private int startPosition = 0;
   private int skipped = 0;
   private boolean endDelimiter = false;
@@ -112,11 +113,7 @@ public PathHierarchyTokenizer(
   public final boolean incrementToken() throws IOException {
     clearAttributes();
     termAtt.append(resultToken);
-    if (resultToken.length() == 0) {
-      posAtt.setPositionIncrement(1);
-    } else {
-      posAtt.setPositionIncrement(0);
-    }
+    posIncAtt.setPositionIncrement(1);
     int length = 0;
     boolean added = false;
     if (endDelimiter) {

diff --git a/...alysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java b/...alysis/common/src/java/org/apache/lucene/analysis/path/ReversePathHierarchyTokenizer.java
@@ -112,7 +112,8 @@ public ReversePathHierarchyTokenizer(
 
   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final PositionIncrementAttribute posAtt = addAttribute(PositionIncrementAttribute.class);
+  private final PositionIncrementAttribute posIncAtt =
+      addAttribute(PositionIncrementAttribute.class);
 
   private int endPosition = 0;
   private int finalOffset = 0;
@@ -158,10 +159,8 @@ public final boolean incrementToken() throws IOException {
         endPosition = delimiterPositions.get(idx);
       }
       finalOffset = correctOffset(length);
-      posAtt.setPositionIncrement(1);
-    } else {
-      posAtt.setPositionIncrement(0);
     }
+    posIncAtt.setPositionIncrement(1);
 
     while (skipped < delimitersCount - skip - 1) {
       int start = delimiterPositions.get(skipped);

diff --git a/.../analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java b/.../analysis/common/src/test/org/apache/lucene/analysis/path/TestPathHierarchyTokenizer.java
@@ -19,6 +19,7 @@
 import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_DELIMITER;
 import static org.apache.lucene.analysis.path.PathHierarchyTokenizer.DEFAULT_SKIP;
 
+import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
 import java.util.Random;
@@ -41,7 +42,7 @@ public void testBasic() throws Exception {
         new String[] {"/a", "/a/b", "/a/b/c"},
         new int[] {0, 0, 0},
         new int[] {2, 4, 6},
-        new int[] {1, 0, 0},
+        new int[] {1, 1, 1},
         path.length());
   }
 
@@ -56,7 +57,7 @@ public void testEndOfDelimiter() throws Exception {
         new String[] {"/a", "/a/b", "/a/b/c", "/a/b/c/"},
         new int[] {0, 0, 0, 0},
         new int[] {2, 4, 6, 7},
-        new int[] {1, 0, 0, 0},
+        new int[] {1, 1, 1, 1},
         path.length());
   }
 
@@ -71,7 +72,7 @@ public void testStartOfChar() throws Exception {
         new String[] {"a", "a/b", "a/b/c"},
         new int[] {0, 0, 0},
         new int[] {1, 3, 5},
-        new int[] {1, 0, 0},
+        new int[] {1, 1, 1},
         path.length());
   }
 
@@ -86,7 +87,7 @@ public void testStartOfCharEndOfDelimiter() throws Exception {
         new String[] {"a", "a/b", "a/b/c", "a/b/c/"},
         new int[] {0, 0, 0, 0},
         new int[] {1, 3, 5, 6},
-        new int[] {1, 0, 0, 0},
+        new int[] {1, 1, 1, 1},
         path.length());
   }
 
@@ -111,7 +112,7 @@ public void testOnlyDelimiters() throws Exception {
         new String[] {"/", "//"},
         new int[] {0, 0},
         new int[] {1, 2},
-        new int[] {1, 0},
+        new int[] {1, 1},
         path.length());
   }
 
@@ -125,7 +126,7 @@ public void testReplace() throws Exception {
         new String[] {"\\a", "\\a\\b", "\\a\\b\\c"},
         new int[] {0, 0, 0},
         new int[] {2, 4, 6},
-        new int[] {1, 0, 0},
+        new int[] {1, 1, 1},
         path.length());
   }
 
@@ -139,7 +140,7 @@ public void testWindowsPath() throws Exception {
         new String[] {"c:", "c:\\a", "c:\\a\\b", "c:\\a\\b\\c"},
         new int[] {0, 0, 0, 0},
         new int[] {2, 4, 6, 8},
-        new int[] {1, 0, 0, 0},
+        new int[] {1, 1, 1, 1},
         path.length());
   }
 
@@ -158,7 +159,7 @@ public void testNormalizeWinDelimToLinuxDelim() throws Exception {
         new String[] {"c:", "c:/a", "c:/a/b", "c:/a/b/c"},
         new int[] {0, 0, 0, 0},
         new int[] {2, 4, 6, 8},
-        new int[] {1, 0, 0, 0},
+        new int[] {1, 1, 1, 1},
         path.length());
   }
 
@@ -172,7 +173,7 @@ public void testBasicSkip() throws Exception {
         new String[] {"/b", "/b/c"},
         new int[] {2, 2},
         new int[] {4, 6},
-        new int[] {1, 0},
+        new int[] {1, 1},
         path.length());
   }
 
@@ -186,7 +187,7 @@ public void testEndOfDelimiterSkip() throws Exception {
         new String[] {"/b", "/b/c", "/b/c/"},
         new int[] {2, 2, 2},
         new int[] {4, 6, 7},
-        new int[] {1, 0, 0},
+        new int[] {1, 1, 1},
         path.length());
   }
 
@@ -200,7 +201,7 @@ public void testStartOfCharSkip() throws Exception {
         new String[] {"/b", "/b/c"},
         new int[] {1, 1},
         new int[] {3, 5},
-        new int[] {1, 0},
+        new int[] {1, 1},
         path.length());
   }
 
@@ -214,7 +215,7 @@ public void testStartOfCharEndOfDelimiterSkip() throws Exception {
         new String[] {"/b", "/b/c", "/b/c/"},
         new int[] {1, 1, 1},
         new int[] {3, 5, 6},
-        new int[] {1, 0, 0},
+        new int[] {1, 1, 1},
         path.length());
   }
 
@@ -270,4 +271,20 @@ protected TokenStreamComponents createComponents(String fieldName) {
     checkRandomData(random, a, 100 * RANDOM_MULTIPLIER, 1027, false, false);
     a.close();
   }
+
+  private final Analyzer analyzer =
+      new Analyzer() {
+        @Override
+        protected TokenStreamComponents createComponents(String fieldName) {
+          Tokenizer tokenizer = new PathHierarchyTokenizer();
+          return new TokenStreamComponents(tokenizer);
+        }
+      };
+
+  public void testTokenizerViaAnalyzerOutput() throws IOException {
+    assertAnalyzesTo(analyzer, "a/b/c", new String[] {"a", "a/b", "a/b/c"});
+    assertAnalyzesTo(analyzer, "a/b/c/", new String[] {"a", "a/b", "a/b/c", "a/b/c/"});
+    assertAnalyzesTo(analyzer, "/a/b/c", new String[] {"/a", "/a/b", "/a/b/c"});
+    assertAnalyzesTo(analyzer, "/a/b/c/", new String[] {"/a", "/a/b", "/a/b/c", "/a/b/c/"});
+  }
 }