From 66324f763fc7fb0d8e7cd6f334e5438f0171c84e Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Wed, 1 Nov 2023 21:40:44 +0100 Subject: [PATCH 01/19] Fix test failure. The test expects that opening a writer on 5 segments doesn't cause merging, but actually it does since randomization created a merge policy with a factor of 5. --- .../org/apache/lucene/index/TestDeletionPolicy.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java index f2b56868e6c7..ac2ff786dd96 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java @@ -459,7 +459,8 @@ public void testOpenPriorSnapshot() throws IOException { dir, newIndexWriterConfig(new MockAnalyzer(random())) .setIndexDeletionPolicy(policy) - .setIndexCommit(lastCommit)); + .setIndexCommit(lastCommit) + .setMergePolicy(newLogMergePolicy(10))); assertEquals(10, writer.getDocStats().numDocs); // Should undo our rollback: @@ -476,12 +477,13 @@ public void testOpenPriorSnapshot() throws IOException { dir, newIndexWriterConfig(new MockAnalyzer(random())) .setIndexDeletionPolicy(policy) - .setIndexCommit(lastCommit)); + .setIndexCommit(lastCommit) + .setMergePolicy(newLogMergePolicy(10))); assertEquals(10, writer.getDocStats().numDocs); // Commits the rollback: writer.close(); - // Now 8 because we made another commit + // Now 7 because we made another commit assertEquals(7, DirectoryReader.listCommits(dir).size()); r = DirectoryReader.open(dir); @@ -507,7 +509,10 @@ public void testOpenPriorSnapshot() throws IOException { // but this time keeping only the last commit: writer = new IndexWriter( - dir, newIndexWriterConfig(new MockAnalyzer(random())).setIndexCommit(lastCommit)); + dir, + newIndexWriterConfig(new MockAnalyzer(random())) + .setIndexCommit(lastCommit) + .setMergePolicy(newLogMergePolicy(10))); assertEquals(10, writer.getDocStats().numDocs); // Reader still sees fully merged index, because writer From 6bf2188b358469b5308cef6db9b848408c7d0d1f Mon Sep 17 00:00:00 2001 From: EC2 Default User Date: Thu, 2 Nov 2023 04:09:26 +0000 Subject: [PATCH 02/19] stabilize vectorutil benchmark --- .../benchmark/jmh/VectorUtilBenchmark.java | 36 ++++++++----------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java index b130bc3422fb..0ba817d410c8 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java @@ -24,8 +24,14 @@ @BenchmarkMode(Mode.Throughput) @OutputTimeUnit(TimeUnit.MICROSECONDS) @State(Scope.Benchmark) -@Warmup(iterations = 3, time = 3) -@Measurement(iterations = 5, time = 3) +// first iteration is complete garbage, so make sure we really warmup +@Warmup(iterations = 4, time = 1) +// real iterations. 
not useful to spend tons of time here, better to fork more +@Measurement(iterations = 5, time = 1) +// engage some noise reduction +@Fork( + value = 3, + jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"}) public class VectorUtilBenchmark { private byte[] bytesA; @@ -36,7 +42,7 @@ public class VectorUtilBenchmark { @Param({"1", "128", "207", "256", "300", "512", "702", "1024"}) int size; - @Setup(Level.Trial) + @Setup(Level.Iteration) public void init() { ThreadLocalRandom random = ThreadLocalRandom.current(); @@ -56,84 +62,72 @@ public void init() { } @Benchmark - @Fork(value = 1) public float binaryCosineScalar() { return VectorUtil.cosine(bytesA, bytesB); } @Benchmark - @Fork( - value = 1, - jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) + @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public float binaryCosineVector() { return VectorUtil.cosine(bytesA, bytesB); } @Benchmark - @Fork(value = 1) public int binaryDotProductScalar() { return VectorUtil.dotProduct(bytesA, bytesB); } @Benchmark - @Fork( - value = 1, - jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) + @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public int binaryDotProductVector() { return VectorUtil.dotProduct(bytesA, bytesB); } @Benchmark - @Fork(value = 1) public int binarySquareScalar() { return VectorUtil.squareDistance(bytesA, bytesB); } @Benchmark - @Fork( - value = 1, - jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) + @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public int binarySquareVector() { return VectorUtil.squareDistance(bytesA, bytesB); } @Benchmark - @Fork(value = 1) public float floatCosineScalar() { return VectorUtil.cosine(floatsA, floatsB); } @Benchmark @Fork( - value = 1, + value = 15, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public float floatCosineVector() { return VectorUtil.cosine(floatsA, floatsB); } @Benchmark - @Fork(value = 1) public float floatDotProductScalar() { return VectorUtil.dotProduct(floatsA, floatsB); } @Benchmark @Fork( - value = 1, + value = 15, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public float floatDotProductVector() { return VectorUtil.dotProduct(floatsA, floatsB); } @Benchmark - @Fork(value = 1) public float floatSquareScalar() { return VectorUtil.squareDistance(floatsA, floatsB); } @Benchmark @Fork( - value = 1, + value = 15, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public float floatSquareVector() { return VectorUtil.squareDistance(floatsA, floatsB); From 8400f89a9166ec7b74798dcbe4139bfb2a05503a Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Thu, 2 Nov 2023 08:49:41 +0100 Subject: [PATCH 03/19] Fix javac task inputs so that they include modular dependencies #12742 (#12745) Fix javac task inputs so that they include modular dependencies #12742 --- gradle/java/modules.gradle | 6 ++++++ lucene/CHANGES.txt | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/gradle/java/modules.gradle b/gradle/java/modules.gradle index f9ebac3d345b..cb8f7c8df34c 100644 --- a/gradle/java/modules.gradle +++ b/gradle/java/modules.gradle @@ -67,6 +67,12 @@ allprojects { tasks.named(sourceSet.getCompileJavaTaskName()).configure({ JavaCompile task -> task.dependsOn modularPaths.compileModulePathConfiguration + // GH-12742: add the modular path as inputs so that if anything changes, the task + // is not up to date and is re-run. I [dw] believe this should be a @Classpath parameter + // on the task itself... 
but I don't know how to implement this on an existing class. + // this is a workaround but should work just fine though. + task.inputs.files(modularPaths.compileModulePathConfiguration) + // LUCENE-10327: don't allow gradle to emit an empty sourcepath as it would break // compilation of modules. task.options.setSourcepath(sourceSet.java.sourceDirectories) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6a92ae5c6e20..76293f6dbbd9 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -278,7 +278,11 @@ Bug Fixes Build --------------------- +* GITHUB#12742: JavaCompile tasks may be in up-to-date state when modular dependencies have changed + leading to odd runtime errors (Chris Hostetter, Dawid Weiss) + * GITHUB#12612: Upgrade forbiddenapis to version 3.6 and ASM for APIJAR extraction to 9.6. (Uwe Schindler) + * GITHUB#12655: Upgrade to Gradle 8.4 (Kevin Risden) Other From cdc7d87fcc321f63b6e9d9f8eceb295b1c919bd5 Mon Sep 17 00:00:00 2001 From: Dzung Bui Date: Thu, 2 Nov 2023 17:34:36 +0900 Subject: [PATCH 04/19] Clean up UnCompiledNode.inputCount (#12735) * Clean up inputCount * Update CHANGES.txt --- lucene/CHANGES.txt | 4 +++- .../org/apache/lucene/util/fst/FSTCompiler.java | 16 ---------------- .../org/apache/lucene/util/fst/TestFSTs.java | 1 - 3 files changed, 3 insertions(+), 18 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 76293f6dbbd9..09e34a55e3e0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -62,9 +62,11 @@ API Changes * GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera) -* GITHUB#12709 Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods +* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. 
Created FSTReader which contains the common methods of the two (Anh Dung Bui) +* GITHUB#12735: Remove FSTCompiler#getTermCount() and FSTCompiler.UnCompiledNode#inputCount (Anh Dung Bui) + New Features --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index f17c220f83d2..3af624100708 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -270,10 +270,6 @@ public float getDirectAddressingMaxOversizingFactor() { return directAddressingMaxOversizingFactor; } - public long getTermCount() { - return frontier[0].inputCount; - } - public long getNodeCount() { // 1+ in order to count the -1 implicit final node return 1 + nodeCount; @@ -749,7 +745,6 @@ public void add(IntsRef input, T output) throws IOException { // format cannot represent the empty input since // 'finalness' is stored on the incoming arc, not on // the node - frontier[0].inputCount++; frontier[0].isFinal = true; fst.setEmptyOutput(output); return; @@ -760,9 +755,6 @@ public void add(IntsRef input, T output) throws IOException { int pos2 = input.offset; final int pos1Stop = Math.min(lastInput.length(), input.length); while (true) { - frontier[pos1].inputCount++; - // System.out.println(" incr " + pos1 + " ct=" + frontier[pos1].inputCount + " n=" + - // frontier[pos1]); if (pos1 >= pos1Stop || lastInput.intAt(pos1) != input.ints[pos2]) { break; } @@ -786,7 +778,6 @@ public void add(IntsRef input, T output) throws IOException { // init tail states for current input for (int idx = prefixLenPlus1; idx <= input.length; idx++) { frontier[idx - 1].addArc(input.ints[input.offset + idx - 1], frontier[idx]); - frontier[idx].inputCount++; } final UnCompiledNode lastNode = frontier[input.length]; @@ -835,8 +826,6 @@ public void add(IntsRef input, T output) throws IOException { // save last input lastInput.copyInts(input); - - // System.out.println(" count[0]=" + frontier[0].inputCount); } private boolean validOutput(T output) { @@ -906,10 +895,6 @@ static final class UnCompiledNode implements Node { T output; boolean isFinal; - // TODO: remove this tracking? we used to use it for confusingly pruning NodeHash, but - // we switched to LRU by RAM usage instead: - long inputCount; - /** This node's depth, starting from the automaton root. */ final int depth; @@ -935,7 +920,6 @@ void clear() { numArcs = 0; isFinal = false; output = owner.NO_OUTPUT; - inputCount = 0; // We don't clear the depth here because it never changes // for nodes on the frontier (even when reused). 
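
Since GITHUB#12735 removes FSTCompiler#getTermCount() together with UnCompiledNode#inputCount, a caller that still wants the number of added inputs simply counts them itself (the TestFSTs change below only deletes the assertion because `ord` already carried that count). Purely as an illustration of that caller-side pattern (the class and method names here are invented, and an already-configured FSTCompiler<Long> plus pre-sorted input is assumed), a minimal sketch:

    import java.io.IOException;
    import java.util.Map;
    import java.util.SortedMap;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.IntsRefBuilder;
    import org.apache.lucene.util.fst.FST;
    import org.apache.lucene.util.fst.FSTCompiler;
    import org.apache.lucene.util.fst.Util;

    class CallerSideTermCount {
      // Feeds pre-sorted (term -> output) pairs into the compiler and tracks the count on
      // the caller side, since FSTCompiler no longer exposes getTermCount().
      static FST<Long> addAll(FSTCompiler<Long> fstCompiler, SortedMap<BytesRef, Long> entries)
          throws IOException {
        IntsRefBuilder scratch = new IntsRefBuilder();
        long termCount = 0;
        for (Map.Entry<BytesRef, Long> e : entries.entrySet()) {
          // FST inputs must arrive in sorted order; a SortedMap over BytesRef guarantees that.
          fstCompiler.add(Util.toIntsRef(e.getKey(), scratch), e.getValue());
          termCount++; // the statistic now lives with the caller
        }
        System.out.println("added " + termCount + " terms");
        return fstCompiler.compile();
      }
    }
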
diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index 927fe058ef05..f6dd84efd0e6 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -568,7 +568,6 @@ public void run(int limit, boolean verify) throws IOException { System.out.println( ((tMid - tStart) / (double) TimeUnit.SECONDS.toNanos(1)) + " sec to add all terms"); - assert fstCompiler.getTermCount() == ord; FST fst = fstCompiler.compile(); long tEnd = System.nanoTime(); System.out.println( From cbb5b6e331fa986bf539a4812fb1e65e85bec941 Mon Sep 17 00:00:00 2001 From: lujiefsi Date: Thu, 2 Nov 2023 18:53:48 +0800 Subject: [PATCH 05/19] LUCENE-10144:fix resource leak due to Files.list (#354) * LUCENE-10144:fix resource leak due to Files.list * LUCENE-10144:fix resource leak due to Files.list --- .../org/apache/lucene/gradle/datasets/ExtractReuters.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java b/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java index b8d6735c9089..34f046ffbe36 100644 --- a/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java +++ b/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java @@ -27,6 +27,7 @@ import java.nio.file.StandardCopyOption; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Stream; /** * Split the Reuters SGML documents into Simple Text files containing: @@ -44,9 +45,10 @@ public ExtractReuters(Path reutersDir, Path outputDir) throws IOException { public void extract() throws IOException { long count = 0; Files.createDirectories(outputDir); - - if (Files.list(outputDir).count() > 0) { - throw new IOException("The output directory must be empty: " + outputDir); + try(Stream files = Files.list(outputDir)) { + if (files.count() > 0) { + throw new IOException("The output directory must be empty: " + outputDir); + } } try (DirectoryStream stream = Files.newDirectoryStream(reutersDir, "*.sgm")) { From d62fb5309ea591e7832e702c8c83b72d53c608a7 Mon Sep 17 00:00:00 2001 From: xiaoshi Date: Thu, 2 Nov 2023 19:02:31 +0800 Subject: [PATCH 06/19] LUCENE-10100: configuration items of the alg file are adapted to the 9.0 branch (#301) --- lucene/benchmark/conf/analyzer.alg | 4 ++-- lucene/benchmark/conf/collector-small.alg | 2 +- lucene/benchmark/conf/collector.alg | 2 +- lucene/benchmark/conf/compound-penalty.alg | 4 ++-- .../conf/english-porter-comparison.alg | 3 ++- lucene/benchmark/conf/facets.alg | 3 ++- lucene/benchmark/conf/highlights.alg | 3 ++- .../indexing-flush-by-RAM-multithreaded.alg | 4 ++-- .../benchmark/conf/indexing-flush-by-RAM.alg | 4 ++-- .../benchmark/conf/indexing-multithreaded.alg | 4 ++-- lucene/benchmark/conf/indexing.alg | 4 ++-- .../conf/micro-standard-flush-by-ram.alg | 4 ++-- lucene/benchmark/conf/sample.alg | 4 ++-- lucene/benchmark/conf/shingle.alg | 3 ++- lucene/benchmark/conf/sloppy-phrase.alg | 3 ++- lucene/benchmark/conf/sort-standard.alg | 3 ++- .../benchmark/conf/standard-flush-by-RAM.alg | 4 ++-- lucene/benchmark/conf/standard.alg | 4 ++-- lucene/benchmark/conf/wstok.alg | 3 ++- .../byTask/tasks/NewAnalyzerTask.java | 18 +++++++----------- 20 files changed, 43 insertions(+), 40 deletions(-) diff --git a/lucene/benchmark/conf/analyzer.alg b/lucene/benchmark/conf/analyzer.alg index 
497ec3d216d8..4ed777915bd6 100644 --- a/lucene/benchmark/conf/analyzer.alg +++ b/lucene/benchmark/conf/analyzer.alg @@ -32,8 +32,8 @@ doc.tokenized=true doc.term.vector=false log.step=500 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/collector-small.alg b/lucene/benchmark/conf/collector-small.alg index 763cb0454ad8..e57ee8646b11 100644 --- a/lucene/benchmark/conf/collector-small.alg +++ b/lucene/benchmark/conf/collector-small.alg @@ -21,7 +21,7 @@ # Fully Qualified Class Name of a Collector with a empty constructor # topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs # topScoreDocUnordered - Like above, but allows out of order -collector.class=coll:topScoreDocOrdered:topScoreDocUnordered:topScoreDocOrdered:topScoreDocUnordered +collector.class=coll:topScoreDoc analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer directory=FSDirectory diff --git a/lucene/benchmark/conf/collector.alg b/lucene/benchmark/conf/collector.alg index d85582a7ba29..e2843492dcab 100644 --- a/lucene/benchmark/conf/collector.alg +++ b/lucene/benchmark/conf/collector.alg @@ -21,7 +21,7 @@ # Fully Qualified Class Name of a Collector with a empty constructor # topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs # topScoreDocUnordered - Like above, but allows out of order -collector.class=coll:topScoreDocOrdered:topScoreDocUnordered:topScoreDocOrdered:topScoreDocUnordered +collector.class=coll:topScoreDoc analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer directory=FSDirectory diff --git a/lucene/benchmark/conf/compound-penalty.alg b/lucene/benchmark/conf/compound-penalty.alg index 06b2821f04b9..8626baa571a2 100644 --- a/lucene/benchmark/conf/compound-penalty.alg +++ b/lucene/benchmark/conf/compound-penalty.alg @@ -37,8 +37,8 @@ doc.term.vector=vector:true:true:false:false log.step=500 log.step.DeleteDoc=100 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/english-porter-comparison.alg b/lucene/benchmark/conf/english-porter-comparison.alg index e83f04a8dae2..e391c0b0d8d8 100644 --- a/lucene/benchmark/conf/english-porter-comparison.alg +++ b/lucene/benchmark/conf/english-porter-comparison.alg @@ -20,7 +20,8 @@ content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource doc.tokenized=false doc.body.tokenized=true -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 -AnalyzerFactory(name:original-porter-stemmer,StandardTokenizer, EnglishPossessiveFilter,LowerCaseFilter,StopFilter, diff --git a/lucene/benchmark/conf/facets.alg b/lucene/benchmark/conf/facets.alg index 63e7cac73748..32d7270e3b49 100644 --- a/lucene/benchmark/conf/facets.alg +++ b/lucene/benchmark/conf/facets.alg @@ -30,7 +30,8 @@ doc.tokenized=true doc.term.vector=false log.step=1000 -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/highlights.alg b/lucene/benchmark/conf/highlights.alg index 88b056ecee40..7c5fd7d73378 100644 --- a/lucene/benchmark/conf/highlights.alg +++ b/lucene/benchmark/conf/highlights.alg @@ -30,7 +30,8 @@ doc.term.vector.offsets=false doc.term.vector.positions=false log.step=2000 -docs.dir=reuters-out +work.dir=data 
+docs.dir=reuters21578 content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg b/lucene/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg index 43a6c91bbebb..d86e182a172f 100644 --- a/lucene/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg +++ b/lucene/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg @@ -32,8 +32,8 @@ doc.tokenized=true doc.term.vector=false log.step=2000 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/indexing-flush-by-RAM.alg b/lucene/benchmark/conf/indexing-flush-by-RAM.alg index 0b6c79762ef5..0a911c940863 100644 --- a/lucene/benchmark/conf/indexing-flush-by-RAM.alg +++ b/lucene/benchmark/conf/indexing-flush-by-RAM.alg @@ -32,8 +32,8 @@ doc.tokenized=true doc.term.vector=false log.step=2000 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/indexing-multithreaded.alg b/lucene/benchmark/conf/indexing-multithreaded.alg index 1d2e18e260dd..b34b8266178a 100644 --- a/lucene/benchmark/conf/indexing-multithreaded.alg +++ b/lucene/benchmark/conf/indexing-multithreaded.alg @@ -32,8 +32,8 @@ doc.tokenized=true doc.term.vector=false log.step=2000 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/indexing.alg b/lucene/benchmark/conf/indexing.alg index e31f87185b16..b4a4d92fc26c 100644 --- a/lucene/benchmark/conf/indexing.alg +++ b/lucene/benchmark/conf/indexing.alg @@ -32,8 +32,8 @@ doc.tokenized=true doc.term.vector=false log.step=2000 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/micro-standard-flush-by-ram.alg b/lucene/benchmark/conf/micro-standard-flush-by-ram.alg index 993e58a883d8..d4a22f12495e 100644 --- a/lucene/benchmark/conf/micro-standard-flush-by-ram.alg +++ b/lucene/benchmark/conf/micro-standard-flush-by-ram.alg @@ -31,8 +31,8 @@ doc.tokenized=true doc.term.vector=false log.step=500 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/sample.alg b/lucene/benchmark/conf/sample.alg index 4f93230bfc78..aa63293de6c7 100644 --- a/lucene/benchmark/conf/sample.alg +++ b/lucene/benchmark/conf/sample.alg @@ -42,8 +42,8 @@ doc.tokenized=true doc.term.vector=false log.step=500 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource #content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/shingle.alg b/lucene/benchmark/conf/shingle.alg index 
b0744341c76a..67b513064a1e 100644 --- a/lucene/benchmark/conf/shingle.alg +++ b/lucene/benchmark/conf/shingle.alg @@ -16,7 +16,8 @@ content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource doc.tokenized=false doc.body.tokenized=true -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 log.step=1000 -AnalyzerFactory(name:shingle-bigrams-unigrams, diff --git a/lucene/benchmark/conf/sloppy-phrase.alg b/lucene/benchmark/conf/sloppy-phrase.alg index 4d06d6fdbe0f..4c49ddd59e61 100644 --- a/lucene/benchmark/conf/sloppy-phrase.alg +++ b/lucene/benchmark/conf/sloppy-phrase.alg @@ -30,7 +30,8 @@ doc.tokenized=true doc.term.vector=false log.step=500 -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 #docs.dir=reuters-111 content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource diff --git a/lucene/benchmark/conf/sort-standard.alg b/lucene/benchmark/conf/sort-standard.alg index 48cae964dbee..08c7b90b0cdc 100644 --- a/lucene/benchmark/conf/sort-standard.alg +++ b/lucene/benchmark/conf/sort-standard.alg @@ -31,7 +31,8 @@ doc.tokenized=true doc.term.vector=false log.step=100000 -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource diff --git a/lucene/benchmark/conf/standard-flush-by-RAM.alg b/lucene/benchmark/conf/standard-flush-by-RAM.alg index 3ceed106fae7..c3cb2789b987 100644 --- a/lucene/benchmark/conf/standard-flush-by-RAM.alg +++ b/lucene/benchmark/conf/standard-flush-by-RAM.alg @@ -31,8 +31,8 @@ doc.tokenized=true doc.term.vector=false log.step=2000 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/standard.alg b/lucene/benchmark/conf/standard.alg index 4d0b0480ffe7..4885593954b7 100644 --- a/lucene/benchmark/conf/standard.alg +++ b/lucene/benchmark/conf/standard.alg @@ -31,8 +31,8 @@ doc.tokenized=true doc.term.vector=false log.step=2000 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/wstok.alg b/lucene/benchmark/conf/wstok.alg index c43759032c3f..ab6a6593c5ce 100644 --- a/lucene/benchmark/conf/wstok.alg +++ b/lucene/benchmark/conf/wstok.alg @@ -18,7 +18,8 @@ content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource doc.tokenized=false doc.body.tokenized=true -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 -AnalyzerFactory(name:WhitespaceTokenizer, WhitespaceTokenizer(rule:java)) diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java index 2248756998e9..032019f1e4ed 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java +++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java @@ -23,9 +23,9 @@ import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory; -import 
org.apache.lucene.util.Version; /** * Create a new {@link org.apache.lucene.analysis.Analyzer} and set it in the getRunData() for use @@ -42,17 +42,13 @@ public NewAnalyzerTask(PerfRunData runData) { public static final Analyzer createAnalyzer(String className) throws Exception { final Class clazz = Class.forName(className).asSubclass(Analyzer.class); - try { - // first try to use a ctor with version parameter (needed for many new Analyzers that have no - // default one anymore - Constructor cnstr = clazz.getConstructor(Version.class); - return cnstr.newInstance(Version.LATEST); - } catch ( - @SuppressWarnings("unused") - NoSuchMethodException nsme) { - // otherwise use default ctor - return clazz.getConstructor().newInstance(); + Constructor cnstr; + if (className.equals("org.apache.lucene.analysis.core.StopAnalyzer")) { + cnstr = clazz.getConstructor(CharArraySet.class); + return cnstr.newInstance(CharArraySet.EMPTY_SET); } + cnstr = clazz.getConstructor(); + return cnstr.newInstance(); } @Override From 5f1c72680781834062134819006466b92a0beeb8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=C3=B8ydahl?= Date: Thu, 2 Nov 2023 12:42:45 +0100 Subject: [PATCH 07/19] ReleaseWizard - Upgrade 'consolemenu' dependency to v0.7.1 (#11855) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ported from https://github.com/apache/solr/pull/1020 Also pin python versions in requirements.txt to avoid unexpected incompatibilties in the future Co-authored-by: Jan Høydahl --- dev-tools/scripts/releaseWizard.py | 138 +++++------------------------ dev-tools/scripts/requirements.txt | 16 ++-- 2 files changed, 31 insertions(+), 123 deletions(-) diff --git a/dev-tools/scripts/releaseWizard.py b/dev-tools/scripts/releaseWizard.py index b57eefb50d9a..2fe72a65f2c7 100755 --- a/dev-tools/scripts/releaseWizard.py +++ b/dev-tools/scripts/releaseWizard.py @@ -63,7 +63,6 @@ import scriptutil from consolemenu import ConsoleMenu from consolemenu.items import FunctionItem, SubmenuItem, ExitItem -from consolemenu.screen import Screen from scriptutil import BranchType, Version, download, run # Lucene-to-Java version mapping @@ -654,8 +653,8 @@ def get_title(self): return "%s%s (%d/%d)" % (prefix, self.title, self.num_done(), self.num_applies()) def get_submenu(self): - menu = UpdatableConsoleMenu(title=self.title, subtitle=self.get_subtitle, prologue_text=self.get_description(), - screen=MyScreen()) + menu = ConsoleMenu(title=self.title, subtitle=self.get_subtitle, prologue_text=self.get_description(), + clear_screen=False) menu.exit_item = CustomExitItem("Return") for todo in self.get_todos(): if todo.applies(state.release_type): @@ -663,7 +662,7 @@ def get_submenu(self): return menu def get_menu_item(self): - item = UpdatableSubmenuItem(self.get_title, self.get_submenu()) + item = SubmenuItem(self.get_title, self.get_submenu()) return item def get_todos(self): @@ -820,7 +819,7 @@ def display_and_confirm(self): print("ERROR while executing todo %s (%s)" % (self.get_title(), e)) def get_menu_item(self): - return UpdatableFunctionItem(self.get_title, self.display_and_confirm) + return FunctionItem(self.get_title, self.display_and_confirm) def clone(self): clone = Todo(self.id, self.title, description=self.description) @@ -1234,104 +1233,6 @@ def pause(fun=None): input("\nPress ENTER to continue...") -# Custom classes for ConsoleMenu, to make menu texts dynamic -# Needed until https://github.com/aegirhall/console-menu/pull/25 is released -# See https://pypi.org/project/console-menu/ for 
other docs - -class UpdatableConsoleMenu(ConsoleMenu): - - def __repr__(self): - return "%s: %s. %d items" % (self.get_title(), self.get_subtitle(), len(self.items)) - - def draw(self): - """ - Refreshes the screen and redraws the menu. Should be called whenever something changes that needs to be redrawn. - """ - self.screen.printf(self.formatter.format(title=self.get_title(), subtitle=self.get_subtitle(), items=self.items, - prologue_text=self.get_prologue_text(), epilogue_text=self.get_epilogue_text())) - - # Getters to get text in case method reference - def get_title(self): - return self.title() if callable(self.title) else self.title - - def get_subtitle(self): - return self.subtitle() if callable(self.subtitle) else self.subtitle - - def get_prologue_text(self): - return self.prologue_text() if callable(self.prologue_text) else self.prologue_text - - def get_epilogue_text(self): - return self.epilogue_text() if callable(self.epilogue_text) else self.epilogue_text - - -class UpdatableSubmenuItem(SubmenuItem): - def __init__(self, text, submenu, menu=None, should_exit=False): - """ - :ivar ConsoleMenu self.submenu: The submenu to be opened when this item is selected - """ - super(UpdatableSubmenuItem, self).__init__(text=text, menu=menu, should_exit=should_exit, submenu=submenu) - - if menu: - self.get_submenu().parent = menu - - def show(self, index): - return "%2d - %s" % (index + 1, self.get_text()) - - # Getters to get text in case method reference - def get_text(self): - return self.text() if callable(self.text) else self.text - - def set_menu(self, menu): - """ - Sets the menu of this item. - Should be used instead of directly accessing the menu attribute for this class. - - :param ConsoleMenu menu: the menu - """ - self.menu = menu - self.get_submenu().parent = menu - - def action(self): - """ - This class overrides this method - """ - self.get_submenu().start() - - def clean_up(self): - """ - This class overrides this method - """ - self.get_submenu().join() - self.menu.clear_screen() - self.menu.resume() - - def get_return(self): - """ - :return: The returned value in the submenu - """ - return self.get_submenu().returned_value - - def get_submenu(self): - """ - We unwrap the submenu variable in case it is a reference to a method that returns a submenu - """ - return self.submenu if not callable(self.submenu) else self.submenu() - - -class UpdatableFunctionItem(FunctionItem): - def show(self, index): - return "%2d - %s" % (index + 1, self.get_text()) - - # Getters to get text in case method reference - def get_text(self): - return self.text() if callable(self.text) else self.text - - -class MyScreen(Screen): - def clear(self): - return - - class CustomExitItem(ExitItem): def show(self, index): return super(CustomExitItem, self).show(index) @@ -1346,6 +1247,13 @@ def main(): global templates print("Lucene releaseWizard v%s" % getScriptVersion()) + + try: + ConsoleMenu(clear_screen=True) + except Exception as e: + sys.exit("You need to install 'consolemenu' package version 0.7.1 for the Wizard to function. 
Please run 'pip " + "install -r requirements.txt'") + c = parse_config() if c.dry: @@ -1402,18 +1310,18 @@ def main(): lucene_news_file = os.path.join(state.get_website_git_folder(), 'content', 'core', 'core_news', "%s-%s-available.md" % (state.get_release_date_iso(), state.release_version.replace(".", "-"))) - main_menu = UpdatableConsoleMenu(title="Lucene ReleaseWizard", + main_menu = ConsoleMenu(title="Lucene ReleaseWizard", subtitle=get_releasing_text, prologue_text="Welcome to the release wizard. From here you can manage the process including creating new RCs. " "All changes are persisted, so you can exit any time and continue later. Make sure to read the Help section.", epilogue_text="® 2022 The Lucene project. Licensed under the Apache License 2.0\nScript version v%s)" % getScriptVersion(), - screen=MyScreen()) + clear_screen=False) - todo_menu = UpdatableConsoleMenu(title=get_releasing_text, + todo_menu = ConsoleMenu(title=get_releasing_text, subtitle=get_subtitle, prologue_text=None, epilogue_text=None, - screen=MyScreen()) + clear_screen=False) todo_menu.exit_item = CustomExitItem("Return") for todo_group in state.todo_groups: @@ -1422,14 +1330,14 @@ def main(): menu_item.set_menu(todo_menu) todo_menu.append_item(menu_item) - main_menu.append_item(UpdatableSubmenuItem(get_todo_menuitem_title, todo_menu, menu=main_menu)) - main_menu.append_item(UpdatableFunctionItem(get_start_new_rc_menu_title, start_new_rc)) - main_menu.append_item(UpdatableFunctionItem('Clear and restart current RC', state.clear_rc)) - main_menu.append_item(UpdatableFunctionItem("Clear all state, restart the %s release" % state.release_version, reset_state)) - main_menu.append_item(UpdatableFunctionItem('Start release for a different version', release_other_version)) - main_menu.append_item(UpdatableFunctionItem('Generate Asciidoc guide for this release', generate_asciidoc)) - # main_menu.append_item(UpdatableFunctionItem('Dump YAML', dump_yaml)) - main_menu.append_item(UpdatableFunctionItem('Help', help)) + main_menu.append_item(SubmenuItem(get_todo_menuitem_title, todo_menu, menu=main_menu)) + main_menu.append_item(FunctionItem(get_start_new_rc_menu_title, start_new_rc)) + main_menu.append_item(FunctionItem('Clear and restart current RC', state.clear_rc)) + main_menu.append_item(FunctionItem("Clear all state, restart the %s release" % state.release_version, reset_state)) + main_menu.append_item(FunctionItem('Start release for a different version', release_other_version)) + main_menu.append_item(FunctionItem('Generate Asciidoc guide for this release', generate_asciidoc)) + # main_menu.append_item(FunctionItem('Dump YAML', dump_yaml)) + main_menu.append_item(FunctionItem('Help', help)) main_menu.show() diff --git a/dev-tools/scripts/requirements.txt b/dev-tools/scripts/requirements.txt index b8a124b8f828..0617ad153f54 100644 --- a/dev-tools/scripts/requirements.txt +++ b/dev-tools/scripts/requirements.txt @@ -1,8 +1,8 @@ -six>=1.11.0 -Jinja2>=2.10.1 -PyYAML>=5.1 -holidays>=0.9.10 -ics>=0.4 -console-menu>=0.5.1 -PyGithub -jira \ No newline at end of file +six~=1.16.0 +Jinja2~=3.1.1 +PyYAML~=6.0 +holidays~=0.16 +ics~=0.7.2 +console-menu~=0.7.1 +PyGithub~=1.56 +jira~=3.4.1 \ No newline at end of file From 96b5edd3bbf4d3e420995aa1107f59085a37e49a Mon Sep 17 00:00:00 2001 From: luyuncheng Date: Thu, 2 Nov 2023 20:25:06 +0800 Subject: [PATCH 08/19] Remove unnecessary sort in writeFieldUpdates (#12273) * Update ReadersAndUpdates.java Reduce unnecessary sort in writeFieldUpdates * Update ReadersAndUpdates.java precommit 
import --- .../src/java/org/apache/lucene/index/ReadersAndUpdates.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java index 0f579b9d266a..9713923916bb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java @@ -18,8 +18,6 @@ import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -555,8 +553,6 @@ public synchronized boolean writeFieldUpdates( FieldInfos fieldInfos = null; boolean any = false; for (List updates : pendingDVUpdates.values()) { - // Sort by increasing delGen: - Collections.sort(updates, Comparator.comparingLong(a -> a.delGen)); for (DocValuesFieldUpdates update : updates) { if (update.delGen <= maxDelGen && update.any()) { any = true; From 4b3f7662ce880204632ff5dabf55d5326e064703 Mon Sep 17 00:00:00 2001 From: tang donghai Date: Thu, 2 Nov 2023 20:29:22 +0800 Subject: [PATCH 09/19] unify exception thrown by regexp & check repetition range (#12277) * unify exception thrown by regexp & check repetition range * check m equals -1 or not * needn't rethrow IllegalArgumentException --- .../java/org/apache/lucene/util/automaton/RegExp.java | 4 ++++ .../org/apache/lucene/util/automaton/TestRegExp.java | 11 +++++++++++ 2 files changed, 15 insertions(+) diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java index ed1688efd301..0d17a6fcab47 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java @@ -1128,6 +1128,10 @@ else if (match('{')) { if (start != pos) m = Integer.parseInt(originalString.substring(start, pos)); } else m = n; if (!match('}')) throw new IllegalArgumentException("expected '}' at position " + pos); + if (m != -1 && n > m) { + throw new IllegalArgumentException( + "invalid repetition range(out of order): " + n + ".." + m); + } if (m == -1) e = makeRepeat(flags, e, n); else e = makeRepeat(flags, e, n, m); } diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java index c960e7363047..8f6f765f2936 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java @@ -86,6 +86,17 @@ public void testLegalBackslashChars() { } } + public void testParseIllegalRepeatExp() { + // out of order + IllegalArgumentException expected = + expectThrows( + IllegalArgumentException.class, + () -> { + new RegExp("a{99,11}"); + }); + assertTrue(expected.getMessage().contains("out of order")); + } + static String randomDocValue(int minLength) { String charPalette = "AAAaaaBbbCccc123456 \t"; StringBuilder sb = new StringBuilder(); From 5b87a31556a5b3da7b5b1bafbabdb186dbd0b2da Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 2 Nov 2023 14:16:11 +0100 Subject: [PATCH 10/19] Speed up sorting on unique string fields. (#11903) Since increasing the number of hits retrieved in nightly benchmarks from 10 to 100, the performance of sorting documents by title dropped back to the level it had before introducing dynamic pruning. 
This is not too surprising given that the `title` field is a unique field, so the optimization would only kick in when the current 100th hit would have an ordinal that is less than 128 - something that would only happen after collecting most hits. This change increases the threshold to 1024, so that the optimization would kick in when the current 100th hit has an ordinal that is less than 1024, something that happens a bit sooner. --- lucene/CHANGES.txt | 2 ++ .../apache/lucene/search/comparators/TermOrdValComparator.java | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 09e34a55e3e0..953e1b1cbb9d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -253,6 +253,8 @@ Optimizations * GITHUB#12719: Top-level conjunctions that are not sorted by score now have a specialized bulk scorer. (Adrien Grand) +* GITHUB#11903: Faster sort on high-cardinality string fields. (Adrien Grand) + Changes in runtime behavior --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java index 548bbb401b20..616b8cf7a7bc 100644 --- a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java @@ -475,7 +475,7 @@ private static class PostingsEnumAndOrd { private class CompetitiveIterator extends DocIdSetIterator { - private static final int MAX_TERMS = 128; + private static final int MAX_TERMS = 1024; private final LeafReaderContext context; private final int maxDoc; From 30db21704849c5fad7ae8319f961d35f71e74f32 Mon Sep 17 00:00:00 2001 From: twosom <72733442+twosom@users.noreply.github.com> Date: Thu, 2 Nov 2023 23:10:44 +0900 Subject: [PATCH 11/19] move CSVUtil to common from analyzer nori and kuromoji (#12390) Closes #12389 --- .../apache/lucene/analysis/util}/CSVUtil.java | 6 +- .../lucene/analysis/util}/TestCSVUtil.java | 3 +- .../ja/dict/TokenInfoDictionaryBuilder.java | 1 + .../dict/TokenInfoDictionaryEntryWriter.java | 1 + .../ja/dict/UnknownDictionaryBuilder.java | 1 + .../analysis/ja/dict/UserDictionary.java | 1 + .../analysis/ja/dict/UserMorphData.java | 2 + .../ja/dict/TestUnknownDictionary.java | 1 + .../lucene/analysis/ko/dict/CSVUtil.java | 93 ------------------- .../ko/dict/TokenInfoDictionaryBuilder.java | 1 + .../dict/TokenInfoDictionaryEntryWriter.java | 1 + .../ko/dict/UnknownDictionaryBuilder.java | 1 + .../ko/dict/TestUnknownDictionary.java | 1 + 13 files changed, 15 insertions(+), 98 deletions(-) rename lucene/analysis/{kuromoji/src/java/org/apache/lucene/analysis/ja/dict => common/src/java/org/apache/lucene/analysis/util}/CSVUtil.java (94%) rename lucene/analysis/{kuromoji/src/test/org/apache/lucene/analysis/ja => common/src/test/org/apache/lucene/analysis/util}/TestCSVUtil.java (95%) delete mode 100644 lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CSVUtil.java diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/CSVUtil.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CSVUtil.java similarity index 94% rename from lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/CSVUtil.java rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CSVUtil.java index e3662f291ca2..36d6e0560e77 100644 --- 
a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/CSVUtil.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CSVUtil.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.analysis.ja.dict; +package org.apache.lucene.analysis.util; import java.util.ArrayList; import java.util.regex.Matcher; @@ -69,7 +69,7 @@ public static String[] parse(String line) { return new String[0]; } - return result.toArray(new String[result.size()]); + return result.toArray(new String[0]); } private static String unQuoteUnEscape(String original) { @@ -83,7 +83,7 @@ private static String unQuoteUnEscape(String original) { } // Unescape - if (result.indexOf(ESCAPED_QUOTE) >= 0) { + if (result.contains(ESCAPED_QUOTE)) { result = result.replace(ESCAPED_QUOTE, "\""); } } diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestCSVUtil.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCSVUtil.java similarity index 95% rename from lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestCSVUtil.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCSVUtil.java index 8cc6fb66e5aa..85901ca0e46a 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestCSVUtil.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCSVUtil.java @@ -14,10 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.analysis.ja; +package org.apache.lucene.analysis.util; import java.io.IOException; -import org.apache.lucene.analysis.ja.dict.CSVUtil; import org.apache.lucene.tests.util.LuceneTestCase; /* diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java index 80b1cef6c327..5a16db673ced 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java index e5270b32844c..4bdfe5095a6c 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java @@ -20,6 +20,7 @@ import java.io.OutputStream; import java.nio.ByteBuffer; import org.apache.lucene.analysis.morph.DictionaryEntryWriter; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.ArrayUtil; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java 
b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java index a367c49ca4da..ba5bc0e6a058 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java @@ -25,6 +25,7 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.List; +import org.apache.lucene.analysis.util.CSVUtil; class UnknownDictionaryBuilder { private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*"; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java index 52604c4e1952..de69c726ee2a 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java @@ -26,6 +26,7 @@ import java.util.Map; import java.util.TreeMap; import org.apache.lucene.analysis.morph.Dictionary; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserMorphData.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserMorphData.java index be895f1268a6..6bc4dc72d28a 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserMorphData.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserMorphData.java @@ -19,6 +19,8 @@ import static org.apache.lucene.analysis.ja.dict.UserDictionary.CUSTOM_DICTIONARY_WORD_ID_OFFSET; import static org.apache.lucene.analysis.ja.dict.UserDictionary.INTERNAL_SEPARATOR; +import org.apache.lucene.analysis.util.CSVUtil; + /** Morphological information for user dictionary. */ final class UserMorphData implements JaMorphData { public static final int WORD_COST = -100000; diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java index 5ccdaa6b926c..2d245c7a599c 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.analysis.ja.dict; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.tests.util.LuceneTestCase; import org.junit.Test; diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CSVUtil.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CSVUtil.java deleted file mode 100644 index b9e3ff9483bc..000000000000 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CSVUtil.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.lucene.analysis.ko.dict; - -import java.util.ArrayList; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** Utility class for parsing CSV text */ -public final class CSVUtil { - private static final char QUOTE = '"'; - - private static final char COMMA = ','; - - private static final Pattern QUOTE_REPLACE_PATTERN = Pattern.compile("^\"([^\"]+)\"$"); - - private static final String ESCAPED_QUOTE = "\"\""; - - private CSVUtil() {} // no instance!!! - - /** - * Parse CSV line - * - * @param line line containing csv-encoded data - * @return Array of values - */ - public static String[] parse(String line) { - boolean insideQuote = false; - ArrayList result = new ArrayList<>(); - int quoteCount = 0; - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < line.length(); i++) { - char c = line.charAt(i); - - if (c == QUOTE) { - insideQuote = !insideQuote; - quoteCount++; - } - - if (c == COMMA && !insideQuote) { - String value = sb.toString(); - value = unQuoteUnEscape(value); - result.add(value); - sb.setLength(0); - continue; - } - - sb.append(c); - } - - result.add(sb.toString()); - - // Validate - if (quoteCount % 2 != 0) { - return new String[0]; - } - - return result.toArray(new String[0]); - } - - private static String unQuoteUnEscape(String original) { - String result = original; - - // Unquote - if (result.indexOf('\"') >= 0) { - Matcher m = QUOTE_REPLACE_PATTERN.matcher(original); - if (m.matches()) { - result = m.group(1); - } - - // Unescape - if (result.contains(ESCAPED_QUOTE)) { - result = result.replace(ESCAPED_QUOTE, "\""); - } - } - - return result; - } -} diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryBuilder.java index 3726f9e6673b..e3db26b08b82 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryBuilder.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryEntryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryEntryWriter.java index f7ee696a1970..95ce0277a9d5 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryEntryWriter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryEntryWriter.java @@ -24,6 +24,7 @@ import java.util.List; import org.apache.lucene.analysis.ko.POS; import org.apache.lucene.analysis.morph.DictionaryEntryWriter; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.store.DataOutput; import 
org.apache.lucene.util.ArrayUtil; diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionaryBuilder.java index 1004ab89581d..71099b2f0737 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionaryBuilder.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionaryBuilder.java @@ -25,6 +25,7 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.List; +import org.apache.lucene.analysis.util.CSVUtil; class UnknownDictionaryBuilder { private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1801,3559,3677,SY,*,*,*,*,*,*,*"; diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUnknownDictionary.java b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUnknownDictionary.java index dbce890deda1..13190b21a73a 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUnknownDictionary.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUnknownDictionary.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.analysis.ko.dict; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.tests.util.LuceneTestCase; import org.junit.Test; From 2d50c345fea3d1a64090d6d0cffef6b70d482a9f Mon Sep 17 00:00:00 2001 From: zhouhui Date: Thu, 2 Nov 2023 22:34:52 +0800 Subject: [PATCH 12/19] Fix comment on decode method in PForUtil (#12495) * Fix comment on decode method. * Fix comment on decode method(lucene84). --- .../org/apache/lucene/backward_codecs/lucene84/PForUtil.java | 2 +- .../src/java/org/apache/lucene/codecs/lucene90/PForUtil.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene84/PForUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene84/PForUtil.java index 4e99d3a9f5cf..690bfa501f4b 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene84/PForUtil.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene84/PForUtil.java @@ -92,7 +92,7 @@ void encode(long[] longs, DataOutput out) throws IOException { out.writeBytes(exceptions, exceptions.length); } - /** Decode 128 integers into {@code ints}. */ + /** Decode 128 integers into {@code longs}. */ void decode(DataInput in, long[] longs) throws IOException { final int token = Byte.toUnsignedInt(in.readByte()); final int bitsPerValue = token & 0x1f; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java index eb735c84b83f..211912142a45 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java @@ -116,7 +116,7 @@ void encode(long[] longs, DataOutput out) throws IOException { out.writeBytes(exceptions, exceptions.length); } - /** Decode 128 integers into {@code ints}. */ + /** Decode 128 integers into {@code longs}. 
*/ void decode(DataInput in, long[] longs) throws IOException { final int token = Byte.toUnsignedInt(in.readByte()); final int bitsPerValue = token & 0x1f; From 4c9e241db3aed9158f152078ec2fd8f07706ab2a Mon Sep 17 00:00:00 2001 From: Michael Sokolov Date: Thu, 2 Nov 2023 14:49:52 -0300 Subject: [PATCH 13/19] Remove or repurpose obsolete JIRA tasks from release wizard (#11833) --- dev-tools/scripts/releaseWizard.yaml | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/dev-tools/scripts/releaseWizard.yaml b/dev-tools/scripts/releaseWizard.yaml index a25407c4e275..ec7f61774302 100644 --- a/dev-tools/scripts/releaseWizard.yaml +++ b/dev-tools/scripts/releaseWizard.yaml @@ -521,7 +521,7 @@ groups: addition wait a couple more days? Merges of bug fixes into the branch may become more difficult. * Only Github issues with Milestone {{ release_version_major }}.{{ release_version_minor }} - and priority "Blocker" will delay a release candidate build. + will delay a release candidate build. ---- types: - major @@ -979,8 +979,8 @@ groups: title: Publish docs, changes and javadocs description: | Ensure your refrigerator has at least 2 beers - the svn import operation can take a while, - depending on your upload bandwidth. We'll publish this directly to the production tree. - At the end of the task, the two links below shall work. + depending on your upload bandwidth. We'll publish this directly to the production tree. At + the end of the task, the two links below shall work. links: - http://lucene.apache.org/core/{{ version }} vars: @@ -1126,12 +1126,18 @@ groups: comment: Push all changes logfile: push-website.log post_description: | - Wait a few minutes for the build to happen. You can follow the site build at https://ci2.apache.org/#/builders/3 - and view the staged site at https://lucene.staged.apache.org - Verify that correct links and versions are mentioned in download pages, download buttons etc. - If you find anything wrong, then commit and push any changes and check again. - - Next step is to merge the changes to branch 'production' in order to publish the site. + Wait a few minutes for the build to happen. You can follow the site build at + https://ci2.apache.org/#/builders/3 and view the staged site at + https://lucene.staged.apache.org Verify that correct links and versions are mentioned in + download pages, download buttons etc. If you find anything wrong, then commit and push any + changes and check again. You may find that the publish fails, leaving a directory listing + instead a beautiful website. If this happens, check the "builder" link and click through into + its details to find possible error messages produced by the website publication process. You + may have produced malformed Markdown. Or the website publish may just fail for some reason out + of your control. If this happens, you can attempt to retrigger the publishing with some + innocuous changes. Next step is to merge the changes to branch 'production' in order to + publish the site. Before doing this, you may want to replenish your stock of beers, or get + stronger stuff. links: - https://ci2.apache.org/#/builders/3 - https://lucene.staged.apache.org @@ -1159,7 +1165,8 @@ groups: post_description: | Wait a few minutes for the build to happen. You can follow the site build at https://ci2.apache.org/#/builders/3 - Verify on https://lucene.apache.org that the site is OK. + Verify on https://lucene.apache.org that the site is OK. 
It really should be, but see staging + site publication instructions for possible debugging/recovery options if it is not. You can now also verify that http://lucene.apache.org/core/api/core/ redirects to the latest version links: From 43a568cb3370fc783726fdb3cf8df8c123e0d3d2 Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 3 Nov 2023 08:55:15 +0100 Subject: [PATCH 14/19] LUCENE-10560: Faster merging of TermsEnum (#1052) Closes #11596 --- lucene/CHANGES.txt | 3 + .../org/apache/lucene/index/MultiTerms.java | 14 +- .../apache/lucene/index/MultiTermsEnum.java | 75 +++---- .../org/apache/lucene/index/OrdinalMap.java | 36 ++-- .../apache/lucene/index/TermsEnumIndex.java | 183 ++++++++++++++++++ .../lucene/index/TestTermsEnumIndex.java | 67 +++++++ 6 files changed, 297 insertions(+), 81 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/index/TermsEnumIndex.java create mode 100644 lucene/core/src/test/org/apache/lucene/index/TestTermsEnumIndex.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 953e1b1cbb9d..039afcba68de 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -253,8 +253,11 @@ Optimizations * GITHUB#12719: Top-level conjunctions that are not sorted by score now have a specialized bulk scorer. (Adrien Grand) +* GITHUB#1052: Faster merging of terms enums. (Adrien Grand) + * GITHUB#11903: Faster sort on high-cardinality string fields. (Adrien Grand) + Changes in runtime behavior --------------------- diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiTerms.java b/lucene/core/src/java/org/apache/lucene/index/MultiTerms.java index f56989d5a622..7f61da627ec2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiTerms.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiTerms.java @@ -136,17 +136,16 @@ public ReaderSlice[] getSubSlices() { @Override public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { - final List termsEnums = new ArrayList<>(); + final List termsEnums = new ArrayList<>(); for (int i = 0; i < subs.length; i++) { final TermsEnum termsEnum = subs[i].intersect(compiled, startTerm); if (termsEnum != null) { - termsEnums.add(new MultiTermsEnum.TermsEnumIndex(termsEnum, i)); + termsEnums.add(new TermsEnumIndex(termsEnum, i)); } } if (termsEnums.size() > 0) { - return new MultiTermsEnum(subSlices) - .reset(termsEnums.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY)); + return new MultiTermsEnum(subSlices).reset(termsEnums.toArray(TermsEnumIndex.EMPTY_ARRAY)); } else { return TermsEnum.EMPTY; } @@ -181,17 +180,16 @@ public BytesRef getMax() throws IOException { @Override public TermsEnum iterator() throws IOException { - final List termsEnums = new ArrayList<>(); + final List termsEnums = new ArrayList<>(); for (int i = 0; i < subs.length; i++) { final TermsEnum termsEnum = subs[i].iterator(); if (termsEnum != null) { - termsEnums.add(new MultiTermsEnum.TermsEnumIndex(termsEnum, i)); + termsEnums.add(new TermsEnumIndex(termsEnum, i)); } } if (termsEnums.size() > 0) { - return new MultiTermsEnum(subSlices) - .reset(termsEnums.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY)); + return new MultiTermsEnum(subSlices).reset(termsEnums.toArray(TermsEnumIndex.EMPTY_ARRAY)); } else { return TermsEnum.EMPTY; } diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java index b849b07cec90..f4cbb4cc1f5b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java +++ 
b/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java @@ -36,7 +36,7 @@ public final class MultiTermsEnum extends BaseTermsEnum { new Comparator() { @Override public int compare(TermsEnumWithSlice o1, TermsEnumWithSlice o2) { - return o1.index - o2.index; + return o1.subIndex - o2.subIndex; } }; @@ -56,17 +56,6 @@ public int compare(TermsEnumWithSlice o1, TermsEnumWithSlice o2) { private int numSubs; private BytesRef current; - static class TermsEnumIndex { - public static final TermsEnumIndex[] EMPTY_ARRAY = new TermsEnumIndex[0]; - final int subIndex; - final TermsEnum termsEnum; - - public TermsEnumIndex(TermsEnum termsEnum, int subIndex) { - this.termsEnum = termsEnum; - this.subIndex = subIndex; - } - } - /** Returns how many sub-reader slices contain the current term. @see #getMatchArray */ public int getMatchCount() { return numTop; @@ -114,10 +103,10 @@ public TermsEnum reset(TermsEnumIndex[] termsEnumsIndex) throws IOException { final TermsEnumIndex termsEnumIndex = termsEnumsIndex[i]; assert termsEnumIndex != null; - final BytesRef term = termsEnumIndex.termsEnum.next(); + final BytesRef term = termsEnumIndex.next(); if (term != null) { final TermsEnumWithSlice entry = subs[termsEnumIndex.subIndex]; - entry.reset(termsEnumIndex.termsEnum, term); + entry.reset(termsEnumIndex); queue.add(entry); currentSubs[numSubs++] = entry; } else { @@ -154,7 +143,7 @@ public boolean seekExact(BytesRef term) throws IOException { // Doing so is a waste because this sub will simply // seek to the same spot. if (seekOpt) { - final BytesRef curTerm = currentSubs[i].current; + final BytesRef curTerm = currentSubs[i].term(); if (curTerm != null) { final int cmp = term.compareTo(curTerm); if (cmp == 0) { @@ -162,19 +151,19 @@ public boolean seekExact(BytesRef term) throws IOException { } else if (cmp < 0) { status = false; } else { - status = currentSubs[i].terms.seekExact(term); + status = currentSubs[i].seekExact(term); } } else { status = false; } } else { - status = currentSubs[i].terms.seekExact(term); + status = currentSubs[i].seekExact(term); } if (status) { top[numTop++] = currentSubs[i]; - current = currentSubs[i].current = currentSubs[i].terms.term(); - assert term.equals(currentSubs[i].current); + current = currentSubs[i].term(); + assert term.equals(currentSubs[i].term()); } } @@ -206,7 +195,7 @@ public SeekStatus seekCeil(BytesRef term) throws IOException { // Doing so is a waste because this sub will simply // seek to the same spot. 
if (seekOpt) { - final BytesRef curTerm = currentSubs[i].current; + final BytesRef curTerm = currentSubs[i].term(); if (curTerm != null) { final int cmp = term.compareTo(curTerm); if (cmp == 0) { @@ -214,28 +203,25 @@ public SeekStatus seekCeil(BytesRef term) throws IOException { } else if (cmp < 0) { status = SeekStatus.NOT_FOUND; } else { - status = currentSubs[i].terms.seekCeil(term); + status = currentSubs[i].seekCeil(term); } } else { status = SeekStatus.END; } } else { - status = currentSubs[i].terms.seekCeil(term); + status = currentSubs[i].seekCeil(term); } if (status == SeekStatus.FOUND) { top[numTop++] = currentSubs[i]; - current = currentSubs[i].current = currentSubs[i].terms.term(); + current = currentSubs[i].term(); queue.add(currentSubs[i]); } else { if (status == SeekStatus.NOT_FOUND) { - currentSubs[i].current = currentSubs[i].terms.term(); - assert currentSubs[i].current != null; + assert currentSubs[i].term() != null; queue.add(currentSubs[i]); } else { assert status == SeekStatus.END; - // enum exhausted - currentSubs[i].current = null; } } } @@ -269,15 +255,14 @@ private void pullTop() { // top term assert numTop == 0; numTop = queue.fillTop(top); - current = top[0].current; + current = top[0].term(); } private void pushTop() throws IOException { // call next() on each top, and reorder queue for (int i = 0; i < numTop; i++) { TermsEnumWithSlice top = queue.top(); - top.current = top.terms.next(); - if (top.current == null) { + if (top.next() == null) { queue.pop(); } else { queue.updateTop(); @@ -320,7 +305,7 @@ public BytesRef next() throws IOException { public int docFreq() throws IOException { int sum = 0; for (int i = 0; i < numTop; i++) { - sum += top[i].terms.docFreq(); + sum += top[i].termsEnum.docFreq(); } return sum; } @@ -329,7 +314,7 @@ public int docFreq() throws IOException { public long totalTermFreq() throws IOException { long sum = 0; for (int i = 0; i < numTop; i++) { - final long v = top[i].terms.totalTermFreq(); + final long v = top[i].termsEnum.totalTermFreq(); assert v != -1; sum += v; } @@ -359,12 +344,12 @@ public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { final TermsEnumWithSlice entry = top[i]; - assert entry.index < docsEnum.subPostingsEnums.length - : entry.index + " vs " + docsEnum.subPostingsEnums.length + "; " + subs.length; + assert entry.subIndex < docsEnum.subPostingsEnums.length + : entry.subIndex + " vs " + docsEnum.subPostingsEnums.length + "; " + subs.length; final PostingsEnum subPostingsEnum = - entry.terms.postings(docsEnum.subPostingsEnums[entry.index], flags); + entry.termsEnum.postings(docsEnum.subPostingsEnums[entry.subIndex], flags); assert subPostingsEnum != null; - docsEnum.subPostingsEnums[entry.index] = subPostingsEnum; + docsEnum.subPostingsEnums[entry.subIndex] = subPostingsEnum; subDocs[upto].postingsEnum = subPostingsEnum; subDocs[upto].slice = entry.subSlice; upto++; @@ -379,26 +364,18 @@ public ImpactsEnum impacts(int flags) throws IOException { return new SlowImpactsEnum(postings(null, flags)); } - static final class TermsEnumWithSlice { + static final class TermsEnumWithSlice extends TermsEnumIndex { private final ReaderSlice subSlice; - TermsEnum terms; - public BytesRef current; - final int index; public TermsEnumWithSlice(int index, ReaderSlice subSlice) { + super(null, index); this.subSlice = subSlice; - this.index = index; assert subSlice.length >= 0 : "length=" + subSlice.length; } - public void reset(TermsEnum terms, BytesRef term) { - this.terms = terms; - current = term; - } 
- @Override public String toString() { - return subSlice.toString() + ":" + terms; + return subSlice.toString() + ":" + super.toString(); } } @@ -413,7 +390,7 @@ private static final class TermMergeQueue extends PriorityQueue { - public TermsEnumIndex(TermsEnum termsEnum, int subIndex) { - this.termsEnum = termsEnum; - this.subIndex = subIndex; + TermsEnumPriorityQueue(int size) { + super(size); } - public BytesRef next() throws IOException { - currentTerm = termsEnum.next(); - return currentTerm; + @Override + protected boolean lessThan(TermsEnumIndex a, TermsEnumIndex b) { + return a.compareTermTo(b) < 0; } } @@ -227,13 +221,7 @@ public static OrdinalMap build( long[] segmentOrds = new long[subs.length]; // Just merge-sorts by term: - PriorityQueue queue = - new PriorityQueue(subs.length) { - @Override - protected boolean lessThan(TermsEnumIndex a, TermsEnumIndex b) { - return a.currentTerm.compareTo(b.currentTerm) < 0; - } - }; + TermsEnumPriorityQueue queue = new TermsEnumPriorityQueue(subs.length); for (int i = 0; i < subs.length; i++) { TermsEnumIndex sub = new TermsEnumIndex(subs[segmentMap.newToOld(i)], i); @@ -242,19 +230,18 @@ protected boolean lessThan(TermsEnumIndex a, TermsEnumIndex b) { } } - BytesRefBuilder scratch = new BytesRefBuilder(); + TermsEnumIndex.TermState topState = new TermsEnumIndex.TermState(); long globalOrd = 0; while (queue.size() != 0) { TermsEnumIndex top = queue.top(); - scratch.copyBytes(top.currentTerm); + topState.copyFrom(top); int firstSegmentIndex = Integer.MAX_VALUE; long globalOrdDelta = Long.MAX_VALUE; // Advance past this term, recording the per-segment ord deltas: while (true) { - top = queue.top(); long segmentOrd = top.termsEnum.ord(); long delta = globalOrd - segmentOrd; int segmentIndex = top.subIndex; @@ -284,10 +271,11 @@ protected boolean lessThan(TermsEnumIndex a, TermsEnumIndex b) { if (queue.size() == 0) { break; } + top = queue.top(); } else { - queue.updateTop(); + top = queue.updateTop(); } - if (queue.top().currentTerm.equals(scratch.get()) == false) { + if (top.termEquals(topState) == false) { break; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsEnumIndex.java b/lucene/core/src/java/org/apache/lucene/index/TermsEnumIndex.java new file mode 100644 index 000000000000..57a5d5ae0cba --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/index/TermsEnumIndex.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.index; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Objects; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** + * Wrapper around a {@link TermsEnum} and an integer that identifies it. All operations that move + * the current position of the {@link TermsEnum} must be performed via this wrapper class, not + * directly on the wrapped {@link TermsEnum}. + */ +class TermsEnumIndex { + + static final TermsEnumIndex[] EMPTY_ARRAY = new TermsEnumIndex[0]; + + /** + * Copy the first 8 bytes of the given term as a comparable unsigned long. In case the term has + * less than 8 bytes, missing bytes will be replaced with zeroes. Note that two terms that produce + * the same long could still be different due to the fact that missing bytes are replaced with + * zeroes, e.g. {@code [1, 0]} and {@code [1]} get mapped to the same long. + */ + static long prefix8ToComparableUnsignedLong(BytesRef term) { + // Use Big Endian so that longs are comparable + if (term.length >= Long.BYTES) { + return (long) BitUtil.VH_BE_LONG.get(term.bytes, term.offset); + } else { + long l; + int o; + if (Integer.BYTES <= term.length) { + l = (int) BitUtil.VH_BE_INT.get(term.bytes, term.offset); + o = Integer.BYTES; + } else { + l = 0; + o = 0; + } + if (o + Short.BYTES <= term.length) { + l = + (l << Short.SIZE) + | Short.toUnsignedLong( + (short) BitUtil.VH_BE_SHORT.get(term.bytes, term.offset + o)); + o += Short.BYTES; + } + if (o < term.length) { + l = (l << Byte.SIZE) | Byte.toUnsignedLong(term.bytes[term.offset + o]); + } + l <<= (Long.BYTES - term.length) << 3; + return l; + } + } + + final int subIndex; + TermsEnum termsEnum; + private BytesRef currentTerm; + private long currentTermPrefix8; + + TermsEnumIndex(TermsEnum termsEnum, int subIndex) { + this.termsEnum = termsEnum; + this.subIndex = subIndex; + } + + BytesRef term() { + return currentTerm; + } + + private void setTerm(BytesRef term) { + currentTerm = term; + if (currentTerm == null) { + currentTermPrefix8 = 0; + } else { + currentTermPrefix8 = prefix8ToComparableUnsignedLong(currentTerm); + } + } + + BytesRef next() throws IOException { + BytesRef term = termsEnum.next(); + setTerm(term); + return term; + } + + SeekStatus seekCeil(BytesRef term) throws IOException { + SeekStatus status = termsEnum.seekCeil(term); + if (status == SeekStatus.END) { + setTerm(null); + } else { + setTerm(termsEnum.term()); + } + return status; + } + + boolean seekExact(BytesRef term) throws IOException { + boolean found = termsEnum.seekExact(term); + if (found) { + setTerm(termsEnum.term()); + } else { + setTerm(null); + } + return found; + } + + void seekExact(long ord) throws IOException { + termsEnum.seekExact(ord); + setTerm(termsEnum.term()); + } + + void reset(TermsEnumIndex tei) throws IOException { + termsEnum = tei.termsEnum; + currentTerm = tei.currentTerm; + currentTermPrefix8 = tei.currentTermPrefix8; + } + + int compareTermTo(TermsEnumIndex that) { + if (currentTermPrefix8 != that.currentTermPrefix8) { + int cmp = Long.compareUnsigned(currentTermPrefix8, that.currentTermPrefix8); + assert Integer.signum(cmp) + == Integer.signum( + Arrays.compareUnsigned( + currentTerm.bytes, + currentTerm.offset, + currentTerm.offset + currentTerm.length, + that.currentTerm.bytes, + that.currentTerm.offset, + that.currentTerm.offset + that.currentTerm.length)); + return cmp; + } + + 
return Arrays.compareUnsigned( + currentTerm.bytes, + currentTerm.offset, + currentTerm.offset + currentTerm.length, + that.currentTerm.bytes, + that.currentTerm.offset, + that.currentTerm.offset + that.currentTerm.length); + } + + @Override + public String toString() { + return Objects.toString(termsEnum); + } + + /** Wrapper around a term that allows for quick equals comparisons. */ + static class TermState { + private final BytesRefBuilder term = new BytesRefBuilder(); + private long termPrefix8; + + void copyFrom(TermsEnumIndex tei) { + term.copyBytes(tei.term()); + termPrefix8 = tei.currentTermPrefix8; + } + } + + boolean termEquals(TermState that) { + if (currentTermPrefix8 != that.termPrefix8) { + return false; + } + return Arrays.equals( + currentTerm.bytes, + currentTerm.offset, + currentTerm.offset + currentTerm.length, + that.term.bytes(), + 0, + that.term.length()); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnumIndex.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnumIndex.java new file mode 100644 index 000000000000..ac964052718f --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnumIndex.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.index; + +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; + +public class TestTermsEnumIndex extends LuceneTestCase { + + public void testPrefix8ToComparableUnsignedLong() { + byte[] b = new byte[] {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + assertEquals(0L, TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 1, 0))); + assertEquals(4L << 56, TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 1))); + assertEquals( + (4L << 56) | (5L << 48), + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 2))); + assertEquals( + (4L << 56) | (5L << 48) | (6L << 40), + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 3))); + assertEquals( + (4L << 56) | (5L << 48) | (6L << 40) | (7L << 32), + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 4))); + assertEquals( + (4L << 56) | (5L << 48) | (6L << 40) | (7L << 32) | (8L << 24), + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 5))); + assertEquals( + (4L << 56) | (5L << 48) | (6L << 40) | (7L << 32) | (8L << 24) | (9L << 16), + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 6))); + assertEquals( + (4L << 56) | (5L << 48) | (6L << 40) | (7L << 32) | (8L << 24) | (9L << 16) | (10L << 8), + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 7))); + assertEquals( + (4L << 56) + | (5L << 48) + | (6L << 40) + | (7L << 32) + | (8L << 24) + | (9L << 16) + | (10L << 8) + | 11L, + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 8))); + assertEquals( + (4L << 56) + | (5L << 48) + | (6L << 40) + | (7L << 32) + | (8L << 24) + | (9L << 16) + | (10L << 8) + | 11L, + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 9))); + } +} From 10ebd9025ae7defa486a6aaf8676246ed6d65cce Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Fri, 3 Nov 2023 09:08:34 +0100 Subject: [PATCH 15/19] Fix test after #12549 take 2. 
--- .../test/org/apache/lucene/index/TestIndexWriter.java | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 1990ce93deb3..bd71fedb05ee 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -2395,11 +2395,12 @@ public void testHasUncommittedChanges() throws IOException { writer.addDocument(doc); assertTrue(writer.hasUncommittedChanges()); - // Must commit, waitForMerges, commit again, to be - // certain that hasUncommittedChanges returns false: - writer.commit(); - writer.waitForMerges(); - writer.commit(); + // Must commit and wait for merges as long as the commit triggers merges to be certain that + // hasUncommittedChanges returns false + do { + writer.waitForMerges(); + writer.commit(); + } while (writer.hasPendingMerges()); assertFalse(writer.hasUncommittedChanges()); writer.deleteDocuments(new Term("id", "xyz")); assertTrue(writer.hasUncommittedChanges()); From 1f3f3ae14f7ff07090675ef8cb5c6c193b7b6018 Mon Sep 17 00:00:00 2001 From: Mike McCandless Date: Fri, 3 Nov 2023 07:45:45 -0400 Subject: [PATCH 16/19] fix comment: Test2BFST was exaggerating a bit about how big an FST it was really making --- lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java index 54653b9d2560..eac2fae1ef48 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java @@ -56,7 +56,7 @@ public void test() throws Exception { for (int iter = 0; iter < 1; iter++) { // Build FST w/ NoOutputs and stop when nodeCount > 2.2B { - System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS"); + System.out.println("\nTEST: ~2.2B nodes; output=NO_OUTPUTS"); Outputs outputs = NoOutputs.getSingleton(); Object NO_OUTPUT = outputs.getNoOutput(); final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); From d6836d3d0e5d33a98b35c0885b9787f46c4be47e Mon Sep 17 00:00:00 2001 From: Dawid Weiss Date: Fri, 3 Nov 2023 17:05:17 +0100 Subject: [PATCH 17/19] tests.multiplier could be omitted in failed test reproduce line (#12752) The default tests.multiplier passed from gradle was 1, but LuceneTestCase tried to compute its default value from TESTS_NIGHTLY. This could lead to subtle errors: nightly mode failures would not report tests.multipler=1 and when started from the IDE, the tests.multiplier would be set to 2 (leading to different randomness). --- gradle/testing/randomization.gradle | 2 +- lucene/CHANGES.txt | 3 +++ .../java/org/apache/lucene/tests/util/LuceneTestCase.java | 7 ++++++- .../lucene/tests/util/RunListenerPrintReproduceInfo.java | 3 ++- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle index 30aad60e9e0f..1b56044e071e 100644 --- a/gradle/testing/randomization.gradle +++ b/gradle/testing/randomization.gradle @@ -67,7 +67,7 @@ allprojects { // seed, repetition and amplification. 
[propName: 'tests.seed', value: { -> rootSeed }, description: "Sets the master randomization seed."], [propName: 'tests.iters', value: null, description: "Duplicate (re-run) each test case N times."], - [propName: 'tests.multiplier', value: 1, description: "Value multiplier for randomized tests."], + [propName: 'tests.multiplier', value: null, description: "Value multiplier for randomized tests."], [propName: 'tests.maxfailures', value: null, description: "Skip tests after a given number of failures."], [propName: 'tests.timeoutSuite', value: null, description: "Timeout (in millis) for an entire suite."], [propName: 'tests.failfast', value: "false", description: "Stop the build early on failure.", buildOnly: true], diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 039afcba68de..7467b9900808 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -285,6 +285,9 @@ Bug Fixes Build --------------------- +* GITHUB#12752: tests.multiplier could be omitted in test failure reproduce lines (esp. in + nightly mode). (Dawid Weiss) + * GITHUB#12742: JavaCompile tasks may be in up-to-date state when modular dependencies have changed leading to odd runtime errors (Chris Hostetter, Dawid Weiss) diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java index 50f87ad0a194..5b114ff9a497 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java @@ -476,7 +476,12 @@ public abstract class LuceneTestCase extends Assert { * of iterations to scale your tests (for nightly builds). */ public static final int RANDOM_MULTIPLIER = - systemPropertyAsInt("tests.multiplier", TEST_NIGHTLY ? 2 : 1); + systemPropertyAsInt("tests.multiplier", defaultRandomMultiplier()); + + /** Compute the default value of the random multiplier (based on {@link #TEST_NIGHTLY}). */ + static int defaultRandomMultiplier() { + return TEST_NIGHTLY ? 2 : 1; + } /** Leave temporary files on disk, even on successful runs. */ public static final boolean LEAVE_TEMPORARY; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/RunListenerPrintReproduceInfo.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/RunListenerPrintReproduceInfo.java index 9fa50a204173..753f27a15c85 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/RunListenerPrintReproduceInfo.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/RunListenerPrintReproduceInfo.java @@ -189,7 +189,8 @@ private void reportAdditionalFailureInfo(final String testName) { addVmOpt(b, "tests.seed", RandomizedContext.current().getRunnerSeedAsString()); // Test groups and multipliers. 
- if (RANDOM_MULTIPLIER > 1) addVmOpt(b, "tests.multiplier", RANDOM_MULTIPLIER); + if (RANDOM_MULTIPLIER != LuceneTestCase.defaultRandomMultiplier()) + addVmOpt(b, "tests.multiplier", RANDOM_MULTIPLIER); if (TEST_NIGHTLY) addVmOpt(b, SYSPROP_NIGHTLY, TEST_NIGHTLY); if (TEST_WEEKLY) addVmOpt(b, SYSPROP_WEEKLY, TEST_WEEKLY); if (TEST_MONSTER) addVmOpt(b, SYSPROP_MONSTER, TEST_MONSTER); From a35573eed960870fe8fd3ea9bc5ebc68dfb4c9c5 Mon Sep 17 00:00:00 2001 From: Uwe Schindler Date: Fri, 3 Nov 2023 20:55:36 +0100 Subject: [PATCH 18/19] Refactor access to VM options and move some VM options to oal.util.Constants (#12754) --- lucene/CHANGES.txt | 3 + .../vectorization/VectorizationProvider.java | 23 +---- .../org/apache/lucene/util/Constants.java | 82 ++++++++++++----- .../apache/lucene/util/HotspotVMOptions.java | 90 +++++++++++++++++++ .../apache/lucene/util/RamUsageEstimator.java | 62 ++----------- .../PanamaVectorUtilSupport.java | 34 +------ .../PanamaVectorizationProvider.java | 3 +- 7 files changed, 164 insertions(+), 133 deletions(-) create mode 100644 lucene/core/src/java/org/apache/lucene/util/HotspotVMOptions.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 7467b9900808..5fae1e4b88f0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -210,6 +210,9 @@ Improvements * GITHUB#12689: TaskExecutor to cancel all tasks on exception to avoid needless computation. (Luca Cavanna) +* GITHUB#12754: Refactor lookup of Hotspot VM options and do not initialize constants with NULL + if SecurityManager prevents access. (Uwe Schindler) + Optimizations --------------------- * GITHUB#12183: Make TermStates#build concurrent. (Shubham Chaudhary) diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java index ed4066a94ac4..3d565b650a9b 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java @@ -21,8 +21,6 @@ import java.lang.StackWalker.StackFrame; import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodType; -import java.security.AccessController; -import java.security.PrivilegedAction; import java.util.Locale; import java.util.Objects; import java.util.Optional; @@ -31,7 +29,7 @@ import java.util.function.Predicate; import java.util.logging.Logger; import java.util.stream.Stream; -import org.apache.lucene.util.SuppressForbidden; +import org.apache.lucene.util.Constants; import org.apache.lucene.util.VectorUtil; /** @@ -129,7 +127,7 @@ static VectorizationProvider lookup(boolean testMode) { "Vector bitsize and/or integer vectors enforcement; using default vectorization provider outside of testMode"); return new DefaultVectorizationProvider(); } - if (isClientVM()) { + if (Constants.IS_CLIENT_VM) { LOG.warning("C2 compiler is disabled; Java vector incubator API can't be enabled"); return new DefaultVectorizationProvider(); } @@ -188,23 +186,6 @@ private static boolean isAffectedByJDK8301190() { && !Objects.equals("I", "i".toUpperCase(Locale.getDefault())); } - @SuppressWarnings("removal") - @SuppressForbidden(reason = "security manager") - private static boolean isClientVM() { - try { - final PrivilegedAction action = - () -> System.getProperty("java.vm.info", "").contains("emulated-client"); - return AccessController.doPrivileged(action); - } catch ( - @SuppressWarnings("unused") - SecurityException 
e) { - LOG.warning( - "SecurityManager denies permission to 'java.vm.info' system property, so state of C2 compiler can't be detected. " - + "In case of performance issues allow access to this property."); - return false; - } - } - // add all possible callers here as FQCN: private static final Set VALID_CALLERS = Set.of("org.apache.lucene.util.VectorUtil"); diff --git a/lucene/core/src/java/org/apache/lucene/util/Constants.java b/lucene/core/src/java/org/apache/lucene/util/Constants.java index 090472e6736e..3ef12986bb2b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/Constants.java +++ b/lucene/core/src/java/org/apache/lucene/util/Constants.java @@ -16,18 +16,25 @@ */ package org.apache.lucene.util; +import java.security.AccessController; +import java.security.PrivilegedAction; +import java.util.Objects; +import java.util.logging.Logger; + /** Some useful constants. */ public final class Constants { private Constants() {} // can't construct + private static final String UNKNOWN = "Unknown"; + /** JVM vendor info. */ - public static final String JVM_VENDOR = System.getProperty("java.vm.vendor"); + public static final String JVM_VENDOR = getSysProp("java.vm.vendor", UNKNOWN); /** JVM vendor name. */ - public static final String JVM_NAME = System.getProperty("java.vm.name"); + public static final String JVM_NAME = getSysProp("java.vm.name", UNKNOWN); /** The value of System.getProperty("os.name"). * */ - public static final String OS_NAME = System.getProperty("os.name"); + public static final String OS_NAME = getSysProp("os.name", UNKNOWN); /** True iff running on Linux. */ public static final boolean LINUX = OS_NAME.startsWith("Linux"); @@ -45,36 +52,67 @@ private Constants() {} // can't construct public static final boolean FREE_BSD = OS_NAME.startsWith("FreeBSD"); /** The value of System.getProperty("os.arch"). */ - public static final String OS_ARCH = System.getProperty("os.arch"); + public static final String OS_ARCH = getSysProp("os.arch", UNKNOWN); /** The value of System.getProperty("os.version"). */ - public static final String OS_VERSION = System.getProperty("os.version"); + public static final String OS_VERSION = getSysProp("os.version", UNKNOWN); /** The value of System.getProperty("java.vendor"). 
*/ - public static final String JAVA_VENDOR = System.getProperty("java.vendor"); + public static final String JAVA_VENDOR = getSysProp("java.vendor", UNKNOWN); + + /** True iff the Java runtime is a client runtime and C2 compiler is not enabled */ + public static final boolean IS_CLIENT_VM = + getSysProp("java.vm.info", "").contains("emulated-client"); /** True iff running on a 64bit JVM */ - public static final boolean JRE_IS_64BIT; + public static final boolean JRE_IS_64BIT = is64Bit(); + + /** true iff we know fast FMA is supported, to deliver less error */ + public static final boolean HAS_FAST_FMA = + (IS_CLIENT_VM == false) + && Objects.equals(OS_ARCH, "amd64") + && HotspotVMOptions.get("UseFMA").map(Boolean::valueOf).orElse(false); - static { - boolean is64Bit = false; - String datamodel = null; + private static boolean is64Bit() { + final String datamodel = getSysProp("sun.arch.data.model"); + if (datamodel != null) { + return datamodel.contains("64"); + } else { + return (OS_ARCH != null && OS_ARCH.contains("64")); + } + } + + private static String getSysProp(String property) { try { - datamodel = System.getProperty("sun.arch.data.model"); - if (datamodel != null) { - is64Bit = datamodel.contains("64"); - } + return doPrivileged(() -> System.getProperty(property)); } catch ( @SuppressWarnings("unused") - SecurityException ex) { + SecurityException se) { + logSecurityWarning(property); + return null; } - if (datamodel == null) { - if (OS_ARCH != null && OS_ARCH.contains("64")) { - is64Bit = true; - } else { - is64Bit = false; - } + } + + private static String getSysProp(String property, String def) { + try { + return doPrivileged(() -> System.getProperty(property, def)); + } catch ( + @SuppressWarnings("unused") + SecurityException se) { + logSecurityWarning(property); + return def; } - JRE_IS_64BIT = is64Bit; + } + + private static void logSecurityWarning(String property) { + var log = Logger.getLogger(Constants.class.getName()); + log.warning("SecurityManager prevented access to system property: " + property); + } + + // Extracted to a method to be able to apply the SuppressForbidden annotation + @SuppressWarnings("removal") + @SuppressForbidden(reason = "security manager") + private static T doPrivileged(PrivilegedAction action) { + return AccessController.doPrivileged(action); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/HotspotVMOptions.java b/lucene/core/src/java/org/apache/lucene/util/HotspotVMOptions.java new file mode 100644 index 000000000000..70f963e1b378 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/HotspotVMOptions.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.util; + +import java.lang.reflect.Method; +import java.util.Objects; +import java.util.Optional; +import java.util.function.Function; +import java.util.logging.Logger; + +/** Accessor to get Hotspot VM Options (if available). */ +final class HotspotVMOptions { + private HotspotVMOptions() {} // can't construct + + /** True if the Java VM is based on Hotspot and has the Hotspot MX bean readable by Lucene */ + public static final boolean IS_HOTSPOT; + + /** + * Returns an optional with the value of a Hotspot VM option. If the VM option does not exist or + * is not readable, returns an empty optional. + */ + public static Optional get(String name) { + return ACCESSOR.apply(Objects.requireNonNull(name, "name")); + } + + private static final String MANAGEMENT_FACTORY_CLASS = "java.lang.management.ManagementFactory"; + private static final String HOTSPOT_BEAN_CLASS = "com.sun.management.HotSpotDiagnosticMXBean"; + private static final Function> ACCESSOR; + + static { + boolean isHotspot = false; + Function> accessor = name -> Optional.empty(); + try { + final Class beanClazz = Class.forName(HOTSPOT_BEAN_CLASS); + // we use reflection for this, because the management factory is not part + // of java.base module: + final Object hotSpotBean = + Class.forName(MANAGEMENT_FACTORY_CLASS) + .getMethod("getPlatformMXBean", Class.class) + .invoke(null, beanClazz); + if (hotSpotBean != null) { + final Method getVMOptionMethod = beanClazz.getMethod("getVMOption", String.class); + final Method getValueMethod = getVMOptionMethod.getReturnType().getMethod("getValue"); + isHotspot = true; + accessor = + name -> { + try { + final Object vmOption = getVMOptionMethod.invoke(hotSpotBean, name); + return Optional.of(getValueMethod.invoke(vmOption).toString()); + } catch (@SuppressWarnings("unused") + ReflectiveOperationException + | RuntimeException e) { + return Optional.empty(); + } + }; + } + } catch (@SuppressWarnings("unused") ReflectiveOperationException | RuntimeException e) { + isHotspot = false; + final Logger log = Logger.getLogger(HotspotVMOptions.class.getName()); + final Module module = HotspotVMOptions.class.getModule(); + final ModuleLayer layer = module.getLayer(); + // classpath / unnamed module has no layer, so we need to check: + if (layer != null + && layer.findModule("jdk.management").map(module::canRead).orElse(false) == false) { + log.warning( + "Lucene cannot access JVM internals to optimize algorithms or calculate object sizes, unless the 'jdk.management' Java module " + + "is readable [please add 'jdk.management' to modular application either by command line or its module descriptor]."); + } else { + log.warning( + "Lucene cannot optimize algorithms or calculate object sizes for JVMs that are not based on Hotspot or a compatible implementation."); + } + } + IS_HOTSPOT = isHotspot; + ACCESSOR = accessor; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java b/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java index 1d363170a940..7e0bdfdee845 100644 --- a/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java +++ b/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java @@ -18,7 +18,6 @@ import java.lang.reflect.Array; import java.lang.reflect.Field; -import java.lang.reflect.Method; import java.lang.reflect.Modifier; import java.security.AccessControlException; import java.security.AccessController; @@ -30,7 +29,6 @@ import java.util.IdentityHashMap; import java.util.Locale; import 
java.util.Map; -import java.util.logging.Logger; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.Query; @@ -112,64 +110,16 @@ private RamUsageEstimator() {} /** For testing only */ static final boolean JVM_IS_HOTSPOT_64BIT; - static final String MANAGEMENT_FACTORY_CLASS = "java.lang.management.ManagementFactory"; - static final String HOTSPOT_BEAN_CLASS = "com.sun.management.HotSpotDiagnosticMXBean"; - /** Initialize constants and try to collect information about the JVM internals. */ static { - if (Constants.JRE_IS_64BIT) { + if (Constants.JRE_IS_64BIT && HotspotVMOptions.IS_HOTSPOT) { // Try to get compressed oops and object alignment (the default seems to be 8 on Hotspot); // (this only works on 64 bit, on 32 bits the alignment and reference size is fixed): - boolean compressedOops = false; - int objectAlignment = 8; - boolean isHotspot = false; - try { - final Class beanClazz = Class.forName(HOTSPOT_BEAN_CLASS); - // we use reflection for this, because the management factory is not part - // of Java 8's compact profile: - final Object hotSpotBean = - Class.forName(MANAGEMENT_FACTORY_CLASS) - .getMethod("getPlatformMXBean", Class.class) - .invoke(null, beanClazz); - if (hotSpotBean != null) { - isHotspot = true; - final Method getVMOptionMethod = beanClazz.getMethod("getVMOption", String.class); - try { - final Object vmOption = getVMOptionMethod.invoke(hotSpotBean, "UseCompressedOops"); - compressedOops = - Boolean.parseBoolean( - vmOption.getClass().getMethod("getValue").invoke(vmOption).toString()); - } catch (@SuppressWarnings("unused") ReflectiveOperationException | RuntimeException e) { - isHotspot = false; - } - try { - final Object vmOption = getVMOptionMethod.invoke(hotSpotBean, "ObjectAlignmentInBytes"); - objectAlignment = - Integer.parseInt( - vmOption.getClass().getMethod("getValue").invoke(vmOption).toString()); - } catch (@SuppressWarnings("unused") ReflectiveOperationException | RuntimeException e) { - isHotspot = false; - } - } - } catch (@SuppressWarnings("unused") ReflectiveOperationException | RuntimeException e) { - isHotspot = false; - final Logger log = Logger.getLogger(RamUsageEstimator.class.getName()); - final Module module = RamUsageEstimator.class.getModule(); - final ModuleLayer layer = module.getLayer(); - // classpath / unnamed module has no layer, so we need to check: - if (layer != null - && layer.findModule("jdk.management").map(module::canRead).orElse(false) == false) { - log.warning( - "Lucene cannot correctly calculate object sizes on 64bit JVMs, unless the 'jdk.management' Java module " - + "is readable [please add 'jdk.management' to modular application either by command line or its module descriptor]"); - } else { - log.warning( - "Lucene cannot correctly calculate object sizes on 64bit JVMs that are not based on Hotspot or a compatible implementation."); - } - } - JVM_IS_HOTSPOT_64BIT = isHotspot; - COMPRESSED_REFS_ENABLED = compressedOops; - NUM_BYTES_OBJECT_ALIGNMENT = objectAlignment; + JVM_IS_HOTSPOT_64BIT = true; + COMPRESSED_REFS_ENABLED = + HotspotVMOptions.get("UseCompressedOops").map(Boolean::valueOf).orElse(false); + NUM_BYTES_OBJECT_ALIGNMENT = + HotspotVMOptions.get("ObjectAlignmentInBytes").map(Integer::valueOf).orElse(8); // reference size is 4, if we have compressed oops: NUM_BYTES_OBJECT_REF = COMPRESSED_REFS_ENABLED ? 
4 : 8; // "best guess" based on reference size: diff --git a/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java index 5b382c4c7c25..d4e8a50ef8f4 100644 --- a/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java +++ b/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java @@ -77,41 +77,9 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { VectorizationProvider.TESTS_FORCE_INTEGER_VECTORS || (isAMD64withoutAVX2 == false); } - private static final String MANAGEMENT_FACTORY_CLASS = "java.lang.management.ManagementFactory"; - private static final String HOTSPOT_BEAN_CLASS = "com.sun.management.HotSpotDiagnosticMXBean"; - - // best effort to see if FMA is fast (this is architecture-independent option) - private static boolean hasFastFMA() { - // on ARM cpus, FMA works fine but is a slight slowdown: don't use it. - if (Constants.OS_ARCH.equals("amd64") == false) { - return false; - } - try { - final Class beanClazz = Class.forName(HOTSPOT_BEAN_CLASS); - // we use reflection for this, because the management factory is not part - // of Java 8's compact profile: - final Object hotSpotBean = - Class.forName(MANAGEMENT_FACTORY_CLASS) - .getMethod("getPlatformMXBean", Class.class) - .invoke(null, beanClazz); - if (hotSpotBean != null) { - final var getVMOptionMethod = beanClazz.getMethod("getVMOption", String.class); - final Object vmOption = getVMOptionMethod.invoke(hotSpotBean, "UseFMA"); - return Boolean.parseBoolean( - vmOption.getClass().getMethod("getValue").invoke(vmOption).toString()); - } - return false; - } catch (@SuppressWarnings("unused") ReflectiveOperationException | RuntimeException e) { - return false; - } - } - - // true if we know FMA is supported, to deliver less error - static final boolean HAS_FAST_FMA = hasFastFMA(); - // the way FMA should work! if available use it, otherwise fall back to mul/add private static FloatVector fma(FloatVector a, FloatVector b, FloatVector c) { - if (HAS_FAST_FMA) { + if (Constants.HAS_FAST_FMA) { return a.fma(b, c); } else { return a.mul(b).add(c); diff --git a/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java b/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java index fc303a687a07..ffd18df1a270 100644 --- a/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java +++ b/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java @@ -21,6 +21,7 @@ import java.util.Locale; import java.util.logging.Logger; import jdk.incubator.vector.FloatVector; +import org.apache.lucene.util.Constants; import org.apache.lucene.util.SuppressForbidden; /** A vectorization provider that leverages the Panama Vector API. */ @@ -62,7 +63,7 @@ private static T doPrivileged(PrivilegedAction action) { Locale.ENGLISH, "Java vector incubator API enabled; uses preferredBitSize=%d%s%s", PanamaVectorUtilSupport.VECTOR_BITSIZE, - PanamaVectorUtilSupport.HAS_FAST_FMA ? "; FMA enabled" : "", + Constants.HAS_FAST_FMA ? "; FMA enabled" : "", PanamaVectorUtilSupport.HAS_FAST_INTEGER_VECTORS ? 
"" : "; floating-point vectors only")); From 5ef651fc4c5cd669e4a69f255ed0dbee4edc93f9 Mon Sep 17 00:00:00 2001 From: Chris Hegarty <62058229+ChrisHegarty@users.noreply.github.com> Date: Fri, 3 Nov 2023 20:02:22 +0000 Subject: [PATCH 19/19] Replace usage of deprecated java.net.URL constructor with URI (#12755) This commit replaces the usage of the deprecated java.net.URL constructor with URI, later converting toURL where necessary to interoperate with the URLConnection API. --- .../standard/GenerateJflexTLDMacros.java | 3 ++- .../analysis/icu/GenerateUTR30DataFiles.java | 17 +++++++++-------- .../lucene/luke/app/desktop/util/URLLabel.java | 5 +++-- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/lucene/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java b/lucene/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java index 24646e31a411..c205a9eea3f5 100644 --- a/lucene/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java +++ b/lucene/analysis/common/src/tools/java/org/apache/lucene/analysis/standard/GenerateJflexTLDMacros.java @@ -21,6 +21,7 @@ import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.Writer; +import java.net.URI; import java.net.URL; import java.net.URLConnection; import java.nio.charset.StandardCharsets; @@ -111,7 +112,7 @@ public static void main(String... args) throws Exception { public GenerateJflexTLDMacros(String tldFileURL, String jflexFile, String tldListFile) throws Exception { - this.tldFileURL = new URL(tldFileURL); + this.tldFileURL = URI.create(tldFileURL).toURL(); this.jflexMacroFile = Paths.get(jflexFile); this.tldListFile = Paths.get(tldListFile); } diff --git a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java index c48ed42d5a74..6de1d6078355 100644 --- a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java +++ b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java @@ -27,6 +27,7 @@ import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; +import java.net.URI; import java.net.URL; import java.net.URLConnection; import java.nio.charset.StandardCharsets; @@ -155,19 +156,19 @@ private static void expandDataFileRules(Path file) throws IOException { } private static void getNFKCDataFilesFromIcuProject(String releaseTag) throws IOException { - URL icuTagsURL = new URL(ICU_GIT_TAG_URL + "/"); - URL icuReleaseTagURL = new URL(icuTagsURL, releaseTag + "/"); - URL norm2url = new URL(icuReleaseTagURL, ICU_DATA_NORM2_PATH + "/"); + URI icuTagsURI = URI.create(ICU_GIT_TAG_URL + "/"); + URI icuReleaseTagURI = icuTagsURI.resolve(releaseTag + "/"); + URI norm2uri = icuReleaseTagURI.resolve(ICU_DATA_NORM2_PATH + "/"); System.err.print("Downloading " + NFKC_TXT + " ... "); - download(new URL(norm2url, NFKC_TXT), NFKC_TXT); + download(norm2uri.resolve(NFKC_TXT), NFKC_TXT); System.err.println("done."); System.err.print("Downloading " + NFKC_CF_TXT + " ... "); - download(new URL(norm2url, NFKC_CF_TXT), NFKC_CF_TXT); + download(norm2uri.resolve(NFKC_CF_TXT), NFKC_CF_TXT); System.err.println("done."); System.err.print("Downloading " + NFKC_CF_TXT + " and making diacritic rules one-way ... 
"); - URLConnection connection = openConnection(new URL(norm2url, NFC_TXT)); + URLConnection connection = openConnection(norm2uri.resolve(NFC_TXT).toURL()); try (BufferedReader reader = new BufferedReader( new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8)); @@ -210,8 +211,8 @@ private static void getNFKCDataFilesFromIcuProject(String releaseTag) throws IOE System.err.println("done."); } - private static void download(URL url, String outputFile) throws IOException { - final URLConnection connection = openConnection(url); + private static void download(URI uri, String outputFile) throws IOException { + final URLConnection connection = openConnection(uri.toURL()); try (InputStream inputStream = connection.getInputStream(); OutputStream outputStream = Files.newOutputStream(Path.of(outputFile))) { inputStream.transferTo(outputStream); diff --git a/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/util/URLLabel.java b/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/util/URLLabel.java index 51e188e9af52..61de291d866e 100644 --- a/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/util/URLLabel.java +++ b/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/util/URLLabel.java @@ -23,6 +23,7 @@ import java.awt.event.MouseEvent; import java.io.IOException; import java.net.MalformedURLException; +import java.net.URI; import java.net.URISyntaxException; import java.net.URL; import javax.swing.JLabel; @@ -37,8 +38,8 @@ public URLLabel(String text) { super(text); try { - this.link = new URL(text); - } catch (MalformedURLException e) { + this.link = (new URI(text)).toURL(); + } catch (URISyntaxException | MalformedURLException e) { throw new LukeException(e.getMessage(), e); }