diff --git a/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java b/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java index b8d6735c9089..34f046ffbe36 100644 --- a/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java +++ b/buildSrc/src/main/java/org/apache/lucene/gradle/datasets/ExtractReuters.java @@ -27,6 +27,7 @@ import java.nio.file.StandardCopyOption; import java.util.regex.Matcher; import java.util.regex.Pattern; +import java.util.stream.Stream; /** * Split the Reuters SGML documents into Simple Text files containing: @@ -44,9 +45,10 @@ public ExtractReuters(Path reutersDir, Path outputDir) throws IOException { public void extract() throws IOException { long count = 0; Files.createDirectories(outputDir); - - if (Files.list(outputDir).count() > 0) { - throw new IOException("The output directory must be empty: " + outputDir); + try(Stream files = Files.list(outputDir)) { + if (files.count() > 0) { + throw new IOException("The output directory must be empty: " + outputDir); + } } try (DirectoryStream stream = Files.newDirectoryStream(reutersDir, "*.sgm")) { diff --git a/dev-tools/scripts/releaseWizard.py b/dev-tools/scripts/releaseWizard.py index b57eefb50d9a..2fe72a65f2c7 100755 --- a/dev-tools/scripts/releaseWizard.py +++ b/dev-tools/scripts/releaseWizard.py @@ -63,7 +63,6 @@ import scriptutil from consolemenu import ConsoleMenu from consolemenu.items import FunctionItem, SubmenuItem, ExitItem -from consolemenu.screen import Screen from scriptutil import BranchType, Version, download, run # Lucene-to-Java version mapping @@ -654,8 +653,8 @@ def get_title(self): return "%s%s (%d/%d)" % (prefix, self.title, self.num_done(), self.num_applies()) def get_submenu(self): - menu = UpdatableConsoleMenu(title=self.title, subtitle=self.get_subtitle, prologue_text=self.get_description(), - screen=MyScreen()) + menu = ConsoleMenu(title=self.title, subtitle=self.get_subtitle, 
prologue_text=self.get_description(), + clear_screen=False) menu.exit_item = CustomExitItem("Return") for todo in self.get_todos(): if todo.applies(state.release_type): @@ -663,7 +662,7 @@ def get_submenu(self): return menu def get_menu_item(self): - item = UpdatableSubmenuItem(self.get_title, self.get_submenu()) + item = SubmenuItem(self.get_title, self.get_submenu()) return item def get_todos(self): @@ -820,7 +819,7 @@ def display_and_confirm(self): print("ERROR while executing todo %s (%s)" % (self.get_title(), e)) def get_menu_item(self): - return UpdatableFunctionItem(self.get_title, self.display_and_confirm) + return FunctionItem(self.get_title, self.display_and_confirm) def clone(self): clone = Todo(self.id, self.title, description=self.description) @@ -1234,104 +1233,6 @@ def pause(fun=None): input("\nPress ENTER to continue...") -# Custom classes for ConsoleMenu, to make menu texts dynamic -# Needed until https://github.com/aegirhall/console-menu/pull/25 is released -# See https://pypi.org/project/console-menu/ for other docs - -class UpdatableConsoleMenu(ConsoleMenu): - - def __repr__(self): - return "%s: %s. %d items" % (self.get_title(), self.get_subtitle(), len(self.items)) - - def draw(self): - """ - Refreshes the screen and redraws the menu. Should be called whenever something changes that needs to be redrawn. 
- """ - self.screen.printf(self.formatter.format(title=self.get_title(), subtitle=self.get_subtitle(), items=self.items, - prologue_text=self.get_prologue_text(), epilogue_text=self.get_epilogue_text())) - - # Getters to get text in case method reference - def get_title(self): - return self.title() if callable(self.title) else self.title - - def get_subtitle(self): - return self.subtitle() if callable(self.subtitle) else self.subtitle - - def get_prologue_text(self): - return self.prologue_text() if callable(self.prologue_text) else self.prologue_text - - def get_epilogue_text(self): - return self.epilogue_text() if callable(self.epilogue_text) else self.epilogue_text - - -class UpdatableSubmenuItem(SubmenuItem): - def __init__(self, text, submenu, menu=None, should_exit=False): - """ - :ivar ConsoleMenu self.submenu: The submenu to be opened when this item is selected - """ - super(UpdatableSubmenuItem, self).__init__(text=text, menu=menu, should_exit=should_exit, submenu=submenu) - - if menu: - self.get_submenu().parent = menu - - def show(self, index): - return "%2d - %s" % (index + 1, self.get_text()) - - # Getters to get text in case method reference - def get_text(self): - return self.text() if callable(self.text) else self.text - - def set_menu(self, menu): - """ - Sets the menu of this item. - Should be used instead of directly accessing the menu attribute for this class. 
- - :param ConsoleMenu menu: the menu - """ - self.menu = menu - self.get_submenu().parent = menu - - def action(self): - """ - This class overrides this method - """ - self.get_submenu().start() - - def clean_up(self): - """ - This class overrides this method - """ - self.get_submenu().join() - self.menu.clear_screen() - self.menu.resume() - - def get_return(self): - """ - :return: The returned value in the submenu - """ - return self.get_submenu().returned_value - - def get_submenu(self): - """ - We unwrap the submenu variable in case it is a reference to a method that returns a submenu - """ - return self.submenu if not callable(self.submenu) else self.submenu() - - -class UpdatableFunctionItem(FunctionItem): - def show(self, index): - return "%2d - %s" % (index + 1, self.get_text()) - - # Getters to get text in case method reference - def get_text(self): - return self.text() if callable(self.text) else self.text - - -class MyScreen(Screen): - def clear(self): - return - - class CustomExitItem(ExitItem): def show(self, index): return super(CustomExitItem, self).show(index) @@ -1346,6 +1247,13 @@ def main(): global templates print("Lucene releaseWizard v%s" % getScriptVersion()) + + try: + ConsoleMenu(clear_screen=True) + except Exception as e: + sys.exit("You need to install 'consolemenu' package version 0.7.1 for the Wizard to function. Please run 'pip " + "install -r requirements.txt'") + c = parse_config() if c.dry: @@ -1402,18 +1310,18 @@ def main(): lucene_news_file = os.path.join(state.get_website_git_folder(), 'content', 'core', 'core_news', "%s-%s-available.md" % (state.get_release_date_iso(), state.release_version.replace(".", "-"))) - main_menu = UpdatableConsoleMenu(title="Lucene ReleaseWizard", + main_menu = ConsoleMenu(title="Lucene ReleaseWizard", subtitle=get_releasing_text, prologue_text="Welcome to the release wizard. From here you can manage the process including creating new RCs. 
" "All changes are persisted, so you can exit any time and continue later. Make sure to read the Help section.", epilogue_text="® 2022 The Lucene project. Licensed under the Apache License 2.0\nScript version v%s)" % getScriptVersion(), - screen=MyScreen()) + clear_screen=False) - todo_menu = UpdatableConsoleMenu(title=get_releasing_text, + todo_menu = ConsoleMenu(title=get_releasing_text, subtitle=get_subtitle, prologue_text=None, epilogue_text=None, - screen=MyScreen()) + clear_screen=False) todo_menu.exit_item = CustomExitItem("Return") for todo_group in state.todo_groups: @@ -1422,14 +1330,14 @@ def main(): menu_item.set_menu(todo_menu) todo_menu.append_item(menu_item) - main_menu.append_item(UpdatableSubmenuItem(get_todo_menuitem_title, todo_menu, menu=main_menu)) - main_menu.append_item(UpdatableFunctionItem(get_start_new_rc_menu_title, start_new_rc)) - main_menu.append_item(UpdatableFunctionItem('Clear and restart current RC', state.clear_rc)) - main_menu.append_item(UpdatableFunctionItem("Clear all state, restart the %s release" % state.release_version, reset_state)) - main_menu.append_item(UpdatableFunctionItem('Start release for a different version', release_other_version)) - main_menu.append_item(UpdatableFunctionItem('Generate Asciidoc guide for this release', generate_asciidoc)) - # main_menu.append_item(UpdatableFunctionItem('Dump YAML', dump_yaml)) - main_menu.append_item(UpdatableFunctionItem('Help', help)) + main_menu.append_item(SubmenuItem(get_todo_menuitem_title, todo_menu, menu=main_menu)) + main_menu.append_item(FunctionItem(get_start_new_rc_menu_title, start_new_rc)) + main_menu.append_item(FunctionItem('Clear and restart current RC', state.clear_rc)) + main_menu.append_item(FunctionItem("Clear all state, restart the %s release" % state.release_version, reset_state)) + main_menu.append_item(FunctionItem('Start release for a different version', release_other_version)) + main_menu.append_item(FunctionItem('Generate Asciidoc guide for this 
release', generate_asciidoc)) + # main_menu.append_item(FunctionItem('Dump YAML', dump_yaml)) + main_menu.append_item(FunctionItem('Help', help)) main_menu.show() diff --git a/dev-tools/scripts/releaseWizard.yaml b/dev-tools/scripts/releaseWizard.yaml index a25407c4e275..ec7f61774302 100644 --- a/dev-tools/scripts/releaseWizard.yaml +++ b/dev-tools/scripts/releaseWizard.yaml @@ -521,7 +521,7 @@ groups: addition wait a couple more days? Merges of bug fixes into the branch may become more difficult. * Only Github issues with Milestone {{ release_version_major }}.{{ release_version_minor }} - and priority "Blocker" will delay a release candidate build. + will delay a release candidate build. ---- types: - major @@ -979,8 +979,8 @@ groups: title: Publish docs, changes and javadocs description: | Ensure your refrigerator has at least 2 beers - the svn import operation can take a while, - depending on your upload bandwidth. We'll publish this directly to the production tree. - At the end of the task, the two links below shall work. + depending on your upload bandwidth. We'll publish this directly to the production tree. At + the end of the task, the two links below shall work. links: - http://lucene.apache.org/core/{{ version }} vars: @@ -1126,12 +1126,18 @@ groups: comment: Push all changes logfile: push-website.log post_description: | - Wait a few minutes for the build to happen. You can follow the site build at https://ci2.apache.org/#/builders/3 - and view the staged site at https://lucene.staged.apache.org - Verify that correct links and versions are mentioned in download pages, download buttons etc. - If you find anything wrong, then commit and push any changes and check again. - - Next step is to merge the changes to branch 'production' in order to publish the site. + Wait a few minutes for the build to happen. 
You can follow the site build at + https://ci2.apache.org/#/builders/3 and view the staged site at + https://lucene.staged.apache.org Verify that correct links and versions are mentioned in + download pages, download buttons etc. If you find anything wrong, then commit and push any + changes and check again. You may find that the publish fails, leaving a directory listing + instead of a beautiful website. If this happens, check the "builder" link and click through into + its details to find possible error messages produced by the website publication process. You + may have produced malformed Markdown. Or the website publish may just fail for some reason out + of your control. If this happens, you can attempt to retrigger the publishing with some + innocuous changes. Next step is to merge the changes to branch 'production' in order to + publish the site. Before doing this, you may want to replenish your stock of beers, or get + stronger stuff. links: - https://ci2.apache.org/#/builders/3 - https://lucene.staged.apache.org @@ -1159,7 +1165,8 @@ groups: post_description: | Wait a few minutes for the build to happen. You can follow the site build at https://ci2.apache.org/#/builders/3 - Verify on https://lucene.apache.org that the site is OK. + Verify on https://lucene.apache.org that the site is OK. It really should be, but see staging + site publication instructions for possible debugging/recovery options if it is not. 
You can now also verify that http://lucene.apache.org/core/api/core/ redirects to the latest version links: diff --git a/dev-tools/scripts/requirements.txt b/dev-tools/scripts/requirements.txt index b8a124b8f828..0617ad153f54 100644 --- a/dev-tools/scripts/requirements.txt +++ b/dev-tools/scripts/requirements.txt @@ -1,8 +1,8 @@ -six>=1.11.0 -Jinja2>=2.10.1 -PyYAML>=5.1 -holidays>=0.9.10 -ics>=0.4 -console-menu>=0.5.1 -PyGithub -jira \ No newline at end of file +six~=1.16.0 +Jinja2~=3.1.1 +PyYAML~=6.0 +holidays~=0.16 +ics~=0.7.2 +console-menu~=0.7.1 +PyGithub~=1.56 +jira~=3.4.1 \ No newline at end of file diff --git a/gradle/java/modules.gradle b/gradle/java/modules.gradle index f9ebac3d345b..cb8f7c8df34c 100644 --- a/gradle/java/modules.gradle +++ b/gradle/java/modules.gradle @@ -67,6 +67,12 @@ allprojects { tasks.named(sourceSet.getCompileJavaTaskName()).configure({ JavaCompile task -> task.dependsOn modularPaths.compileModulePathConfiguration + // GH-12742: add the modular path as inputs so that if anything changes, the task + // is not up to date and is re-run. I [dw] believe this should be a @Classpath parameter + // on the task itself... but I don't know how to implement this on an existing class. + // this is a workaround but should work just fine though. + task.inputs.files(modularPaths.compileModulePathConfiguration) + // LUCENE-10327: don't allow gradle to emit an empty sourcepath as it would break // compilation of modules. task.options.setSourcepath(sourceSet.java.sourceDirectories) diff --git a/gradle/testing/randomization.gradle b/gradle/testing/randomization.gradle index 30aad60e9e0f..1b56044e071e 100644 --- a/gradle/testing/randomization.gradle +++ b/gradle/testing/randomization.gradle @@ -67,7 +67,7 @@ allprojects { // seed, repetition and amplification. 
[propName: 'tests.seed', value: { -> rootSeed }, description: "Sets the master randomization seed."], [propName: 'tests.iters', value: null, description: "Duplicate (re-run) each test case N times."], - [propName: 'tests.multiplier', value: 1, description: "Value multiplier for randomized tests."], + [propName: 'tests.multiplier', value: null, description: "Value multiplier for randomized tests."], [propName: 'tests.maxfailures', value: null, description: "Skip tests after a given number of failures."], [propName: 'tests.timeoutSuite', value: null, description: "Timeout (in millis) for an entire suite."], [propName: 'tests.failfast', value: "false", description: "Stop the build early on failure.", buildOnly: true], diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 6a92ae5c6e20..5fae1e4b88f0 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -62,9 +62,11 @@ API Changes * GITHUB#12599: Add RandomAccessInput#readBytes method to the RandomAccessInput interface. (Ignacio Vera) -* GITHUB#12709 Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods +* GITHUB#12709: Consolidate FSTStore and BytesStore in FST. Created FSTReader which contains the common methods of the two (Anh Dung Bui) +* GITHUB#12735: Remove FSTCompiler#getTermCount() and FSTCompiler.UnCompiledNode#inputCount (Anh Dung Bui) + New Features --------------------- @@ -208,6 +210,9 @@ Improvements * GITHUB#12689: TaskExecutor to cancel all tasks on exception to avoid needless computation. (Luca Cavanna) +* GITHUB#12754: Refactor lookup of Hotspot VM options and do not initialize constants with NULL + if SecurityManager prevents access. (Uwe Schindler) + Optimizations --------------------- * GITHUB#12183: Make TermStates#build concurrent. (Shubham Chaudhary) @@ -251,6 +256,11 @@ Optimizations * GITHUB#12719: Top-level conjunctions that are not sorted by score now have a specialized bulk scorer. 
(Adrien Grand) +* GITHUB#1052: Faster merging of terms enums. (Adrien Grand) + +* GITHUB#11903: Faster sort on high-cardinality string fields. (Adrien Grand) + + Changes in runtime behavior --------------------- @@ -278,7 +288,14 @@ Bug Fixes Build --------------------- +* GITHUB#12752: tests.multiplier could be omitted in test failure reproduce lines (esp. in + nightly mode). (Dawid Weiss) + +* GITHUB#12742: JavaCompile tasks may be in up-to-date state when modular dependencies have changed + leading to odd runtime errors (Chris Hostetter, Dawid Weiss) + * GITHUB#12612: Upgrade forbiddenapis to version 3.6 and ASM for APIJAR extraction to 9.6. (Uwe Schindler) + * GITHUB#12655: Upgrade to Gradle 8.4 (Kevin Risden) Other diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/CSVUtil.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CSVUtil.java similarity index 94% rename from lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/CSVUtil.java rename to lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CSVUtil.java index e3662f291ca2..36d6e0560e77 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/CSVUtil.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/util/CSVUtil.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.analysis.ja.dict; +package org.apache.lucene.analysis.util; import java.util.ArrayList; import java.util.regex.Matcher; @@ -69,7 +69,7 @@ public static String[] parse(String line) { return new String[0]; } - return result.toArray(new String[result.size()]); + return result.toArray(new String[0]); } private static String unQuoteUnEscape(String original) { @@ -83,7 +83,7 @@ private static String unQuoteUnEscape(String original) { } // Unescape - if (result.indexOf(ESCAPED_QUOTE) >= 0) { + if (result.contains(ESCAPED_QUOTE)) { result = result.replace(ESCAPED_QUOTE, "\""); } } diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestCSVUtil.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCSVUtil.java similarity index 95% rename from lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestCSVUtil.java rename to lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCSVUtil.java index 8cc6fb66e5aa..85901ca0e46a 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/TestCSVUtil.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/util/TestCSVUtil.java @@ -14,10 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.analysis.ja; +package org.apache.lucene.analysis.util; import java.io.IOException; -import org.apache.lucene.analysis.ja.dict.CSVUtil; import org.apache.lucene.tests.util.LuceneTestCase; /* diff --git a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java index 1735643bf54b..6de1d6078355 100644 --- a/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java +++ b/lucene/analysis/icu/src/tools/java/org/apache/lucene/analysis/icu/GenerateUTR30DataFiles.java @@ -156,19 +156,19 @@ private static void expandDataFileRules(Path file) throws IOException { } private static void getNFKCDataFilesFromIcuProject(String releaseTag) throws IOException { - URI icuTagsURL = URI.create(ICU_GIT_TAG_URL + "/"); - URI icuReleaseTagURL = icuTagsURL.resolve(releaseTag + "/"); - URI norm2url = icuReleaseTagURL.resolve(ICU_DATA_NORM2_PATH + "/"); + URI icuTagsURI = URI.create(ICU_GIT_TAG_URL + "/"); + URI icuReleaseTagURI = icuTagsURI.resolve(releaseTag + "/"); + URI norm2uri = icuReleaseTagURI.resolve(ICU_DATA_NORM2_PATH + "/"); System.err.print("Downloading " + NFKC_TXT + " ... "); - download(norm2url.resolve(NFKC_TXT), NFKC_TXT); + download(norm2uri.resolve(NFKC_TXT), NFKC_TXT); System.err.println("done."); System.err.print("Downloading " + NFKC_CF_TXT + " ... "); - download(norm2url.resolve(NFKC_CF_TXT), NFKC_CF_TXT); + download(norm2uri.resolve(NFKC_CF_TXT), NFKC_CF_TXT); System.err.println("done."); System.err.print("Downloading " + NFKC_CF_TXT + " and making diacritic rules one-way ... 
"); - URLConnection connection = openConnection(norm2url.resolve(NFC_TXT).toURL()); + URLConnection connection = openConnection(norm2uri.resolve(NFC_TXT).toURL()); try (BufferedReader reader = new BufferedReader( new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8)); diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java index 80b1cef6c327..5a16db673ced 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryBuilder.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java index e5270b32844c..4bdfe5095a6c 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/TokenInfoDictionaryEntryWriter.java @@ -20,6 +20,7 @@ import java.io.OutputStream; import java.nio.ByteBuffer; import org.apache.lucene.analysis.morph.DictionaryEntryWriter; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.ArrayUtil; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java 
b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java index a367c49ca4da..ba5bc0e6a058 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UnknownDictionaryBuilder.java @@ -25,6 +25,7 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.List; +import org.apache.lucene.analysis.util.CSVUtil; class UnknownDictionaryBuilder { private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*"; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java index 52604c4e1952..de69c726ee2a 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserDictionary.java @@ -26,6 +26,7 @@ import java.util.Map; import java.util.TreeMap; import org.apache.lucene.analysis.morph.Dictionary; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; diff --git a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserMorphData.java b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserMorphData.java index be895f1268a6..6bc4dc72d28a 100644 --- a/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserMorphData.java +++ b/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/dict/UserMorphData.java @@ -19,6 +19,8 @@ import static org.apache.lucene.analysis.ja.dict.UserDictionary.CUSTOM_DICTIONARY_WORD_ID_OFFSET; import static org.apache.lucene.analysis.ja.dict.UserDictionary.INTERNAL_SEPARATOR; +import 
org.apache.lucene.analysis.util.CSVUtil; + /** Morphological information for user dictionary. */ final class UserMorphData implements JaMorphData { public static final int WORD_COST = -100000; diff --git a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java index 5ccdaa6b926c..2d245c7a599c 100644 --- a/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java +++ b/lucene/analysis/kuromoji/src/test/org/apache/lucene/analysis/ja/dict/TestUnknownDictionary.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.analysis.ja.dict; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.tests.util.LuceneTestCase; import org.junit.Test; diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CSVUtil.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CSVUtil.java deleted file mode 100644 index b9e3ff9483bc..000000000000 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/CSVUtil.java +++ /dev/null @@ -1,93 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.lucene.analysis.ko.dict; - -import java.util.ArrayList; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -/** Utility class for parsing CSV text */ -public final class CSVUtil { - private static final char QUOTE = '"'; - - private static final char COMMA = ','; - - private static final Pattern QUOTE_REPLACE_PATTERN = Pattern.compile("^\"([^\"]+)\"$"); - - private static final String ESCAPED_QUOTE = "\"\""; - - private CSVUtil() {} // no instance!!! - - /** - * Parse CSV line - * - * @param line line containing csv-encoded data - * @return Array of values - */ - public static String[] parse(String line) { - boolean insideQuote = false; - ArrayList result = new ArrayList<>(); - int quoteCount = 0; - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < line.length(); i++) { - char c = line.charAt(i); - - if (c == QUOTE) { - insideQuote = !insideQuote; - quoteCount++; - } - - if (c == COMMA && !insideQuote) { - String value = sb.toString(); - value = unQuoteUnEscape(value); - result.add(value); - sb.setLength(0); - continue; - } - - sb.append(c); - } - - result.add(sb.toString()); - - // Validate - if (quoteCount % 2 != 0) { - return new String[0]; - } - - return result.toArray(new String[0]); - } - - private static String unQuoteUnEscape(String original) { - String result = original; - - // Unquote - if (result.indexOf('\"') >= 0) { - Matcher m = QUOTE_REPLACE_PATTERN.matcher(original); - if (m.matches()) { - result = m.group(1); - } - - // Unescape - if (result.contains(ESCAPED_QUOTE)) { - result = result.replace(ESCAPED_QUOTE, "\""); - } - } - - return result; - } -} diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryBuilder.java index 3726f9e6673b..e3db26b08b82 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryBuilder.java 
+++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryBuilder.java @@ -28,6 +28,7 @@ import java.util.List; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.util.IntsRefBuilder; import org.apache.lucene.util.fst.FST; import org.apache.lucene.util.fst.FSTCompiler; diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryEntryWriter.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryEntryWriter.java index f7ee696a1970..95ce0277a9d5 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryEntryWriter.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/TokenInfoDictionaryEntryWriter.java @@ -24,6 +24,7 @@ import java.util.List; import org.apache.lucene.analysis.ko.POS; import org.apache.lucene.analysis.morph.DictionaryEntryWriter; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.ArrayUtil; diff --git a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionaryBuilder.java b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionaryBuilder.java index 1004ab89581d..71099b2f0737 100644 --- a/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionaryBuilder.java +++ b/lucene/analysis/nori/src/java/org/apache/lucene/analysis/ko/dict/UnknownDictionaryBuilder.java @@ -25,6 +25,7 @@ import java.util.ArrayList; import java.util.Comparator; import java.util.List; +import org.apache.lucene.analysis.util.CSVUtil; class UnknownDictionaryBuilder { private static final String NGRAM_DICTIONARY_ENTRY = "NGRAM,1801,3559,3677,SY,*,*,*,*,*,*,*"; diff --git a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUnknownDictionary.java 
b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUnknownDictionary.java index dbce890deda1..13190b21a73a 100644 --- a/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUnknownDictionary.java +++ b/lucene/analysis/nori/src/test/org/apache/lucene/analysis/ko/dict/TestUnknownDictionary.java @@ -16,6 +16,7 @@ */ package org.apache.lucene.analysis.ko.dict; +import org.apache.lucene.analysis.util.CSVUtil; import org.apache.lucene.tests.util.LuceneTestCase; import org.junit.Test; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene84/PForUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene84/PForUtil.java index 4e99d3a9f5cf..690bfa501f4b 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene84/PForUtil.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene84/PForUtil.java @@ -92,7 +92,7 @@ void encode(long[] longs, DataOutput out) throws IOException { out.writeBytes(exceptions, exceptions.length); } - /** Decode 128 integers into {@code ints}. */ + /** Decode 128 integers into {@code longs}. 
*/ void decode(DataInput in, long[] longs) throws IOException { final int token = Byte.toUnsignedInt(in.readByte()); final int bitsPerValue = token & 0x1f; diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java index b130bc3422fb..0ba817d410c8 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/VectorUtilBenchmark.java @@ -24,8 +24,14 @@ @BenchmarkMode(Mode.Throughput) @OutputTimeUnit(TimeUnit.MICROSECONDS) @State(Scope.Benchmark) -@Warmup(iterations = 3, time = 3) -@Measurement(iterations = 5, time = 3) +// first iteration is complete garbage, so make sure we really warmup +@Warmup(iterations = 4, time = 1) +// real iterations. not useful to spend tons of time here, better to fork more +@Measurement(iterations = 5, time = 1) +// engage some noise reduction +@Fork( + value = 3, + jvmArgsAppend = {"-Xmx2g", "-Xms2g", "-XX:+AlwaysPreTouch"}) public class VectorUtilBenchmark { private byte[] bytesA; @@ -36,7 +42,7 @@ public class VectorUtilBenchmark { @Param({"1", "128", "207", "256", "300", "512", "702", "1024"}) int size; - @Setup(Level.Trial) + @Setup(Level.Iteration) public void init() { ThreadLocalRandom random = ThreadLocalRandom.current(); @@ -56,84 +62,72 @@ public void init() { } @Benchmark - @Fork(value = 1) public float binaryCosineScalar() { return VectorUtil.cosine(bytesA, bytesB); } @Benchmark - @Fork( - value = 1, - jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) + @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public float binaryCosineVector() { return VectorUtil.cosine(bytesA, bytesB); } @Benchmark - @Fork(value = 1) public int binaryDotProductScalar() { return VectorUtil.dotProduct(bytesA, bytesB); } @Benchmark - @Fork( - value = 1, - jvmArgsPrepend = 
{"--add-modules=jdk.incubator.vector"}) + @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public int binaryDotProductVector() { return VectorUtil.dotProduct(bytesA, bytesB); } @Benchmark - @Fork(value = 1) public int binarySquareScalar() { return VectorUtil.squareDistance(bytesA, bytesB); } @Benchmark - @Fork( - value = 1, - jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) + @Fork(jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public int binarySquareVector() { return VectorUtil.squareDistance(bytesA, bytesB); } @Benchmark - @Fork(value = 1) public float floatCosineScalar() { return VectorUtil.cosine(floatsA, floatsB); } @Benchmark @Fork( - value = 1, + value = 15, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public float floatCosineVector() { return VectorUtil.cosine(floatsA, floatsB); } @Benchmark - @Fork(value = 1) public float floatDotProductScalar() { return VectorUtil.dotProduct(floatsA, floatsB); } @Benchmark @Fork( - value = 1, + value = 15, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public float floatDotProductVector() { return VectorUtil.dotProduct(floatsA, floatsB); } @Benchmark - @Fork(value = 1) public float floatSquareScalar() { return VectorUtil.squareDistance(floatsA, floatsB); } @Benchmark @Fork( - value = 1, + value = 15, jvmArgsPrepend = {"--add-modules=jdk.incubator.vector"}) public float floatSquareVector() { return VectorUtil.squareDistance(floatsA, floatsB); diff --git a/lucene/benchmark/conf/analyzer.alg b/lucene/benchmark/conf/analyzer.alg index 497ec3d216d8..4ed777915bd6 100644 --- a/lucene/benchmark/conf/analyzer.alg +++ b/lucene/benchmark/conf/analyzer.alg @@ -32,8 +32,8 @@ doc.tokenized=true doc.term.vector=false log.step=500 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/collector-small.alg b/lucene/benchmark/conf/collector-small.alg 
index 763cb0454ad8..e57ee8646b11 100644 --- a/lucene/benchmark/conf/collector-small.alg +++ b/lucene/benchmark/conf/collector-small.alg @@ -21,7 +21,7 @@ # Fully Qualified Class Name of a Collector with a empty constructor # topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs # topScoreDocUnordered - Like above, but allows out of order -collector.class=coll:topScoreDocOrdered:topScoreDocUnordered:topScoreDocOrdered:topScoreDocUnordered +collector.class=coll:topScoreDoc analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer directory=FSDirectory diff --git a/lucene/benchmark/conf/collector.alg b/lucene/benchmark/conf/collector.alg index d85582a7ba29..e2843492dcab 100644 --- a/lucene/benchmark/conf/collector.alg +++ b/lucene/benchmark/conf/collector.alg @@ -21,7 +21,7 @@ # Fully Qualified Class Name of a Collector with a empty constructor # topScoreDocOrdered - Creates a TopScoreDocCollector that requires in order docs # topScoreDocUnordered - Like above, but allows out of order -collector.class=coll:topScoreDocOrdered:topScoreDocUnordered:topScoreDocOrdered:topScoreDocUnordered +collector.class=coll:topScoreDoc analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer directory=FSDirectory diff --git a/lucene/benchmark/conf/compound-penalty.alg b/lucene/benchmark/conf/compound-penalty.alg index 06b2821f04b9..8626baa571a2 100644 --- a/lucene/benchmark/conf/compound-penalty.alg +++ b/lucene/benchmark/conf/compound-penalty.alg @@ -37,8 +37,8 @@ doc.term.vector=vector:true:true:false:false log.step=500 log.step.DeleteDoc=100 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/english-porter-comparison.alg b/lucene/benchmark/conf/english-porter-comparison.alg index e83f04a8dae2..e391c0b0d8d8 100644 --- a/lucene/benchmark/conf/english-porter-comparison.alg +++ 
b/lucene/benchmark/conf/english-porter-comparison.alg @@ -20,7 +20,8 @@ content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource doc.tokenized=false doc.body.tokenized=true -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 -AnalyzerFactory(name:original-porter-stemmer,StandardTokenizer, EnglishPossessiveFilter,LowerCaseFilter,StopFilter, diff --git a/lucene/benchmark/conf/facets.alg b/lucene/benchmark/conf/facets.alg index 63e7cac73748..32d7270e3b49 100644 --- a/lucene/benchmark/conf/facets.alg +++ b/lucene/benchmark/conf/facets.alg @@ -30,7 +30,8 @@ doc.tokenized=true doc.term.vector=false log.step=1000 -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/highlights.alg b/lucene/benchmark/conf/highlights.alg index 88b056ecee40..7c5fd7d73378 100644 --- a/lucene/benchmark/conf/highlights.alg +++ b/lucene/benchmark/conf/highlights.alg @@ -30,7 +30,8 @@ doc.term.vector.offsets=false doc.term.vector.positions=false log.step=2000 -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg b/lucene/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg index 43a6c91bbebb..d86e182a172f 100644 --- a/lucene/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg +++ b/lucene/benchmark/conf/indexing-flush-by-RAM-multithreaded.alg @@ -32,8 +32,8 @@ doc.tokenized=true doc.term.vector=false log.step=2000 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/indexing-flush-by-RAM.alg b/lucene/benchmark/conf/indexing-flush-by-RAM.alg index 0b6c79762ef5..0a911c940863 100644 
--- a/lucene/benchmark/conf/indexing-flush-by-RAM.alg +++ b/lucene/benchmark/conf/indexing-flush-by-RAM.alg @@ -32,8 +32,8 @@ doc.tokenized=true doc.term.vector=false log.step=2000 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/indexing-multithreaded.alg b/lucene/benchmark/conf/indexing-multithreaded.alg index 1d2e18e260dd..b34b8266178a 100644 --- a/lucene/benchmark/conf/indexing-multithreaded.alg +++ b/lucene/benchmark/conf/indexing-multithreaded.alg @@ -32,8 +32,8 @@ doc.tokenized=true doc.term.vector=false log.step=2000 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/indexing.alg b/lucene/benchmark/conf/indexing.alg index e31f87185b16..b4a4d92fc26c 100644 --- a/lucene/benchmark/conf/indexing.alg +++ b/lucene/benchmark/conf/indexing.alg @@ -32,8 +32,8 @@ doc.tokenized=true doc.term.vector=false log.step=2000 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/micro-standard-flush-by-ram.alg b/lucene/benchmark/conf/micro-standard-flush-by-ram.alg index 993e58a883d8..d4a22f12495e 100644 --- a/lucene/benchmark/conf/micro-standard-flush-by-ram.alg +++ b/lucene/benchmark/conf/micro-standard-flush-by-ram.alg @@ -31,8 +31,8 @@ doc.tokenized=true doc.term.vector=false log.step=500 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 
#content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/sample.alg b/lucene/benchmark/conf/sample.alg index 4f93230bfc78..aa63293de6c7 100644 --- a/lucene/benchmark/conf/sample.alg +++ b/lucene/benchmark/conf/sample.alg @@ -42,8 +42,8 @@ doc.tokenized=true doc.term.vector=false log.step=500 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource #content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/shingle.alg b/lucene/benchmark/conf/shingle.alg index b0744341c76a..67b513064a1e 100644 --- a/lucene/benchmark/conf/shingle.alg +++ b/lucene/benchmark/conf/shingle.alg @@ -16,7 +16,8 @@ content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource doc.tokenized=false doc.body.tokenized=true -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 log.step=1000 -AnalyzerFactory(name:shingle-bigrams-unigrams, diff --git a/lucene/benchmark/conf/sloppy-phrase.alg b/lucene/benchmark/conf/sloppy-phrase.alg index 4d06d6fdbe0f..4c49ddd59e61 100644 --- a/lucene/benchmark/conf/sloppy-phrase.alg +++ b/lucene/benchmark/conf/sloppy-phrase.alg @@ -30,7 +30,8 @@ doc.tokenized=true doc.term.vector=false log.step=500 -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 #docs.dir=reuters-111 content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource diff --git a/lucene/benchmark/conf/sort-standard.alg b/lucene/benchmark/conf/sort-standard.alg index 48cae964dbee..08c7b90b0cdc 100644 --- a/lucene/benchmark/conf/sort-standard.alg +++ b/lucene/benchmark/conf/sort-standard.alg @@ -31,7 +31,8 @@ doc.tokenized=true doc.term.vector=false log.step=100000 -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 
content.source=org.apache.lucene.benchmark.byTask.feeds.SortableSingleDocSource diff --git a/lucene/benchmark/conf/standard-flush-by-RAM.alg b/lucene/benchmark/conf/standard-flush-by-RAM.alg index 3ceed106fae7..c3cb2789b987 100644 --- a/lucene/benchmark/conf/standard-flush-by-RAM.alg +++ b/lucene/benchmark/conf/standard-flush-by-RAM.alg @@ -31,8 +31,8 @@ doc.tokenized=true doc.term.vector=false log.step=2000 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/standard.alg b/lucene/benchmark/conf/standard.alg index 4d0b0480ffe7..4885593954b7 100644 --- a/lucene/benchmark/conf/standard.alg +++ b/lucene/benchmark/conf/standard.alg @@ -31,8 +31,8 @@ doc.tokenized=true doc.term.vector=false log.step=2000 -docs.dir=reuters-out -#docs.dir=reuters-111 +work.dir=data +docs.dir=reuters21578 #content.source=org.apache.lucene.benchmark.byTask.feeds.SingleDocSource content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource diff --git a/lucene/benchmark/conf/wstok.alg b/lucene/benchmark/conf/wstok.alg index c43759032c3f..ab6a6593c5ce 100644 --- a/lucene/benchmark/conf/wstok.alg +++ b/lucene/benchmark/conf/wstok.alg @@ -18,7 +18,8 @@ content.source=org.apache.lucene.benchmark.byTask.feeds.ReutersContentSource doc.tokenized=false doc.body.tokenized=true -docs.dir=reuters-out +work.dir=data +docs.dir=reuters21578 -AnalyzerFactory(name:WhitespaceTokenizer, WhitespaceTokenizer(rule:java)) diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java index 2248756998e9..032019f1e4ed 100644 --- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java +++ 
b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/NewAnalyzerTask.java @@ -23,9 +23,9 @@ import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.benchmark.byTask.PerfRunData; import org.apache.lucene.benchmark.byTask.utils.AnalyzerFactory; -import org.apache.lucene.util.Version; /** * Create a new {@link org.apache.lucene.analysis.Analyzer} and set it in the getRunData() for use @@ -42,17 +42,13 @@ public NewAnalyzerTask(PerfRunData runData) { public static final Analyzer createAnalyzer(String className) throws Exception { final Class clazz = Class.forName(className).asSubclass(Analyzer.class); - try { - // first try to use a ctor with version parameter (needed for many new Analyzers that have no - // default one anymore - Constructor cnstr = clazz.getConstructor(Version.class); - return cnstr.newInstance(Version.LATEST); - } catch ( - @SuppressWarnings("unused") - NoSuchMethodException nsme) { - // otherwise use default ctor - return clazz.getConstructor().newInstance(); + Constructor cnstr; + if (className.equals("org.apache.lucene.analysis.core.StopAnalyzer")) { + cnstr = clazz.getConstructor(CharArraySet.class); + return cnstr.newInstance(CharArraySet.EMPTY_SET); } + cnstr = clazz.getConstructor(); + return cnstr.newInstance(); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java index eb735c84b83f..211912142a45 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene90/PForUtil.java @@ -116,7 +116,7 @@ void encode(long[] longs, DataOutput out) throws IOException { out.writeBytes(exceptions, exceptions.length); } - /** Decode 128 integers into {@code ints}. */ + /** Decode 128 integers into {@code longs}. 
*/ void decode(DataInput in, long[] longs) throws IOException { final int token = Byte.toUnsignedInt(in.readByte()); final int bitsPerValue = token & 0x1f; diff --git a/lucene/core/src/java/org/apache/lucene/index/MultiTerms.java b/lucene/core/src/java/org/apache/lucene/index/MultiTerms.java index f56989d5a622..7f61da627ec2 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiTerms.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiTerms.java @@ -136,17 +136,16 @@ public ReaderSlice[] getSubSlices() { @Override public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { - final List termsEnums = new ArrayList<>(); + final List termsEnums = new ArrayList<>(); for (int i = 0; i < subs.length; i++) { final TermsEnum termsEnum = subs[i].intersect(compiled, startTerm); if (termsEnum != null) { - termsEnums.add(new MultiTermsEnum.TermsEnumIndex(termsEnum, i)); + termsEnums.add(new TermsEnumIndex(termsEnum, i)); } } if (termsEnums.size() > 0) { - return new MultiTermsEnum(subSlices) - .reset(termsEnums.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY)); + return new MultiTermsEnum(subSlices).reset(termsEnums.toArray(TermsEnumIndex.EMPTY_ARRAY)); } else { return TermsEnum.EMPTY; } @@ -181,17 +180,16 @@ public BytesRef getMax() throws IOException { @Override public TermsEnum iterator() throws IOException { - final List termsEnums = new ArrayList<>(); + final List termsEnums = new ArrayList<>(); for (int i = 0; i < subs.length; i++) { final TermsEnum termsEnum = subs[i].iterator(); if (termsEnum != null) { - termsEnums.add(new MultiTermsEnum.TermsEnumIndex(termsEnum, i)); + termsEnums.add(new TermsEnumIndex(termsEnum, i)); } } if (termsEnums.size() > 0) { - return new MultiTermsEnum(subSlices) - .reset(termsEnums.toArray(MultiTermsEnum.TermsEnumIndex.EMPTY_ARRAY)); + return new MultiTermsEnum(subSlices).reset(termsEnums.toArray(TermsEnumIndex.EMPTY_ARRAY)); } else { return TermsEnum.EMPTY; } diff --git 
a/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java b/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java index b849b07cec90..f4cbb4cc1f5b 100644 --- a/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java +++ b/lucene/core/src/java/org/apache/lucene/index/MultiTermsEnum.java @@ -36,7 +36,7 @@ public final class MultiTermsEnum extends BaseTermsEnum { new Comparator() { @Override public int compare(TermsEnumWithSlice o1, TermsEnumWithSlice o2) { - return o1.index - o2.index; + return o1.subIndex - o2.subIndex; } }; @@ -56,17 +56,6 @@ public int compare(TermsEnumWithSlice o1, TermsEnumWithSlice o2) { private int numSubs; private BytesRef current; - static class TermsEnumIndex { - public static final TermsEnumIndex[] EMPTY_ARRAY = new TermsEnumIndex[0]; - final int subIndex; - final TermsEnum termsEnum; - - public TermsEnumIndex(TermsEnum termsEnum, int subIndex) { - this.termsEnum = termsEnum; - this.subIndex = subIndex; - } - } - /** Returns how many sub-reader slices contain the current term. @see #getMatchArray */ public int getMatchCount() { return numTop; @@ -114,10 +103,10 @@ public TermsEnum reset(TermsEnumIndex[] termsEnumsIndex) throws IOException { final TermsEnumIndex termsEnumIndex = termsEnumsIndex[i]; assert termsEnumIndex != null; - final BytesRef term = termsEnumIndex.termsEnum.next(); + final BytesRef term = termsEnumIndex.next(); if (term != null) { final TermsEnumWithSlice entry = subs[termsEnumIndex.subIndex]; - entry.reset(termsEnumIndex.termsEnum, term); + entry.reset(termsEnumIndex); queue.add(entry); currentSubs[numSubs++] = entry; } else { @@ -154,7 +143,7 @@ public boolean seekExact(BytesRef term) throws IOException { // Doing so is a waste because this sub will simply // seek to the same spot. 
if (seekOpt) { - final BytesRef curTerm = currentSubs[i].current; + final BytesRef curTerm = currentSubs[i].term(); if (curTerm != null) { final int cmp = term.compareTo(curTerm); if (cmp == 0) { @@ -162,19 +151,19 @@ public boolean seekExact(BytesRef term) throws IOException { } else if (cmp < 0) { status = false; } else { - status = currentSubs[i].terms.seekExact(term); + status = currentSubs[i].seekExact(term); } } else { status = false; } } else { - status = currentSubs[i].terms.seekExact(term); + status = currentSubs[i].seekExact(term); } if (status) { top[numTop++] = currentSubs[i]; - current = currentSubs[i].current = currentSubs[i].terms.term(); - assert term.equals(currentSubs[i].current); + current = currentSubs[i].term(); + assert term.equals(currentSubs[i].term()); } } @@ -206,7 +195,7 @@ public SeekStatus seekCeil(BytesRef term) throws IOException { // Doing so is a waste because this sub will simply // seek to the same spot. if (seekOpt) { - final BytesRef curTerm = currentSubs[i].current; + final BytesRef curTerm = currentSubs[i].term(); if (curTerm != null) { final int cmp = term.compareTo(curTerm); if (cmp == 0) { @@ -214,28 +203,25 @@ public SeekStatus seekCeil(BytesRef term) throws IOException { } else if (cmp < 0) { status = SeekStatus.NOT_FOUND; } else { - status = currentSubs[i].terms.seekCeil(term); + status = currentSubs[i].seekCeil(term); } } else { status = SeekStatus.END; } } else { - status = currentSubs[i].terms.seekCeil(term); + status = currentSubs[i].seekCeil(term); } if (status == SeekStatus.FOUND) { top[numTop++] = currentSubs[i]; - current = currentSubs[i].current = currentSubs[i].terms.term(); + current = currentSubs[i].term(); queue.add(currentSubs[i]); } else { if (status == SeekStatus.NOT_FOUND) { - currentSubs[i].current = currentSubs[i].terms.term(); - assert currentSubs[i].current != null; + assert currentSubs[i].term() != null; queue.add(currentSubs[i]); } else { assert status == SeekStatus.END; - // enum exhausted - 
currentSubs[i].current = null; } } } @@ -269,15 +255,14 @@ private void pullTop() { // top term assert numTop == 0; numTop = queue.fillTop(top); - current = top[0].current; + current = top[0].term(); } private void pushTop() throws IOException { // call next() on each top, and reorder queue for (int i = 0; i < numTop; i++) { TermsEnumWithSlice top = queue.top(); - top.current = top.terms.next(); - if (top.current == null) { + if (top.next() == null) { queue.pop(); } else { queue.updateTop(); @@ -320,7 +305,7 @@ public BytesRef next() throws IOException { public int docFreq() throws IOException { int sum = 0; for (int i = 0; i < numTop; i++) { - sum += top[i].terms.docFreq(); + sum += top[i].termsEnum.docFreq(); } return sum; } @@ -329,7 +314,7 @@ public int docFreq() throws IOException { public long totalTermFreq() throws IOException { long sum = 0; for (int i = 0; i < numTop; i++) { - final long v = top[i].terms.totalTermFreq(); + final long v = top[i].termsEnum.totalTermFreq(); assert v != -1; sum += v; } @@ -359,12 +344,12 @@ public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { final TermsEnumWithSlice entry = top[i]; - assert entry.index < docsEnum.subPostingsEnums.length - : entry.index + " vs " + docsEnum.subPostingsEnums.length + "; " + subs.length; + assert entry.subIndex < docsEnum.subPostingsEnums.length + : entry.subIndex + " vs " + docsEnum.subPostingsEnums.length + "; " + subs.length; final PostingsEnum subPostingsEnum = - entry.terms.postings(docsEnum.subPostingsEnums[entry.index], flags); + entry.termsEnum.postings(docsEnum.subPostingsEnums[entry.subIndex], flags); assert subPostingsEnum != null; - docsEnum.subPostingsEnums[entry.index] = subPostingsEnum; + docsEnum.subPostingsEnums[entry.subIndex] = subPostingsEnum; subDocs[upto].postingsEnum = subPostingsEnum; subDocs[upto].slice = entry.subSlice; upto++; @@ -379,26 +364,18 @@ public ImpactsEnum impacts(int flags) throws IOException { return new 
SlowImpactsEnum(postings(null, flags)); } - static final class TermsEnumWithSlice { + static final class TermsEnumWithSlice extends TermsEnumIndex { private final ReaderSlice subSlice; - TermsEnum terms; - public BytesRef current; - final int index; public TermsEnumWithSlice(int index, ReaderSlice subSlice) { + super(null, index); this.subSlice = subSlice; - this.index = index; assert subSlice.length >= 0 : "length=" + subSlice.length; } - public void reset(TermsEnum terms, BytesRef term) { - this.terms = terms; - current = term; - } - @Override public String toString() { - return subSlice.toString() + ":" + terms; + return subSlice.toString() + ":" + super.toString(); } } @@ -413,7 +390,7 @@ private static final class TermMergeQueue extends PriorityQueue { - public TermsEnumIndex(TermsEnum termsEnum, int subIndex) { - this.termsEnum = termsEnum; - this.subIndex = subIndex; + TermsEnumPriorityQueue(int size) { + super(size); } - public BytesRef next() throws IOException { - currentTerm = termsEnum.next(); - return currentTerm; + @Override + protected boolean lessThan(TermsEnumIndex a, TermsEnumIndex b) { + return a.compareTermTo(b) < 0; } } @@ -227,13 +221,7 @@ public static OrdinalMap build( long[] segmentOrds = new long[subs.length]; // Just merge-sorts by term: - PriorityQueue queue = - new PriorityQueue(subs.length) { - @Override - protected boolean lessThan(TermsEnumIndex a, TermsEnumIndex b) { - return a.currentTerm.compareTo(b.currentTerm) < 0; - } - }; + TermsEnumPriorityQueue queue = new TermsEnumPriorityQueue(subs.length); for (int i = 0; i < subs.length; i++) { TermsEnumIndex sub = new TermsEnumIndex(subs[segmentMap.newToOld(i)], i); @@ -242,19 +230,18 @@ protected boolean lessThan(TermsEnumIndex a, TermsEnumIndex b) { } } - BytesRefBuilder scratch = new BytesRefBuilder(); + TermsEnumIndex.TermState topState = new TermsEnumIndex.TermState(); long globalOrd = 0; while (queue.size() != 0) { TermsEnumIndex top = queue.top(); - 
scratch.copyBytes(top.currentTerm); + topState.copyFrom(top); int firstSegmentIndex = Integer.MAX_VALUE; long globalOrdDelta = Long.MAX_VALUE; // Advance past this term, recording the per-segment ord deltas: while (true) { - top = queue.top(); long segmentOrd = top.termsEnum.ord(); long delta = globalOrd - segmentOrd; int segmentIndex = top.subIndex; @@ -284,10 +271,11 @@ protected boolean lessThan(TermsEnumIndex a, TermsEnumIndex b) { if (queue.size() == 0) { break; } + top = queue.top(); } else { - queue.updateTop(); + top = queue.updateTop(); } - if (queue.top().currentTerm.equals(scratch.get()) == false) { + if (top.termEquals(topState) == false) { break; } } diff --git a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java index 0f579b9d266a..9713923916bb 100644 --- a/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java +++ b/lucene/core/src/java/org/apache/lucene/index/ReadersAndUpdates.java @@ -18,8 +18,6 @@ import java.io.IOException; import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; import java.util.HashMap; import java.util.Iterator; import java.util.List; @@ -555,8 +553,6 @@ public synchronized boolean writeFieldUpdates( FieldInfos fieldInfos = null; boolean any = false; for (List updates : pendingDVUpdates.values()) { - // Sort by increasing delGen: - Collections.sort(updates, Comparator.comparingLong(a -> a.delGen)); for (DocValuesFieldUpdates update : updates) { if (update.delGen <= maxDelGen && update.any()) { any = true; diff --git a/lucene/core/src/java/org/apache/lucene/index/TermsEnumIndex.java b/lucene/core/src/java/org/apache/lucene/index/TermsEnumIndex.java new file mode 100644 index 000000000000..57a5d5ae0cba --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/index/TermsEnumIndex.java @@ -0,0 +1,183 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Objects; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** + * Wrapper around a {@link TermsEnum} and an integer that identifies it. All operations that move + * the current position of the {@link TermsEnum} must be performed via this wrapper class, not + * directly on the wrapped {@link TermsEnum}. + */ +class TermsEnumIndex { + + static final TermsEnumIndex[] EMPTY_ARRAY = new TermsEnumIndex[0]; + + /** + * Copy the first 8 bytes of the given term as a comparable unsigned long. In case the term has + * less than 8 bytes, missing bytes will be replaced with zeroes. Note that two terms that produce + * the same long could still be different due to the fact that missing bytes are replaced with + * zeroes, e.g. {@code [1, 0]} and {@code [1]} get mapped to the same long. 
+ */ + static long prefix8ToComparableUnsignedLong(BytesRef term) { + // Use Big Endian so that longs are comparable + if (term.length >= Long.BYTES) { + return (long) BitUtil.VH_BE_LONG.get(term.bytes, term.offset); + } else { + long l; + int o; + if (Integer.BYTES <= term.length) { + l = (int) BitUtil.VH_BE_INT.get(term.bytes, term.offset); + o = Integer.BYTES; + } else { + l = 0; + o = 0; + } + if (o + Short.BYTES <= term.length) { + l = + (l << Short.SIZE) + | Short.toUnsignedLong( + (short) BitUtil.VH_BE_SHORT.get(term.bytes, term.offset + o)); + o += Short.BYTES; + } + if (o < term.length) { + l = (l << Byte.SIZE) | Byte.toUnsignedLong(term.bytes[term.offset + o]); + } + l <<= (Long.BYTES - term.length) << 3; + return l; + } + } + + final int subIndex; + TermsEnum termsEnum; + private BytesRef currentTerm; + private long currentTermPrefix8; + + TermsEnumIndex(TermsEnum termsEnum, int subIndex) { + this.termsEnum = termsEnum; + this.subIndex = subIndex; + } + + BytesRef term() { + return currentTerm; + } + + private void setTerm(BytesRef term) { + currentTerm = term; + if (currentTerm == null) { + currentTermPrefix8 = 0; + } else { + currentTermPrefix8 = prefix8ToComparableUnsignedLong(currentTerm); + } + } + + BytesRef next() throws IOException { + BytesRef term = termsEnum.next(); + setTerm(term); + return term; + } + + SeekStatus seekCeil(BytesRef term) throws IOException { + SeekStatus status = termsEnum.seekCeil(term); + if (status == SeekStatus.END) { + setTerm(null); + } else { + setTerm(termsEnum.term()); + } + return status; + } + + boolean seekExact(BytesRef term) throws IOException { + boolean found = termsEnum.seekExact(term); + if (found) { + setTerm(termsEnum.term()); + } else { + setTerm(null); + } + return found; + } + + void seekExact(long ord) throws IOException { + termsEnum.seekExact(ord); + setTerm(termsEnum.term()); + } + + void reset(TermsEnumIndex tei) throws IOException { + termsEnum = tei.termsEnum; + currentTerm = tei.currentTerm; + 
currentTermPrefix8 = tei.currentTermPrefix8; + } + + int compareTermTo(TermsEnumIndex that) { + if (currentTermPrefix8 != that.currentTermPrefix8) { + int cmp = Long.compareUnsigned(currentTermPrefix8, that.currentTermPrefix8); + assert Integer.signum(cmp) + == Integer.signum( + Arrays.compareUnsigned( + currentTerm.bytes, + currentTerm.offset, + currentTerm.offset + currentTerm.length, + that.currentTerm.bytes, + that.currentTerm.offset, + that.currentTerm.offset + that.currentTerm.length)); + return cmp; + } + + return Arrays.compareUnsigned( + currentTerm.bytes, + currentTerm.offset, + currentTerm.offset + currentTerm.length, + that.currentTerm.bytes, + that.currentTerm.offset, + that.currentTerm.offset + that.currentTerm.length); + } + + @Override + public String toString() { + return Objects.toString(termsEnum); + } + + /** Wrapper around a term that allows for quick equals comparisons. */ + static class TermState { + private final BytesRefBuilder term = new BytesRefBuilder(); + private long termPrefix8; + + void copyFrom(TermsEnumIndex tei) { + term.copyBytes(tei.term()); + termPrefix8 = tei.currentTermPrefix8; + } + } + + boolean termEquals(TermState that) { + if (currentTermPrefix8 != that.termPrefix8) { + return false; + } + return Arrays.equals( + currentTerm.bytes, + currentTerm.offset, + currentTerm.offset + currentTerm.length, + that.term.bytes(), + 0, + that.term.length()); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java index ed4066a94ac4..3d565b650a9b 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java @@ -21,8 +21,6 @@ import java.lang.StackWalker.StackFrame; import java.lang.invoke.MethodHandles; import java.lang.invoke.MethodType; -import 
java.security.AccessController; -import java.security.PrivilegedAction; import java.util.Locale; import java.util.Objects; import java.util.Optional; @@ -31,7 +29,7 @@ import java.util.function.Predicate; import java.util.logging.Logger; import java.util.stream.Stream; -import org.apache.lucene.util.SuppressForbidden; +import org.apache.lucene.util.Constants; import org.apache.lucene.util.VectorUtil; /** @@ -129,7 +127,7 @@ static VectorizationProvider lookup(boolean testMode) { "Vector bitsize and/or integer vectors enforcement; using default vectorization provider outside of testMode"); return new DefaultVectorizationProvider(); } - if (isClientVM()) { + if (Constants.IS_CLIENT_VM) { LOG.warning("C2 compiler is disabled; Java vector incubator API can't be enabled"); return new DefaultVectorizationProvider(); } @@ -188,23 +186,6 @@ private static boolean isAffectedByJDK8301190() { && !Objects.equals("I", "i".toUpperCase(Locale.getDefault())); } - @SuppressWarnings("removal") - @SuppressForbidden(reason = "security manager") - private static boolean isClientVM() { - try { - final PrivilegedAction action = - () -> System.getProperty("java.vm.info", "").contains("emulated-client"); - return AccessController.doPrivileged(action); - } catch ( - @SuppressWarnings("unused") - SecurityException e) { - LOG.warning( - "SecurityManager denies permission to 'java.vm.info' system property, so state of C2 compiler can't be detected. 
" - + "In case of performance issues allow access to this property."); - return false; - } - } - // add all possible callers here as FQCN: private static final Set VALID_CALLERS = Set.of("org.apache.lucene.util.VectorUtil"); diff --git a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java index 548bbb401b20..616b8cf7a7bc 100644 --- a/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java +++ b/lucene/core/src/java/org/apache/lucene/search/comparators/TermOrdValComparator.java @@ -475,7 +475,7 @@ private static class PostingsEnumAndOrd { private class CompetitiveIterator extends DocIdSetIterator { - private static final int MAX_TERMS = 128; + private static final int MAX_TERMS = 1024; private final LeafReaderContext context; private final int maxDoc; diff --git a/lucene/core/src/java/org/apache/lucene/util/Constants.java b/lucene/core/src/java/org/apache/lucene/util/Constants.java index 090472e6736e..3ef12986bb2b 100644 --- a/lucene/core/src/java/org/apache/lucene/util/Constants.java +++ b/lucene/core/src/java/org/apache/lucene/util/Constants.java @@ -16,18 +16,25 @@ */ package org.apache.lucene.util; +import java.security.AccessController; +import java.security.PrivilegedAction; +import java.util.Objects; +import java.util.logging.Logger; + /** Some useful constants. */ public final class Constants { private Constants() {} // can't construct + private static final String UNKNOWN = "Unknown"; + /** JVM vendor info. */ - public static final String JVM_VENDOR = System.getProperty("java.vm.vendor"); + public static final String JVM_VENDOR = getSysProp("java.vm.vendor", UNKNOWN); /** JVM vendor name. */ - public static final String JVM_NAME = System.getProperty("java.vm.name"); + public static final String JVM_NAME = getSysProp("java.vm.name", UNKNOWN); /** The value of System.getProperty("os.name"). 
* */ - public static final String OS_NAME = System.getProperty("os.name"); + public static final String OS_NAME = getSysProp("os.name", UNKNOWN); /** True iff running on Linux. */ public static final boolean LINUX = OS_NAME.startsWith("Linux"); @@ -45,36 +52,67 @@ private Constants() {} // can't construct public static final boolean FREE_BSD = OS_NAME.startsWith("FreeBSD"); /** The value of System.getProperty("os.arch"). */ - public static final String OS_ARCH = System.getProperty("os.arch"); + public static final String OS_ARCH = getSysProp("os.arch", UNKNOWN); /** The value of System.getProperty("os.version"). */ - public static final String OS_VERSION = System.getProperty("os.version"); + public static final String OS_VERSION = getSysProp("os.version", UNKNOWN); /** The value of System.getProperty("java.vendor"). */ - public static final String JAVA_VENDOR = System.getProperty("java.vendor"); + public static final String JAVA_VENDOR = getSysProp("java.vendor", UNKNOWN); + + /** True iff the Java runtime is a client runtime and C2 compiler is not enabled */ + public static final boolean IS_CLIENT_VM = + getSysProp("java.vm.info", "").contains("emulated-client"); /** True iff running on a 64bit JVM */ - public static final boolean JRE_IS_64BIT; + public static final boolean JRE_IS_64BIT = is64Bit(); + + /** true iff we know fast FMA is supported, to deliver less error */ + public static final boolean HAS_FAST_FMA = + (IS_CLIENT_VM == false) + && Objects.equals(OS_ARCH, "amd64") + && HotspotVMOptions.get("UseFMA").map(Boolean::valueOf).orElse(false); - static { - boolean is64Bit = false; - String datamodel = null; + private static boolean is64Bit() { + final String datamodel = getSysProp("sun.arch.data.model"); + if (datamodel != null) { + return datamodel.contains("64"); + } else { + return (OS_ARCH != null && OS_ARCH.contains("64")); + } + } + + private static String getSysProp(String property) { try { - datamodel = System.getProperty("sun.arch.data.model"); - if 
(datamodel != null) { - is64Bit = datamodel.contains("64"); - } + return doPrivileged(() -> System.getProperty(property)); } catch ( @SuppressWarnings("unused") - SecurityException ex) { + SecurityException se) { + logSecurityWarning(property); + return null; } - if (datamodel == null) { - if (OS_ARCH != null && OS_ARCH.contains("64")) { - is64Bit = true; - } else { - is64Bit = false; - } + } + + private static String getSysProp(String property, String def) { + try { + return doPrivileged(() -> System.getProperty(property, def)); + } catch ( + @SuppressWarnings("unused") + SecurityException se) { + logSecurityWarning(property); + return def; } - JRE_IS_64BIT = is64Bit; + } + + private static void logSecurityWarning(String property) { + var log = Logger.getLogger(Constants.class.getName()); + log.warning("SecurityManager prevented access to system property: " + property); + } + + // Extracted to a method to be able to apply the SuppressForbidden annotation + @SuppressWarnings("removal") + @SuppressForbidden(reason = "security manager") + private static T doPrivileged(PrivilegedAction action) { + return AccessController.doPrivileged(action); } } diff --git a/lucene/core/src/java/org/apache/lucene/util/HotspotVMOptions.java b/lucene/core/src/java/org/apache/lucene/util/HotspotVMOptions.java new file mode 100644 index 000000000000..70f963e1b378 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/util/HotspotVMOptions.java @@ -0,0 +1,90 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.util; + +import java.lang.reflect.Method; +import java.util.Objects; +import java.util.Optional; +import java.util.function.Function; +import java.util.logging.Logger; + +/** Accessor to get Hotspot VM Options (if available). */ +final class HotspotVMOptions { + private HotspotVMOptions() {} // can't construct + + /** True if the Java VM is based on Hotspot and has the Hotspot MX bean readable by Lucene */ + public static final boolean IS_HOTSPOT; + + /** + * Returns an optional with the value of a Hotspot VM option. If the VM option does not exist or + * is not readable, returns an empty optional. 
+ */ + public static Optional get(String name) { + return ACCESSOR.apply(Objects.requireNonNull(name, "name")); + } + + private static final String MANAGEMENT_FACTORY_CLASS = "java.lang.management.ManagementFactory"; + private static final String HOTSPOT_BEAN_CLASS = "com.sun.management.HotSpotDiagnosticMXBean"; + private static final Function> ACCESSOR; + + static { + boolean isHotspot = false; + Function> accessor = name -> Optional.empty(); + try { + final Class beanClazz = Class.forName(HOTSPOT_BEAN_CLASS); + // we use reflection for this, because the management factory is not part + // of java.base module: + final Object hotSpotBean = + Class.forName(MANAGEMENT_FACTORY_CLASS) + .getMethod("getPlatformMXBean", Class.class) + .invoke(null, beanClazz); + if (hotSpotBean != null) { + final Method getVMOptionMethod = beanClazz.getMethod("getVMOption", String.class); + final Method getValueMethod = getVMOptionMethod.getReturnType().getMethod("getValue"); + isHotspot = true; + accessor = + name -> { + try { + final Object vmOption = getVMOptionMethod.invoke(hotSpotBean, name); + return Optional.of(getValueMethod.invoke(vmOption).toString()); + } catch (@SuppressWarnings("unused") + ReflectiveOperationException + | RuntimeException e) { + return Optional.empty(); + } + }; + } + } catch (@SuppressWarnings("unused") ReflectiveOperationException | RuntimeException e) { + isHotspot = false; + final Logger log = Logger.getLogger(HotspotVMOptions.class.getName()); + final Module module = HotspotVMOptions.class.getModule(); + final ModuleLayer layer = module.getLayer(); + // classpath / unnamed module has no layer, so we need to check: + if (layer != null + && layer.findModule("jdk.management").map(module::canRead).orElse(false) == false) { + log.warning( + "Lucene cannot access JVM internals to optimize algorithms or calculate object sizes, unless the 'jdk.management' Java module " + + "is readable [please add 'jdk.management' to modular application either by command line 
or its module descriptor]."); + } else { + log.warning( + "Lucene cannot optimize algorithms or calculate object sizes for JVMs that are not based on Hotspot or a compatible implementation."); + } + } + IS_HOTSPOT = isHotspot; + ACCESSOR = accessor; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java b/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java index 1d363170a940..7e0bdfdee845 100644 --- a/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java +++ b/lucene/core/src/java/org/apache/lucene/util/RamUsageEstimator.java @@ -18,7 +18,6 @@ import java.lang.reflect.Array; import java.lang.reflect.Field; -import java.lang.reflect.Method; import java.lang.reflect.Modifier; import java.security.AccessControlException; import java.security.AccessController; @@ -30,7 +29,6 @@ import java.util.IdentityHashMap; import java.util.Locale; import java.util.Map; -import java.util.logging.Logger; import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.Query; @@ -112,64 +110,16 @@ private RamUsageEstimator() {} /** For testing only */ static final boolean JVM_IS_HOTSPOT_64BIT; - static final String MANAGEMENT_FACTORY_CLASS = "java.lang.management.ManagementFactory"; - static final String HOTSPOT_BEAN_CLASS = "com.sun.management.HotSpotDiagnosticMXBean"; - /** Initialize constants and try to collect information about the JVM internals. 
*/ static { - if (Constants.JRE_IS_64BIT) { + if (Constants.JRE_IS_64BIT && HotspotVMOptions.IS_HOTSPOT) { // Try to get compressed oops and object alignment (the default seems to be 8 on Hotspot); // (this only works on 64 bit, on 32 bits the alignment and reference size is fixed): - boolean compressedOops = false; - int objectAlignment = 8; - boolean isHotspot = false; - try { - final Class beanClazz = Class.forName(HOTSPOT_BEAN_CLASS); - // we use reflection for this, because the management factory is not part - // of Java 8's compact profile: - final Object hotSpotBean = - Class.forName(MANAGEMENT_FACTORY_CLASS) - .getMethod("getPlatformMXBean", Class.class) - .invoke(null, beanClazz); - if (hotSpotBean != null) { - isHotspot = true; - final Method getVMOptionMethod = beanClazz.getMethod("getVMOption", String.class); - try { - final Object vmOption = getVMOptionMethod.invoke(hotSpotBean, "UseCompressedOops"); - compressedOops = - Boolean.parseBoolean( - vmOption.getClass().getMethod("getValue").invoke(vmOption).toString()); - } catch (@SuppressWarnings("unused") ReflectiveOperationException | RuntimeException e) { - isHotspot = false; - } - try { - final Object vmOption = getVMOptionMethod.invoke(hotSpotBean, "ObjectAlignmentInBytes"); - objectAlignment = - Integer.parseInt( - vmOption.getClass().getMethod("getValue").invoke(vmOption).toString()); - } catch (@SuppressWarnings("unused") ReflectiveOperationException | RuntimeException e) { - isHotspot = false; - } - } - } catch (@SuppressWarnings("unused") ReflectiveOperationException | RuntimeException e) { - isHotspot = false; - final Logger log = Logger.getLogger(RamUsageEstimator.class.getName()); - final Module module = RamUsageEstimator.class.getModule(); - final ModuleLayer layer = module.getLayer(); - // classpath / unnamed module has no layer, so we need to check: - if (layer != null - && layer.findModule("jdk.management").map(module::canRead).orElse(false) == false) { - log.warning( - "Lucene cannot 
correctly calculate object sizes on 64bit JVMs, unless the 'jdk.management' Java module " - + "is readable [please add 'jdk.management' to modular application either by command line or its module descriptor]"); - } else { - log.warning( - "Lucene cannot correctly calculate object sizes on 64bit JVMs that are not based on Hotspot or a compatible implementation."); - } - } - JVM_IS_HOTSPOT_64BIT = isHotspot; - COMPRESSED_REFS_ENABLED = compressedOops; - NUM_BYTES_OBJECT_ALIGNMENT = objectAlignment; + JVM_IS_HOTSPOT_64BIT = true; + COMPRESSED_REFS_ENABLED = + HotspotVMOptions.get("UseCompressedOops").map(Boolean::valueOf).orElse(false); + NUM_BYTES_OBJECT_ALIGNMENT = + HotspotVMOptions.get("ObjectAlignmentInBytes").map(Integer::valueOf).orElse(8); // reference size is 4, if we have compressed oops: NUM_BYTES_OBJECT_REF = COMPRESSED_REFS_ENABLED ? 4 : 8; // "best guess" based on reference size: diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java index ed1688efd301..0d17a6fcab47 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/RegExp.java @@ -1128,6 +1128,10 @@ else if (match('{')) { if (start != pos) m = Integer.parseInt(originalString.substring(start, pos)); } else m = n; if (!match('}')) throw new IllegalArgumentException("expected '}' at position " + pos); + if (m != -1 && n > m) { + throw new IllegalArgumentException( + "invalid repetition range(out of order): " + n + ".." 
+ m); + } if (m == -1) e = makeRepeat(flags, e, n); else e = makeRepeat(flags, e, n, m); } diff --git a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java index f17c220f83d2..3af624100708 100644 --- a/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java +++ b/lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java @@ -270,10 +270,6 @@ public float getDirectAddressingMaxOversizingFactor() { return directAddressingMaxOversizingFactor; } - public long getTermCount() { - return frontier[0].inputCount; - } - public long getNodeCount() { // 1+ in order to count the -1 implicit final node return 1 + nodeCount; @@ -749,7 +745,6 @@ public void add(IntsRef input, T output) throws IOException { // format cannot represent the empty input since // 'finalness' is stored on the incoming arc, not on // the node - frontier[0].inputCount++; frontier[0].isFinal = true; fst.setEmptyOutput(output); return; @@ -760,9 +755,6 @@ public void add(IntsRef input, T output) throws IOException { int pos2 = input.offset; final int pos1Stop = Math.min(lastInput.length(), input.length); while (true) { - frontier[pos1].inputCount++; - // System.out.println(" incr " + pos1 + " ct=" + frontier[pos1].inputCount + " n=" + - // frontier[pos1]); if (pos1 >= pos1Stop || lastInput.intAt(pos1) != input.ints[pos2]) { break; } @@ -786,7 +778,6 @@ public void add(IntsRef input, T output) throws IOException { // init tail states for current input for (int idx = prefixLenPlus1; idx <= input.length; idx++) { frontier[idx - 1].addArc(input.ints[input.offset + idx - 1], frontier[idx]); - frontier[idx].inputCount++; } final UnCompiledNode lastNode = frontier[input.length]; @@ -835,8 +826,6 @@ public void add(IntsRef input, T output) throws IOException { // save last input lastInput.copyInts(input); - - // System.out.println(" count[0]=" + frontier[0].inputCount); } private boolean validOutput(T output) { @@ -906,10 
+895,6 @@ static final class UnCompiledNode implements Node { T output; boolean isFinal; - // TODO: remove this tracking? we used to use it for confusingly pruning NodeHash, but - // we switched to LRU by RAM usage instead: - long inputCount; - /** This node's depth, starting from the automaton root. */ final int depth; @@ -935,7 +920,6 @@ void clear() { numArcs = 0; isFinal = false; output = owner.NO_OUTPUT; - inputCount = 0; // We don't clear the depth here because it never changes // for nodes on the frontier (even when reused). diff --git a/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java b/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java index 5b382c4c7c25..d4e8a50ef8f4 100644 --- a/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java +++ b/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorUtilSupport.java @@ -77,41 +77,9 @@ final class PanamaVectorUtilSupport implements VectorUtilSupport { VectorizationProvider.TESTS_FORCE_INTEGER_VECTORS || (isAMD64withoutAVX2 == false); } - private static final String MANAGEMENT_FACTORY_CLASS = "java.lang.management.ManagementFactory"; - private static final String HOTSPOT_BEAN_CLASS = "com.sun.management.HotSpotDiagnosticMXBean"; - - // best effort to see if FMA is fast (this is architecture-independent option) - private static boolean hasFastFMA() { - // on ARM cpus, FMA works fine but is a slight slowdown: don't use it. 
- if (Constants.OS_ARCH.equals("amd64") == false) { - return false; - } - try { - final Class beanClazz = Class.forName(HOTSPOT_BEAN_CLASS); - // we use reflection for this, because the management factory is not part - // of Java 8's compact profile: - final Object hotSpotBean = - Class.forName(MANAGEMENT_FACTORY_CLASS) - .getMethod("getPlatformMXBean", Class.class) - .invoke(null, beanClazz); - if (hotSpotBean != null) { - final var getVMOptionMethod = beanClazz.getMethod("getVMOption", String.class); - final Object vmOption = getVMOptionMethod.invoke(hotSpotBean, "UseFMA"); - return Boolean.parseBoolean( - vmOption.getClass().getMethod("getValue").invoke(vmOption).toString()); - } - return false; - } catch (@SuppressWarnings("unused") ReflectiveOperationException | RuntimeException e) { - return false; - } - } - - // true if we know FMA is supported, to deliver less error - static final boolean HAS_FAST_FMA = hasFastFMA(); - // the way FMA should work! if available use it, otherwise fall back to mul/add private static FloatVector fma(FloatVector a, FloatVector b, FloatVector c) { - if (HAS_FAST_FMA) { + if (Constants.HAS_FAST_FMA) { return a.fma(b, c); } else { return a.mul(b).add(c); diff --git a/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java b/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java index fc303a687a07..ffd18df1a270 100644 --- a/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java +++ b/lucene/core/src/java20/org/apache/lucene/internal/vectorization/PanamaVectorizationProvider.java @@ -21,6 +21,7 @@ import java.util.Locale; import java.util.logging.Logger; import jdk.incubator.vector.FloatVector; +import org.apache.lucene.util.Constants; import org.apache.lucene.util.SuppressForbidden; /** A vectorization provider that leverages the Panama Vector API. 
*/ @@ -62,7 +63,7 @@ private static T doPrivileged(PrivilegedAction action) { Locale.ENGLISH, "Java vector incubator API enabled; uses preferredBitSize=%d%s%s", PanamaVectorUtilSupport.VECTOR_BITSIZE, - PanamaVectorUtilSupport.HAS_FAST_FMA ? "; FMA enabled" : "", + Constants.HAS_FAST_FMA ? "; FMA enabled" : "", PanamaVectorUtilSupport.HAS_FAST_INTEGER_VECTORS ? "" : "; floating-point vectors only")); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java b/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java index f2b56868e6c7..ac2ff786dd96 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestDeletionPolicy.java @@ -459,7 +459,8 @@ public void testOpenPriorSnapshot() throws IOException { dir, newIndexWriterConfig(new MockAnalyzer(random())) .setIndexDeletionPolicy(policy) - .setIndexCommit(lastCommit)); + .setIndexCommit(lastCommit) + .setMergePolicy(newLogMergePolicy(10))); assertEquals(10, writer.getDocStats().numDocs); // Should undo our rollback: @@ -476,12 +477,13 @@ public void testOpenPriorSnapshot() throws IOException { dir, newIndexWriterConfig(new MockAnalyzer(random())) .setIndexDeletionPolicy(policy) - .setIndexCommit(lastCommit)); + .setIndexCommit(lastCommit) + .setMergePolicy(newLogMergePolicy(10))); assertEquals(10, writer.getDocStats().numDocs); // Commits the rollback: writer.close(); - // Now 8 because we made another commit + // Now 7 because we made another commit assertEquals(7, DirectoryReader.listCommits(dir).size()); r = DirectoryReader.open(dir); @@ -507,7 +509,10 @@ public void testOpenPriorSnapshot() throws IOException { // but this time keeping only the last commit: writer = new IndexWriter( - dir, newIndexWriterConfig(new MockAnalyzer(random())).setIndexCommit(lastCommit)); + dir, + newIndexWriterConfig(new MockAnalyzer(random())) + .setIndexCommit(lastCommit) + .setMergePolicy(newLogMergePolicy(10))); 
assertEquals(10, writer.getDocStats().numDocs); // Reader still sees fully merged index, because writer diff --git a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java index 1990ce93deb3..bd71fedb05ee 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestIndexWriter.java @@ -2395,11 +2395,12 @@ public void testHasUncommittedChanges() throws IOException { writer.addDocument(doc); assertTrue(writer.hasUncommittedChanges()); - // Must commit, waitForMerges, commit again, to be - // certain that hasUncommittedChanges returns false: - writer.commit(); - writer.waitForMerges(); - writer.commit(); + // Must commit and wait for merges as long as the commit triggers merges to be certain that + // hasUncommittedChanges returns false + do { + writer.waitForMerges(); + writer.commit(); + } while (writer.hasPendingMerges()); assertFalse(writer.hasUncommittedChanges()); writer.deleteDocuments(new Term("id", "xyz")); assertTrue(writer.hasUncommittedChanges()); diff --git a/lucene/core/src/test/org/apache/lucene/index/TestTermsEnumIndex.java b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnumIndex.java new file mode 100644 index 000000000000..ac964052718f --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/index/TestTermsEnumIndex.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.index; + +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; + +public class TestTermsEnumIndex extends LuceneTestCase { + + public void testPrefix8ToComparableUnsignedLong() { + byte[] b = new byte[] {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + assertEquals(0L, TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 1, 0))); + assertEquals(4L << 56, TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 1))); + assertEquals( + (4L << 56) | (5L << 48), + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 2))); + assertEquals( + (4L << 56) | (5L << 48) | (6L << 40), + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 3))); + assertEquals( + (4L << 56) | (5L << 48) | (6L << 40) | (7L << 32), + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 4))); + assertEquals( + (4L << 56) | (5L << 48) | (6L << 40) | (7L << 32) | (8L << 24), + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 5))); + assertEquals( + (4L << 56) | (5L << 48) | (6L << 40) | (7L << 32) | (8L << 24) | (9L << 16), + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 6))); + assertEquals( + (4L << 56) | (5L << 48) | (6L << 40) | (7L << 32) | (8L << 24) | (9L << 16) | (10L << 8), + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 7))); + assertEquals( + (4L << 56) + | (5L << 48) + | (6L << 40) + | (7L << 32) + | (8L << 24) + | (9L << 16) + | (10L << 8) + | 11L, + 
TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 8))); + assertEquals( + (4L << 56) + | (5L << 48) + | (6L << 40) + | (7L << 32) + | (8L << 24) + | (9L << 16) + | (10L << 8) + | 11L, + TermsEnumIndex.prefix8ToComparableUnsignedLong(new BytesRef(b, 3, 9))); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java index c960e7363047..8f6f765f2936 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java @@ -86,6 +86,17 @@ public void testLegalBackslashChars() { } } + public void testParseIllegalRepeatExp() { + // out of order + IllegalArgumentException expected = + expectThrows( + IllegalArgumentException.class, + () -> { + new RegExp("a{99,11}"); + }); + assertTrue(expected.getMessage().contains("out of order")); + } + static String randomDocValue(int minLength) { String charPalette = "AAAaaaBbbCccc123456 \t"; StringBuilder sb = new StringBuilder(); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java index 54653b9d2560..eac2fae1ef48 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java @@ -56,7 +56,7 @@ public void test() throws Exception { for (int iter = 0; iter < 1; iter++) { // Build FST w/ NoOutputs and stop when nodeCount > 2.2B { - System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS"); + System.out.println("\nTEST: ~2.2B nodes; output=NO_OUTPUTS"); Outputs outputs = NoOutputs.getSingleton(); Object NO_OUTPUT = outputs.getNoOutput(); final FSTCompiler fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs); diff --git a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java 
b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java index 927fe058ef05..f6dd84efd0e6 100644 --- a/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ b/lucene/core/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -568,7 +568,6 @@ public void run(int limit, boolean verify) throws IOException { System.out.println( ((tMid - tStart) / (double) TimeUnit.SECONDS.toNanos(1)) + " sec to add all terms"); - assert fstCompiler.getTermCount() == ord; FST fst = fstCompiler.compile(); long tEnd = System.nanoTime(); System.out.println( diff --git a/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/util/URLLabel.java b/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/util/URLLabel.java index e4f759b30959..61de291d866e 100644 --- a/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/util/URLLabel.java +++ b/lucene/luke/src/java/org/apache/lucene/luke/app/desktop/util/URLLabel.java @@ -38,8 +38,8 @@ public URLLabel(String text) { super(text); try { - this.link = (URI.create(text)).toURL(); - } catch (MalformedURLException e) { + this.link = (new URI(text)).toURL(); + } catch (URISyntaxException | MalformedURLException e) { throw new LukeException(e.getMessage(), e); } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java index 50f87ad0a194..5b114ff9a497 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/LuceneTestCase.java @@ -476,7 +476,12 @@ public abstract class LuceneTestCase extends Assert { * of iterations to scale your tests (for nightly builds). */ public static final int RANDOM_MULTIPLIER = - systemPropertyAsInt("tests.multiplier", TEST_NIGHTLY ? 
2 : 1); + systemPropertyAsInt("tests.multiplier", defaultRandomMultiplier()); + + /** Compute the default value of the random multiplier (based on {@link #TEST_NIGHTLY}). */ + static int defaultRandomMultiplier() { + return TEST_NIGHTLY ? 2 : 1; + } /** Leave temporary files on disk, even on successful runs. */ public static final boolean LEAVE_TEMPORARY; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/RunListenerPrintReproduceInfo.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/RunListenerPrintReproduceInfo.java index 9fa50a204173..753f27a15c85 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/RunListenerPrintReproduceInfo.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/RunListenerPrintReproduceInfo.java @@ -189,7 +189,8 @@ private void reportAdditionalFailureInfo(final String testName) { addVmOpt(b, "tests.seed", RandomizedContext.current().getRunnerSeedAsString()); // Test groups and multipliers. - if (RANDOM_MULTIPLIER > 1) addVmOpt(b, "tests.multiplier", RANDOM_MULTIPLIER); + if (RANDOM_MULTIPLIER != LuceneTestCase.defaultRandomMultiplier()) + addVmOpt(b, "tests.multiplier", RANDOM_MULTIPLIER); if (TEST_NIGHTLY) addVmOpt(b, SYSPROP_NIGHTLY, TEST_NIGHTLY); if (TEST_WEEKLY) addVmOpt(b, SYSPROP_WEEKLY, TEST_WEEKLY); if (TEST_MONSTER) addVmOpt(b, SYSPROP_MONSTER, TEST_MONSTER);