From e784083a8091b539b8dba15ae931525c19a1e2fc Mon Sep 17 00:00:00 2001 From: Armin Braun Date: Tue, 12 Dec 2023 10:15:10 +0100 Subject: [PATCH] Save 2M of heap for large static constant map in CategorizationPartOfSpeechDictionary (#103302) For one, this cleans up the lazy-loading implementation to use a standard holder approach. More importantly, this moves the implementation to use a final immutable map instead of a HashMap which saves about 2M in heap (reduction from ~7M to ~5M) for this map. --- .../CategorizationPartOfSpeechDictionary.java | 92 +++++++++---------- 1 file changed, 44 insertions(+), 48 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/CategorizationPartOfSpeechDictionary.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/CategorizationPartOfSpeechDictionary.java index 09a6846ead344..243286115eb8a 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/CategorizationPartOfSpeechDictionary.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/aggs/categorization/CategorizationPartOfSpeechDictionary.java @@ -8,7 +8,6 @@ package org.elasticsearch.xpack.ml.aggs.categorization; import java.io.BufferedReader; -import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.nio.charset.StandardCharsets; @@ -73,49 +72,56 @@ static PartOfSpeech fromCode(char partOfSpeechCode) { } } - /** - * Lazy loaded singleton instance to avoid loading the dictionary repeatedly. - */ - private static CategorizationPartOfSpeechDictionary instance; - private static final Object INIT_LOCK = new Object(); + private static final class Holder { + /** + * Lazy loaded singleton instance to avoid loading the dictionary repeatedly. + */ + private static final CategorizationPartOfSpeechDictionary instance = new CategorizationPartOfSpeechDictionary(); + } /** * Keys are lower case. */ - private final Map partOfSpeechDictionary = new HashMap<>(); - private final int maxDictionaryWordLength; + private final Map partOfSpeechDictionary; - CategorizationPartOfSpeechDictionary(InputStream is) throws IOException { + private final int maxDictionaryWordLength; - int maxLength = 0; - BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); - String line; - while ((line = reader.readLine()) != null) { - line = line.trim(); - if (line.isEmpty()) { - continue; - } - String[] split = line.split(PART_OF_SPEECH_SEPARATOR); - if (split.length != 2) { - throw new IllegalArgumentException( - "Unexpected format in line [" + line + "]: expected one [" + PART_OF_SPEECH_SEPARATOR + "] separator" - ); - } - if (split[0].isEmpty()) { - throw new IllegalArgumentException( - "Unexpected format in line [" + line + "]: nothing preceding [" + PART_OF_SPEECH_SEPARATOR + "] separator" - ); - } - if (split[1].isEmpty()) { - throw new IllegalArgumentException( - "Unexpected format in line [" + line + "]: nothing following [" + PART_OF_SPEECH_SEPARATOR + "] separator" - ); + CategorizationPartOfSpeechDictionary() { + try (InputStream is = CategorizationPartOfSpeechDictionary.class.getResourceAsStream(DICTIONARY_FILE_PATH)) { + int maxLength = 0; + BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8)); + String line; + final Map partOfSpeechMap = new HashMap<>(); + while ((line = reader.readLine()) != null) { + line = line.trim(); + if (line.isEmpty()) { + continue; + } + String[] split = line.split(PART_OF_SPEECH_SEPARATOR); + if (split.length != 2) { + throw new IllegalArgumentException( + "Unexpected format in line [" + line + "]: expected one [" + PART_OF_SPEECH_SEPARATOR + "] separator" + ); + } + if (split[0].isEmpty()) { + throw new IllegalArgumentException( + "Unexpected format in line [" + line + "]: nothing preceding [" + PART_OF_SPEECH_SEPARATOR + "] separator" + ); + } + if (split[1].isEmpty()) { + throw new IllegalArgumentException( + "Unexpected format in line [" + line + "]: nothing following [" + PART_OF_SPEECH_SEPARATOR + "] separator" + ); + } + String lowerCaseWord = split[0].toLowerCase(Locale.ROOT); + partOfSpeechMap.put(lowerCaseWord, PartOfSpeech.fromCode(split[1].charAt(0))); + maxLength = Math.max(maxLength, lowerCaseWord.length()); } - String lowerCaseWord = split[0].toLowerCase(Locale.ROOT); - partOfSpeechDictionary.put(lowerCaseWord, PartOfSpeech.fromCode(split[1].charAt(0))); - maxLength = Math.max(maxLength, lowerCaseWord.length()); + partOfSpeechDictionary = Map.copyOf(partOfSpeechMap); + maxDictionaryWordLength = maxLength; + } catch (Exception e) { + throw new AssertionError(e); } - maxDictionaryWordLength = maxLength; } // TODO: now we have this in Java, perform this operation in Java for anomaly detection categorization instead of in C++. @@ -142,17 +148,7 @@ public boolean isInDictionary(String word) { return getPartOfSpeech(word) != PartOfSpeech.NOT_IN_DICTIONARY; } - public static CategorizationPartOfSpeechDictionary getInstance() throws IOException { - if (instance != null) { - return instance; - } - synchronized (INIT_LOCK) { - if (instance == null) { - try (InputStream is = CategorizationPartOfSpeechDictionary.class.getResourceAsStream(DICTIONARY_FILE_PATH)) { - instance = new CategorizationPartOfSpeechDictionary(is); - } - } - return instance; - } + public static CategorizationPartOfSpeechDictionary getInstance() { + return Holder.instance; } }