Skip to content

Commit

Permalink
Save 2M of heap for large static constant map in CategorizationPartOf…
Browse files Browse the repository at this point in the history
…SpeechDictionary (elastic#103302)

For one, this cleans up the lazy-loading implementation to use a standard holder approach.
More importantly, this moves the implementation to use a final immutable map instead of a HashMap
which saves about 2 MB of heap (a reduction from roughly 7 MB to 5 MB) for this map.
  • Loading branch information
original-brownbear authored Dec 12, 2023
1 parent 7d2fb63 commit e784083
Showing 1 changed file with 44 additions and 48 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
package org.elasticsearch.xpack.ml.aggs.categorization;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
Expand Down Expand Up @@ -73,49 +72,56 @@ static PartOfSpeech fromCode(char partOfSpeechCode) {
}
}

/**
* Lazy loaded singleton instance to avoid loading the dictionary repeatedly.
*/
private static CategorizationPartOfSpeechDictionary instance;
private static final Object INIT_LOCK = new Object();
private static final class Holder {
/**
* Lazy loaded singleton instance to avoid loading the dictionary repeatedly.
*/
private static final CategorizationPartOfSpeechDictionary instance = new CategorizationPartOfSpeechDictionary();
}

/**
 * Maps each dictionary word to its part of speech. Keys are lower case.
 * Built once in the constructor and frozen via {@link Map#copyOf}, which yields a
 * compact immutable representation (the commit notes ~2 MB of heap saved versus
 * keeping the {@code HashMap}).
 */
private final Map<String, PartOfSpeech> partOfSpeechDictionary;

/** Length of the longest (lower-cased) word in the dictionary. */
private final int maxDictionaryWordLength;

/**
 * Loads the dictionary from the bundled classpath resource {@code DICTIONARY_FILE_PATH}.
 * Each non-blank line must have the form {@code word PART_OF_SPEECH_SEPARATOR code};
 * words are lower-cased (root locale) before storage.
 *
 * @throws AssertionError wrapping any failure — the resource ships inside the jar,
 *         so a missing or malformed file is a build/packaging bug, not a condition
 *         callers can recover from
 */
CategorizationPartOfSpeechDictionary() {
    try (InputStream is = CategorizationPartOfSpeechDictionary.class.getResourceAsStream(DICTIONARY_FILE_PATH)) {
        int maxLength = 0;
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8));
        String line;
        // Accumulate into a mutable map, then freeze it below with Map.copyOf.
        final Map<String, PartOfSpeech> partOfSpeechMap = new HashMap<>();
        while ((line = reader.readLine()) != null) {
            line = line.trim();
            if (line.isEmpty()) {
                continue;
            }
            String[] split = line.split(PART_OF_SPEECH_SEPARATOR);
            if (split.length != 2) {
                throw new IllegalArgumentException(
                    "Unexpected format in line [" + line + "]: expected one [" + PART_OF_SPEECH_SEPARATOR + "] separator"
                );
            }
            if (split[0].isEmpty()) {
                throw new IllegalArgumentException(
                    "Unexpected format in line [" + line + "]: nothing preceding [" + PART_OF_SPEECH_SEPARATOR + "] separator"
                );
            }
            if (split[1].isEmpty()) {
                throw new IllegalArgumentException(
                    "Unexpected format in line [" + line + "]: nothing following [" + PART_OF_SPEECH_SEPARATOR + "] separator"
                );
            }
            String lowerCaseWord = split[0].toLowerCase(Locale.ROOT);
            partOfSpeechMap.put(lowerCaseWord, PartOfSpeech.fromCode(split[1].charAt(0)));
            maxLength = Math.max(maxLength, lowerCaseWord.length());
        }
        partOfSpeechDictionary = Map.copyOf(partOfSpeechMap);
        maxDictionaryWordLength = maxLength;
    } catch (Exception e) {
        throw new AssertionError(e);
    }
}

// TODO: now we have this in Java, perform this operation in Java for anomaly detection categorization instead of in C++.
Expand All @@ -142,17 +148,7 @@ public boolean isInDictionary(String word) {
return getPartOfSpeech(word) != PartOfSpeech.NOT_IN_DICTIONARY;
}

public static CategorizationPartOfSpeechDictionary getInstance() throws IOException {
if (instance != null) {
return instance;
}
synchronized (INIT_LOCK) {
if (instance == null) {
try (InputStream is = CategorizationPartOfSpeechDictionary.class.getResourceAsStream(DICTIONARY_FILE_PATH)) {
instance = new CategorizationPartOfSpeechDictionary(is);
}
}
return instance;
}
public static CategorizationPartOfSpeechDictionary getInstance() {
return Holder.instance;
}
}

0 comments on commit e784083

Please sign in to comment.