Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2.x] Improved the logic to switch to exact search for restrictive filters search. #1060

Merged
merged 1 commit into from
Aug 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
### Features
### Enhancements
* Enabled the IVF algorithm to work with Filters of K-NN Query. [#1013](https://github.com/opensearch-project/k-NN/pull/1013)
* Improved the logic to switch to exact search for restrictive filters search for better recall. [#1059](https://github.com/opensearch-project/k-NN/pull/1059)
### Bug Fixes
### Infrastructure
### Documentation
### Maintenance
* Update Guava Version to 32.0.1 [#1019](https://github.com/opensearch-project/k-NN/pull/1019)
### Refactoring
* Fix TransportAddress Refactoring Changes in Core [#1020](https://github.com/opensearch-project/k-NN/pull/1020)
* Fix TransportAddress Refactoring Changes in Core [#1020](https://github.com/opensearch-project/k-NN/pull/1020)
49 changes: 48 additions & 1 deletion src/main/java/org/opensearch/knn/index/KNNSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ public class KNNSettings {
public static final String MODEL_INDEX_NUMBER_OF_SHARDS = "knn.model.index.number_of_shards";
public static final String MODEL_INDEX_NUMBER_OF_REPLICAS = "knn.model.index.number_of_replicas";
public static final String MODEL_CACHE_SIZE_LIMIT = "knn.model.cache.size.limit";
// Keys of the index-scoped settings that decide when a filtered k-NN search is answered with exact search:
// an absolute filtered-document count and a percentage of the documents in the segment.
public static final String ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD = "index.knn.advanced.filtered_exact_search_threshold";
public static final String ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT = "index.knn.advanced.filtered_exact_search_threshold_pct";

/**
* Default setting values
Expand All @@ -87,6 +89,9 @@ public class KNNSettings {
public static final Integer KNN_MAX_MODEL_CACHE_SIZE_LIMIT_PERCENTAGE = 25; // Model cache limit cannot exceed 25% of the JVM heap
public static final String KNN_DEFAULT_MEMORY_CIRCUIT_BREAKER_LIMIT = "50%";

// Defaults used when an index does not set the filtered exact search settings explicitly:
// a filtered-document count of 2000 and a percentage of 10.
public static final Integer ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_DEFAULT_VALUE = 2000;
public static final Integer ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT_DEFAULT_VALUE = 10;

/**
* Settings Definition
*/
Expand Down Expand Up @@ -154,6 +159,22 @@ public class KNNSettings {
Setting.Property.Dynamic
);

/**
 * Dynamic, index-scoped integer setting (minimum value 0) holding the filtered-document count up to which a
 * filtered k-NN search may be answered with exact search instead of ANN search.
 */
public static final Setting<Integer> ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_SETTING = Setting.intSetting(
ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD,
ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_DEFAULT_VALUE,
0,
IndexScope,
Setting.Property.Dynamic
);

/**
 * Dynamic, index-scoped integer setting holding the percentage of a segment's documents up to which a
 * filtered k-NN search may be answered with exact search instead of ANN search.
 * <p>
 * The value is a percentage, so it is bounded to the range [0, 100]; anything outside that range is rejected
 * when the setting is applied.
 */
public static final Setting<Integer> ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT_SETTING = Setting.intSetting(
    ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT,
    ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT_DEFAULT_VALUE,
    0,
    // upper bound: a percentage above 100 has no meaning
    100,
    IndexScope,
    Setting.Property.Dynamic
);

public static final Setting<ByteSizeValue> MODEL_CACHE_SIZE_LIMIT_SETTING = new Setting<>(
MODEL_CACHE_SIZE_LIMIT,
percentageAsString(KNN_DEFAULT_MODEL_CACHE_SIZE_LIMIT_PERCENTAGE),
Expand Down Expand Up @@ -323,6 +344,14 @@ private Setting<?> getSetting(String key) {
return KNN_ALGO_PARAM_INDEX_THREAD_QTY_SETTING;
}

if (ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD.equals(key)) {
return ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_SETTING;
}

if (ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT.equals(key)) {
return ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT_SETTING;
}

throw new IllegalArgumentException("Cannot find setting by key [" + key + "]");
}

Expand All @@ -338,7 +367,9 @@ public List<Setting<?>> getSettings() {
IS_KNN_INDEX_SETTING,
MODEL_INDEX_NUMBER_OF_SHARDS_SETTING,
MODEL_INDEX_NUMBER_OF_REPLICAS_SETTING,
MODEL_CACHE_SIZE_LIMIT_SETTING
MODEL_CACHE_SIZE_LIMIT_SETTING,
ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_SETTING,
ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT_SETTING
);
return Stream.concat(settings.stream(), dynamicCacheSettings.values().stream()).collect(Collectors.toList());
}
Expand All @@ -359,6 +390,22 @@ public static double getCircuitBreakerUnsetPercentage() {
return KNNSettings.state().getSettingValue(KNNSettings.KNN_CIRCUIT_BREAKER_UNSET_PERCENTAGE);
}

/**
 * Reads the filtered exact search threshold from the settings of the given index.
 *
 * @param indexName name of the index whose settings are consulted
 * @return the value stored under {@link #ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD}, or
 *         {@link #ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_DEFAULT_VALUE} when the index does not set it
 */
public static int getFilteredExactSearchThreshold(final String indexName) {
    // NOTE(review): index(indexName) presumably yields null for an index missing from the cluster metadata,
    // which would surface as a NullPointerException here - confirm callers only pass existing index names.
    final int configuredThreshold = state().clusterService.state()
        .getMetadata()
        .index(indexName)
        .getSettings()
        .getAsInt(ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD, ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_DEFAULT_VALUE);
    return configuredThreshold;
}

/**
 * Reads the filtered exact search percentage threshold from the settings of the given index.
 *
 * @param indexName name of the index whose settings are consulted
 * @return the value stored under {@link #ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT}, or
 *         {@link #ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT_DEFAULT_VALUE} when the index does not set it
 */
public static int getFilteredExactSearchThresholdPct(final String indexName) {
    // NOTE(review): index(indexName) presumably yields null for an index missing from the cluster metadata,
    // which would surface as a NullPointerException here - confirm callers only pass existing index names.
    final int configuredThresholdPct = state().clusterService.state()
        .getMetadata()
        .index(indexName)
        .getSettings()
        .getAsInt(ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT, ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT_DEFAULT_VALUE);
    return configuredThresholdPct;
}

public void initialize(Client client, ClusterService clusterService) {
this.client = client;
this.clusterService = clusterService;
Expand Down
44 changes: 41 additions & 3 deletions src/main/java/org/opensearch/knn/index/query/KNNWeight.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
import org.opensearch.knn.common.KNNConstants;
import org.opensearch.knn.index.KNNSettings;
import org.opensearch.knn.index.SpaceType;
import org.opensearch.knn.index.codec.util.KNNVectorSerializer;
import org.opensearch.knn.index.codec.util.KNNVectorSerializerFactory;
Expand Down Expand Up @@ -115,13 +116,16 @@ public Scorer scorer(LeafReaderContext context) throws IOException {
* . Hence, if filtered results are less than K and filter query is present we should shift to exact search.
* This improves the recall.
*/
if (filterWeight != null && filterIdsArray.length <= knnQuery.getK()) {
if (filterWeight != null && canDoExactSearch(filterIdsArray.length, getTotalDocsInSegment(context))) {
docIdsToScoreMap.putAll(doExactSearch(context, filterIdsArray));
} else {
final Map<Integer, Float> annResults = doANNSearch(context, filterIdsArray);
Map<Integer, Float> annResults = doANNSearch(context, filterIdsArray);
if (annResults == null) {
return null;
}
if (canDoExactSearchAfterANNSearch(filterIdsArray.length, annResults.size())) {
annResults = doExactSearch(context, filterIdsArray);
}
docIdsToScoreMap.putAll(annResults);
}
if (docIdsToScoreMap.isEmpty()) {
Expand Down Expand Up @@ -170,7 +174,6 @@ private int[] getFilterIdsArray(final LeafReaderContext context) throws IOExcept
if (docId == DocIdSetIterator.NO_MORE_DOCS || docId + 1 == DocIdSetIterator.NO_MORE_DOCS) {
break;
}
log.debug("Docs in filtered docs id set is : {}", docId);
filteredIds[filteredIdsIndex] = docId;
filteredIdsIndex++;
docId++;
Expand Down Expand Up @@ -369,4 +372,39 @@ private SpaceType getSpaceType(final FieldInfo fieldInfo) {
String.format(Locale.ROOT, "Unable to find the Space Type from Field Info attribute for field %s", fieldInfo.getName())
);
}

/**
 * Decides whether a filtered query on this segment should be answered with exact search instead of ANN search.
 * Exact search is chosen when the filter matches at most K documents, or when the filtered document count is
 * within both the absolute threshold and the percentage threshold configured on the index.
 * See https://github.com/opensearch-project/k-NN/issues/1049 for more details on the logic.
 *
 * @param filterIdsCount number of documents matched by the filter in this segment
 * @param searchableDocs document count of the segment that the filtered count is compared against
 * @return true if exact search should be used
 */
private boolean canDoExactSearch(final int filterIdsCount, final int searchableDocs) {
    // Resolve each index setting once per call: the values are needed for the debug log regardless of the
    // outcome, and every lookup goes through the cluster state metadata.
    final String indexName = knnQuery.getIndexName();
    final int filterThreshold = KNNSettings.getFilteredExactSearchThreshold(indexName);
    final int filterThresholdPct = KNNSettings.getFilteredExactSearchThresholdPct(indexName);
    log.debug(
        "Info for doing exact search Live Docs: {}, filterIdsLength : {}, Threshold value: {} , Threshold %age : {}",
        searchableDocs,
        filterIdsCount,
        filterThreshold,
        filterThresholdPct
    );
    // With at most K filtered documents, exact search over them is always sufficient.
    if (filterIdsCount <= knnQuery.getK()) {
        return true;
    }
    final float filteredPct = ((float) filterIdsCount / (float) searchableDocs) * 100;
    return filterIdsCount <= filterThreshold && filteredPct <= (float) filterThresholdPct;
}

/**
 * Determines whether exact search has to be run after a filtered ANN search. That is the case when a filter
 * is present, the filter matched at least K documents, and the ANN search still yielded fewer than K
 * nearest neighbors.
 * @param filterIdsCount count of filtered doc ids
 * @param annResultCount count of nearest neighbors returned by the filtered ANN search
 * @return boolean - true if exact search needs to be done after the ANN search
 */
private boolean canDoExactSearchAfterANNSearch(final int filterIdsCount, final int annResultCount) {
    // Without a filter there is nothing to fall back on.
    if (filterWeight == null) {
        return false;
    }
    final int k = knnQuery.getK();
    return filterIdsCount >= k && annResultCount < k;
}

/**
 * Returns the document count of the segment: {@code maxDoc()} when the reader exposes no live-docs bitset
 * (no deleted documents), otherwise the {@code length()} of that bitset.
 *
 * @param context leaf reader context of the segment
 * @return the document count described above
 */
private int getTotalDocsInSegment(final LeafReaderContext context) {
    // NOTE(review): length() is the size of the live-docs bitset, not the number of set (live) bits, so it
    // presumably equals maxDoc() - confirm whether numDocs() was intended when deletions exist.
    return context.reader().getLiveDocs() == null
        ? context.reader().maxDoc()
        : context.reader().getLiveDocs().length();
}
}
85 changes: 85 additions & 0 deletions src/test/java/org/opensearch/knn/index/KNNSettingsTests.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
package org.opensearch.knn.index;

import lombok.SneakyThrows;
import org.junit.Assert;
import org.opensearch.action.admin.cluster.state.ClusterStateRequest;
import org.opensearch.action.admin.indices.create.CreateIndexRequest;
import org.opensearch.action.admin.indices.settings.put.UpdateSettingsRequest;
import org.opensearch.cluster.ClusterName;
import org.opensearch.cluster.service.ClusterService;
import org.opensearch.common.network.NetworkModule;
Expand Down Expand Up @@ -33,6 +37,8 @@

public class KNNSettingsTests extends KNNTestCase {

private static final String INDEX_NAME = "myindex";

@SneakyThrows
public void testGetSettingValueFromConfig() {
long expectedKNNCircuitBreakerLimit = 13;
Expand Down Expand Up @@ -70,6 +76,85 @@ public void testGetSettingValueDefault() {
assertWarnings();
}

/**
 * Verifies that both filtered exact search settings resolve to their default values for an index that was
 * created without setting them.
 */
@SneakyThrows
public void testFilteredSearchAdvanceSetting_whenNoValuesProvidedByUsers_thenDefaultSettingsUsed() {
    int filteredSearchThresholdPct;
    int filteredSearchThreshold;
    final Node mockNode = createMockNode(Collections.emptyMap());
    // Close the node even when startup, index creation or a settings lookup throws, so a failing run does
    // not leave a node behind.
    try {
        mockNode.start();
        ClusterService clusterService = mockNode.injector().getInstance(ClusterService.class);
        mockNode.client().admin().cluster().state(new ClusterStateRequest()).actionGet();
        mockNode.client().admin().indices().create(new CreateIndexRequest(INDEX_NAME)).actionGet();
        KNNSettings.state().setClusterService(clusterService);

        filteredSearchThresholdPct = KNNSettings.getFilteredExactSearchThresholdPct(INDEX_NAME);
        filteredSearchThreshold = KNNSettings.getFilteredExactSearchThreshold(INDEX_NAME);
    } finally {
        mockNode.close();
    }
    assertEquals((int) KNNSettings.ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT_DEFAULT_VALUE, filteredSearchThresholdPct);
    assertEquals((int) KNNSettings.ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_DEFAULT_VALUE, filteredSearchThreshold);
    assertWarnings();
}

/**
 * Verifies that user-provided values for both filtered exact search settings are returned by the getters,
 * that the minimum value 0 is accepted, and that values below the minimum are rejected.
 */
@SneakyThrows
public void testFilteredSearchAdvanceSetting_whenValuesProvidedByUsers_thenValidateSameValues() {
    int userDefinedPctThreshold = 20;
    int userDefinedThreshold = 1000;
    int userDefinedPctThresholdMinValue = 0;
    int userDefinedThresholdMinValue = 0;

    int filteredSearchThresholdPct;
    int filteredSearchThreshold;
    int filteredSearchThresholdPctMinValue;
    int filteredSearchThresholdMinValue;

    final Node mockNode = createMockNode(Collections.emptyMap());
    // Close the node even when a request or the assertThrows check fails, so a failing run does not leave a
    // node behind.
    try {
        mockNode.start();
        ClusterService clusterService = mockNode.injector().getInstance(ClusterService.class);
        mockNode.client().admin().cluster().state(new ClusterStateRequest()).actionGet();
        mockNode.client().admin().indices().create(new CreateIndexRequest(INDEX_NAME)).actionGet();
        KNNSettings.state().setClusterService(clusterService);

        final Settings filteredSearchAdvanceSettings = Settings.builder()
            .put(KNNSettings.ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD, userDefinedThreshold)
            .put(KNNSettings.ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT, userDefinedPctThreshold)
            .build();

        mockNode.client()
            .admin()
            .indices()
            .updateSettings(new UpdateSettingsRequest(filteredSearchAdvanceSettings, INDEX_NAME))
            .actionGet();

        filteredSearchThresholdPct = KNNSettings.getFilteredExactSearchThresholdPct(INDEX_NAME);
        filteredSearchThreshold = KNNSettings.getFilteredExactSearchThreshold(INDEX_NAME);

        // validate if we are able to set MinValues for the setting
        final Settings filteredSearchAdvanceSettingsWithMinValues = Settings.builder()
            .put(KNNSettings.ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD, userDefinedThresholdMinValue)
            .put(KNNSettings.ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT, userDefinedPctThresholdMinValue)
            .build();

        mockNode.client()
            .admin()
            .indices()
            .updateSettings(new UpdateSettingsRequest(filteredSearchAdvanceSettingsWithMinValues, INDEX_NAME))
            .actionGet();

        filteredSearchThresholdPctMinValue = KNNSettings.getFilteredExactSearchThresholdPct(INDEX_NAME);
        filteredSearchThresholdMinValue = KNNSettings.getFilteredExactSearchThreshold(INDEX_NAME);

        // Validate that setting values below the minimum raises an exception
        final Settings filteredSearchAdvanceSettingsWithLessThanMinValues = Settings.builder()
            .put(KNNSettings.ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD, -1)
            .put(KNNSettings.ADVANCED_FILTERED_EXACT_SEARCH_THRESHOLD_PCT, -1)
            .build();

        Assert.assertThrows(IllegalArgumentException.class, () -> mockNode.client()
            .admin()
            .indices()
            .updateSettings(new UpdateSettingsRequest(filteredSearchAdvanceSettingsWithLessThanMinValues, INDEX_NAME))
            .actionGet());
    } finally {
        mockNode.close();
    }

    assertEquals(userDefinedPctThreshold, filteredSearchThresholdPct);
    assertEquals(userDefinedThreshold, filteredSearchThreshold);
    assertEquals(userDefinedPctThresholdMinValue, filteredSearchThresholdPctMinValue);
    assertEquals(userDefinedThresholdMinValue, filteredSearchThresholdMinValue);
    assertWarnings();
}

private Node createMockNode(Map<String, Object> configSettings) throws IOException {
Path configDir = createTempDir();
File configFile = configDir.resolve("opensearch.yml").toFile();
Expand Down
Loading