diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 8f9f72c6b..5c7240c8d 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -8,6 +8,7 @@ _List any issues this PR will resolve, e.g. Resolves [...]._
 - [ ] Updated documentation (docs/ppl-lang/README.md)
 - [ ] Implemented unit tests
 - [ ] Implemented tests for combination with other commands
+- [ ] Newly added source code includes a copyright header
 - [ ] Commits are signed per the DCO using `--signoff`
 
 By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license.
diff --git a/docs/index.md b/docs/index.md
index bb3121ba6..e76cb387a 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -549,6 +549,7 @@ In the index mapping, the `_meta` and `properties`field stores meta and schema i
 - `spark.flint.monitor.initialDelaySeconds`: Initial delay in seconds before starting the monitoring task. Default value is 15.
 - `spark.flint.monitor.intervalSeconds`: Interval in seconds for scheduling the monitoring task. Default value is 60.
 - `spark.flint.monitor.maxErrorCount`: Maximum number of consecutive errors allowed before stopping the monitoring task. Default value is 5.
+- `spark.flint.metadataCacheWrite.enabled`: Enable writing metadata to the index mapping `_meta` field as a read cache for frontend users. Default value is false. Do not use in production; this setting will be removed in a later version.
 
 #### Data Type Mapping
diff --git a/docs/ppl-lang/PPL-Example-Commands.md b/docs/ppl-lang/PPL-Example-Commands.md
index 8e796c6fb..01c3f1619 100644
--- a/docs/ppl-lang/PPL-Example-Commands.md
+++ b/docs/ppl-lang/PPL-Example-Commands.md
@@ -55,6 +55,9 @@ _- **Limitation: new field added by eval command with a function cannot be dropp
 - `source = table | where isempty(a)`
 - `source = table | where isblank(a)`
 - `source = table | where case(length(a) > 6, 'True' else 'False') = 'True'`
+- `source = table | where a not in (1, 2, 3) | fields a,b,c`
+- `source = table | where a between 1 and 4` - Note: This returns a >= 1 and a <= 4, i.e. the inclusive range [1, 4]
+- `source = table | where b not between '2024-09-10' and '2025-09-10'` - Note: This returns b < '2024-09-10' or b > '2025-09-10', i.e. excludes the inclusive range ['2024-09-10', '2025-09-10']
 
 ```sql
 source = table | eval status_category =
@@ -431,7 +434,7 @@ _- **Limitation: another command usage of (relation) subquery is in `appendcols`
 ### Planned Commands:
 #### **fillnull**
-
+[See additional command details](ppl-fillnull-command.md)
 ```sql
 - `source=accounts | fillnull fields status_code=101`
 - `source=accounts | fillnull fields request_path='/not_found', timestamp='*'`
@@ -439,4 +442,3 @@ _- **Limitation: another command usage of (relation) subquery is in `appendcols`
 - `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5`
 - `source=accounts | fillnull using field1=concat(field2, field3), field4=2*pi()*field5, field6 = 'N/A'`
 ```
-[See additional command details](planning/ppl-fillnull-command.md)
diff --git a/docs/ppl-lang/functions/ppl-expressions.md b/docs/ppl-lang/functions/ppl-expressions.md
index 6315663c2..171f97385 100644
--- a/docs/ppl-lang/functions/ppl-expressions.md
+++ b/docs/ppl-lang/functions/ppl-expressions.md
@@ -127,7 +127,7 @@ OR operator :
 NOT operator :
 
-    os> source=accounts | where not age in (32, 33) | fields age ;
+    os> source=accounts | where age not in (32, 33) | fields age ;
     fetched rows / total rows = 2/2
     +-------+
     | age   |
diff --git a/docs/ppl-lang/planning/ppl-between.md b/docs/ppl-lang/planning/ppl-between.md
new file mode 100644
index 000000000..6c8e300e8
--- /dev/null
+++ b/docs/ppl-lang/planning/ppl-between.md
@@ -0,0 +1,17 @@
+## between syntax proposal
+
+1. **Proposed syntax**
+   - `... | where expr1 [NOT] BETWEEN expr2 AND expr3`
+   - Evaluates whether expr1 is [not] between expr2 and expr3 (bounds inclusive)
+   - `... | where a between 1 and 4` - Note: This returns a >= 1 and a <= 4, i.e. the inclusive range [1, 4]
+   - `... | where b not between '2024-09-10' and '2025-09-10'` - Note: This returns b < '2024-09-10' or b > '2025-09-10'
+
+### New syntax definition in ANTLR
+
+```ANTLR
+
+logicalExpression
+   ...
+   | expr1 = functionArg NOT? BETWEEN expr2 = functionArg AND expr3 = functionArg # between
+
+```
\ No newline at end of file
diff --git a/docs/ppl-lang/ppl-eval-command.md b/docs/ppl-lang/ppl-eval-command.md
index cd0898c1b..1908c087c 100644
--- a/docs/ppl-lang/ppl-eval-command.md
+++ b/docs/ppl-lang/ppl-eval-command.md
@@ -80,6 +80,8 @@ Assumptions: `a`, `b`, `c` are existing fields in `table`
 - `source = table | eval f = case(a = 0, 'zero', a = 1, 'one', a = 2, 'two', a = 3, 'three', a = 4, 'four', a = 5, 'five', a = 6, 'six', a = 7, 'se7en', a = 8, 'eight', a = 9, 'nine')`
 - `source = table | eval f = case(a = 0, 'zero', a = 1, 'one' else 'unknown')`
 - `source = table | eval f = case(a = 0, 'zero', a = 1, 'one' else concat(a, ' is an incorrect binary digit'))`
+- `source = table | eval f = a in ('foo', 'bar') | fields f`
+- `source = table | eval f = a not in ('foo', 'bar') | fields f`
 
 Eval with `case` example:
 ```sql
diff --git a/docs/ppl-lang/ppl-fillnull-command.md b/docs/ppl-lang/ppl-fillnull-command.md
new file mode 100644
index 000000000..00064849c
--- /dev/null
+++ b/docs/ppl-lang/ppl-fillnull-command.md
@@ -0,0 +1,92 @@
+## PPL `fillnull` command
+
+### Description
+Use the `fillnull` command to replace `null` values with a provided value in one or more fields of the search result.
+
+
+### Syntax
+`fillnull [with <null-replacement> in <nullable-field> ["," <nullable-field>]] | [using <source-field> = <null-replacement> ["," <source-field> = <null-replacement>]]`
+
+* null-replacement: mandatory. The value used to replace `null`s.
+* nullable-field: mandatory. Field reference.
The `null` values in the field referred to by the property will be replaced with the values from the null-replacement. + + +### Example 1: fillnull one field + +The example show fillnull one field. + +PPL query: + + os> source=logs | fields status_code | eval input=status_code | fillnull value = 0 status_code; +| input | status_code | +|-------|-------------| +| 403 | 403 | +| 403 | 403 | +| NULL | 0 | +| NULL | 0 | +| 200 | 200 | +| 404 | 404 | +| 500 | 500 | +| NULL | 0 | +| 500 | 500 | +| 404 | 404 | +| 200 | 200 | +| 500 | 500 | +| NULL | 0 | +| NULL | 0 | +| 404 | 404 | + + +### Example 2: fillnull applied to multiple fields + +The example show fillnull applied to multiple fields. + +PPL query: + + os> source=logs | fields request_path, timestamp | eval input_request_path=request_path, input_timestamp = timestamp | fillnull value = '???' request_path, timestamp; +| input_request_path | input_timestamp | request_path | timestamp | +|--------------------|-----------------------|--------------|------------------------| +| /contact | NULL | /contact | ??? | +| /home | NULL | /home | ??? | +| /about | 2023-10-01 10:30:00 | /about | 2023-10-01 10:30:00 | +| /home | 2023-10-01 10:15:00 | /home | 2023-10-01 10:15:00 | +| NULL | 2023-10-01 10:20:00 | ??? | 2023-10-01 10:20:00 | +| NULL | 2023-10-01 11:05:00 | ??? | 2023-10-01 11:05:00 | +| /about | NULL | /about | ??? | +| /home | 2023-10-01 10:00:00 | /home | 2023-10-01 10:00:00 | +| /contact | NULL | /contact | ??? | +| NULL | 2023-10-01 10:05:00 | ??? | 2023-10-01 10:05:00 | +| NULL | 2023-10-01 10:50:00 | ??? | 2023-10-01 10:50:00 | +| /services | NULL | /services | ??? | +| /home | 2023-10-01 10:45:00 | /home | 2023-10-01 10:45:00 | +| /services | 2023-10-01 11:00:00 | /services | 2023-10-01 11:00:00 | +| NULL | 2023-10-01 10:35:00 | ??? | 2023-10-01 10:35:00 | + +### Example 3: fillnull applied to multiple fields with various `null` replacement values + +The example show fillnull with various values used to replace `null`s. 
+- `/error` in `request_path` field +- `1970-01-01 00:00:00` in `timestamp` field + +PPL query: + + os> source=logs | fields request_path, timestamp | eval input_request_path=request_path, input_timestamp = timestamp | fillnull using request_path = '/error', timestamp='1970-01-01 00:00:00'; + + +| input_request_path | input_timestamp | request_path | timestamp | +|--------------------|-----------------------|--------------|------------------------| +| /contact | NULL | /contact | 1970-01-01 00:00:00 | +| /home | NULL | /home | 1970-01-01 00:00:00 | +| /about | 2023-10-01 10:30:00 | /about | 2023-10-01 10:30:00 | +| /home | 2023-10-01 10:15:00 | /home | 2023-10-01 10:15:00 | +| NULL | 2023-10-01 10:20:00 | /error | 2023-10-01 10:20:00 | +| NULL | 2023-10-01 11:05:00 | /error | 2023-10-01 11:05:00 | +| /about | NULL | /about | 1970-01-01 00:00:00 | +| /home | 2023-10-01 10:00:00 | /home | 2023-10-01 10:00:00 | +| /contact | NULL | /contact | 1970-01-01 00:00:00 | +| NULL | 2023-10-01 10:05:00 | /error | 2023-10-01 10:05:00 | +| NULL | 2023-10-01 10:50:00 | /error | 2023-10-01 10:50:00 | +| /services | NULL | /services | 1970-01-01 00:00:00 | +| /home | 2023-10-01 10:45:00 | /home | 2023-10-01 10:45:00 | +| /services | 2023-10-01 11:00:00 | /services | 2023-10-01 11:00:00 | +| NULL | 2023-10-01 10:35:00 | /error | 2023-10-01 10:35:00 | \ No newline at end of file diff --git a/docs/ppl-lang/ppl-where-command.md b/docs/ppl-lang/ppl-where-command.md index 94ddc1f5c..89a7e61fa 100644 --- a/docs/ppl-lang/ppl-where-command.md +++ b/docs/ppl-lang/ppl-where-command.md @@ -41,6 +41,8 @@ PPL query: - `source = table | where isempty(a)` - `source = table | where isblank(a)` - `source = table | where case(length(a) > 6, 'True' else 'False') = 'True'` +- `source = table | where a between 1 and 4` - Note: This returns a >= 1 and a <= 4, i.e. [1, 4] +- `source = table | where b not between '2024-09-10' and '2025-09-10'` - Note: This returns b >= '2024-09-10' and b <= '2025-09-10' - `source = table | eval status_category = case(a >= 200 AND a < 300, 'Success', diff --git a/flint-commons/src/main/scala/org/opensearch/flint/common/metadata/log/FlintMetadataLogEntry.scala b/flint-commons/src/main/scala/org/opensearch/flint/common/metadata/log/FlintMetadataLogEntry.scala index 982b7df23..1cae64b83 100644 --- a/flint-commons/src/main/scala/org/opensearch/flint/common/metadata/log/FlintMetadataLogEntry.scala +++ b/flint-commons/src/main/scala/org/opensearch/flint/common/metadata/log/FlintMetadataLogEntry.scala @@ -18,6 +18,10 @@ import org.opensearch.flint.common.metadata.log.FlintMetadataLogEntry.IndexState * log entry id * @param state * Flint index state + * @param lastRefreshStartTime + * timestamp when last refresh started for manual or external scheduler refresh + * @param lastRefreshCompleteTime + * timestamp when last refresh completed for manual or external scheduler refresh * @param entryVersion * entry version fields for consistency control * @param error @@ -28,10 +32,12 @@ import org.opensearch.flint.common.metadata.log.FlintMetadataLogEntry.IndexState case class FlintMetadataLogEntry( id: String, /** - * This is currently used as streaming job start time. In future, this should represent the - * create timestamp of the log entry + * This is currently used as streaming job start time for internal scheduler. 
In future, this + * should represent the create timestamp of the log entry */ createTime: Long, + lastRefreshStartTime: Long, + lastRefreshCompleteTime: Long, state: IndexState, entryVersion: Map[String, Any], error: String, @@ -40,26 +46,48 @@ case class FlintMetadataLogEntry( def this( id: String, createTime: Long, + lastRefreshStartTime: Long, + lastRefreshCompleteTime: Long, state: IndexState, entryVersion: JMap[String, Any], error: String, properties: JMap[String, Any]) = { - this(id, createTime, state, entryVersion.asScala.toMap, error, properties.asScala.toMap) + this( + id, + createTime, + lastRefreshStartTime, + lastRefreshCompleteTime, + state, + entryVersion.asScala.toMap, + error, + properties.asScala.toMap) } def this( id: String, createTime: Long, + lastRefreshStartTime: Long, + lastRefreshCompleteTime: Long, state: IndexState, entryVersion: JMap[String, Any], error: String, properties: Map[String, Any]) = { - this(id, createTime, state, entryVersion.asScala.toMap, error, properties) + this( + id, + createTime, + lastRefreshStartTime, + lastRefreshCompleteTime, + state, + entryVersion.asScala.toMap, + error, + properties) } } object FlintMetadataLogEntry { + val EMPTY_TIMESTAMP = 0L + /** * Flint index state enum. */ diff --git a/flint-core/src/main/java/org/opensearch/flint/core/IRestHighLevelClient.java b/flint-core/src/main/java/org/opensearch/flint/core/IRestHighLevelClient.java index 04ef216c4..9facd89ef 100644 --- a/flint-core/src/main/java/org/opensearch/flint/core/IRestHighLevelClient.java +++ b/flint-core/src/main/java/org/opensearch/flint/core/IRestHighLevelClient.java @@ -87,6 +87,10 @@ static void recordOperationSuccess(String metricNamePrefix) { MetricsUtil.incrementCounter(successMetricName); } + static void recordLatency(String metricNamePrefix, long latencyMilliseconds) { + MetricsUtil.addHistoricGauge(metricNamePrefix + ".processingTime", latencyMilliseconds); + } + /** * Records the failure of an OpenSearch operation by incrementing the corresponding metric counter. * If the exception is an OpenSearchException with a specific status code (e.g., 403), @@ -107,6 +111,8 @@ static void recordOperationFailure(String metricNamePrefix, Exception e) { if (statusCode == 403) { String forbiddenErrorMetricName = metricNamePrefix + ".403.count"; MetricsUtil.incrementCounter(forbiddenErrorMetricName); + } else if (statusCode == 429) { + MetricsUtil.incrementCounter(metricNamePrefix + ".429.count"); } String failureMetricName = metricNamePrefix + "." 
+ (statusCode / 100) + "xx.count"; diff --git a/flint-core/src/main/java/org/opensearch/flint/core/RestHighLevelClientWrapper.java b/flint-core/src/main/java/org/opensearch/flint/core/RestHighLevelClientWrapper.java index 1b83f032a..31f012256 100644 --- a/flint-core/src/main/java/org/opensearch/flint/core/RestHighLevelClientWrapper.java +++ b/flint-core/src/main/java/org/opensearch/flint/core/RestHighLevelClientWrapper.java @@ -5,6 +5,8 @@ package org.opensearch.flint.core; +import java.util.Arrays; +import java.util.function.Consumer; import org.opensearch.action.admin.indices.delete.DeleteIndexRequest; import org.opensearch.action.bulk.BulkRequest; import org.opensearch.action.bulk.BulkResponse; @@ -40,7 +42,10 @@ import org.opensearch.flint.core.storage.BulkRequestRateLimiter; import org.opensearch.flint.core.storage.OpenSearchBulkRetryWrapper; +import static org.opensearch.flint.core.metrics.MetricConstants.OS_BULK_OP_METRIC_PREFIX; +import static org.opensearch.flint.core.metrics.MetricConstants.OS_CREATE_OP_METRIC_PREFIX; import static org.opensearch.flint.core.metrics.MetricConstants.OS_READ_OP_METRIC_PREFIX; +import static org.opensearch.flint.core.metrics.MetricConstants.OS_SEARCH_OP_METRIC_PREFIX; import static org.opensearch.flint.core.metrics.MetricConstants.OS_WRITE_OP_METRIC_PREFIX; /** @@ -67,112 +72,126 @@ public RestHighLevelClientWrapper(RestHighLevelClient client, BulkRequestRateLim @Override public BulkResponse bulk(BulkRequest bulkRequest, RequestOptions options) throws IOException { - return execute(OS_WRITE_OP_METRIC_PREFIX, () -> { + return execute(() -> { try { rateLimiter.acquirePermit(); return bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options); } catch (InterruptedException e) { throw new RuntimeException("rateLimiter.acquirePermit was interrupted.", e); } - }); + }, OS_WRITE_OP_METRIC_PREFIX, OS_BULK_OP_METRIC_PREFIX); } @Override public ClearScrollResponse clearScroll(ClearScrollRequest clearScrollRequest, RequestOptions options) throws IOException { - return execute(OS_READ_OP_METRIC_PREFIX, () -> client.clearScroll(clearScrollRequest, options)); + return execute(() -> client.clearScroll(clearScrollRequest, options), + OS_READ_OP_METRIC_PREFIX); } @Override public CreateIndexResponse createIndex(CreateIndexRequest createIndexRequest, RequestOptions options) throws IOException { - return execute(OS_WRITE_OP_METRIC_PREFIX, () -> client.indices().create(createIndexRequest, options)); + return execute(() -> client.indices().create(createIndexRequest, options), + OS_WRITE_OP_METRIC_PREFIX, OS_CREATE_OP_METRIC_PREFIX); } @Override public void updateIndexMapping(PutMappingRequest putMappingRequest, RequestOptions options) throws IOException { - execute(OS_WRITE_OP_METRIC_PREFIX, () -> client.indices().putMapping(putMappingRequest, options)); + execute(() -> client.indices().putMapping(putMappingRequest, options), + OS_WRITE_OP_METRIC_PREFIX); } @Override public void deleteIndex(DeleteIndexRequest deleteIndexRequest, RequestOptions options) throws IOException { - execute(OS_WRITE_OP_METRIC_PREFIX, () -> client.indices().delete(deleteIndexRequest, options)); + execute(() -> client.indices().delete(deleteIndexRequest, options), + OS_WRITE_OP_METRIC_PREFIX); } @Override public DeleteResponse delete(DeleteRequest deleteRequest, RequestOptions options) throws IOException { - return execute(OS_WRITE_OP_METRIC_PREFIX, () -> client.delete(deleteRequest, options)); + return execute(() -> client.delete(deleteRequest, options), OS_WRITE_OP_METRIC_PREFIX); } 
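
A condensed, self-contained sketch of the pattern the wrapper methods above and below are being converted to: every OpenSearch call is routed through a single `execute` helper that takes the operation plus one or more metric-name prefixes and records success, failure, and latency once per prefix. The class name and the static metric-recorder stubs here are illustrative stand-ins (the real helper and its `IOCallable` interface appear further down in this diff, with type parameters written out explicitly here):

```java
import java.io.IOException;
import java.util.Arrays;
import java.util.function.Consumer;

class MetricInstrumentedCallSketch {

    // Functional interface for operations that can throw IOException.
    @FunctionalInterface
    interface IOCallable<T> {
        T call() throws IOException;
    }

    // Runs the operation and emits success/failure/latency metrics for each prefix.
    <T> T execute(IOCallable<T> operation, String... metricNamePrefixes) throws IOException {
        long startTime = System.currentTimeMillis();
        try {
            T result = operation.call();
            eachPrefix(MetricInstrumentedCallSketch::recordOperationSuccess, metricNamePrefixes);
            return result;
        } catch (Exception e) {
            eachPrefix(prefix -> recordOperationFailure(prefix, e), metricNamePrefixes);
            throw e;
        } finally {
            long latency = System.currentTimeMillis() - startTime;
            eachPrefix(prefix -> recordLatency(prefix, latency), metricNamePrefixes);
        }
    }

    static void eachPrefix(Consumer<String> fn, String... prefixes) {
        Arrays.stream(prefixes).forEach(fn);
    }

    // Stand-ins for IRestHighLevelClient's static helpers.
    static void recordOperationSuccess(String prefix) { /* increments a success counter for the prefix */ }
    static void recordOperationFailure(String prefix, Exception e) { /* increments per-status counters, e.g. "<prefix>.4xx.count" */ }
    static void recordLatency(String prefix, long latencyMs) { /* adds latencyMs to the "<prefix>.processingTime" historic gauge */ }
}
```

With this shape, `search` can report under both `opensearch.read` and `opensearch.search`, while `bulk` reports under `opensearch.write` and `opensearch.bulk`, simply by passing multiple prefixes.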
@Override public GetResponse get(GetRequest getRequest, RequestOptions options) throws IOException { - return execute(OS_READ_OP_METRIC_PREFIX, () -> client.get(getRequest, options)); + return execute(() -> client.get(getRequest, options), OS_READ_OP_METRIC_PREFIX); } @Override public GetIndexResponse getIndex(GetIndexRequest getIndexRequest, RequestOptions options) throws IOException { - return execute(OS_READ_OP_METRIC_PREFIX, () -> client.indices().get(getIndexRequest, options)); + return execute(() -> client.indices().get(getIndexRequest, options), + OS_READ_OP_METRIC_PREFIX); } @Override public IndexResponse index(IndexRequest indexRequest, RequestOptions options) throws IOException { - return execute(OS_WRITE_OP_METRIC_PREFIX, () -> client.index(indexRequest, options)); + return execute(() -> client.index(indexRequest, options), OS_WRITE_OP_METRIC_PREFIX); } @Override public Boolean doesIndexExist(GetIndexRequest getIndexRequest, RequestOptions options) throws IOException { - return execute(OS_READ_OP_METRIC_PREFIX, () -> client.indices().exists(getIndexRequest, options)); + return execute(() -> client.indices().exists(getIndexRequest, options), + OS_READ_OP_METRIC_PREFIX); } @Override public SearchResponse search(SearchRequest searchRequest, RequestOptions options) throws IOException { - return execute(OS_READ_OP_METRIC_PREFIX, () -> client.search(searchRequest, options)); + return execute(() -> client.search(searchRequest, options), OS_READ_OP_METRIC_PREFIX, OS_SEARCH_OP_METRIC_PREFIX); } @Override public SearchResponse scroll(SearchScrollRequest searchScrollRequest, RequestOptions options) throws IOException { - return execute(OS_READ_OP_METRIC_PREFIX, () -> client.scroll(searchScrollRequest, options)); + return execute(() -> client.scroll(searchScrollRequest, options), OS_READ_OP_METRIC_PREFIX); } @Override public UpdateResponse update(UpdateRequest updateRequest, RequestOptions options) throws IOException { - return execute(OS_WRITE_OP_METRIC_PREFIX, () -> client.update(updateRequest, options)); + return execute(() -> client.update(updateRequest, options), OS_WRITE_OP_METRIC_PREFIX); } - @Override - public IndicesStatsResponse stats(IndicesStatsRequest request) throws IOException { - return execute(OS_READ_OP_METRIC_PREFIX, - () -> { - OpenSearchClient openSearchClient = - new OpenSearchClient(new RestClientTransport(client.getLowLevelClient(), - new JacksonJsonpMapper())); - return openSearchClient.indices().stats(request); - }); - } + @Override + public IndicesStatsResponse stats(IndicesStatsRequest request) throws IOException { + return execute(() -> { + OpenSearchClient openSearchClient = + new OpenSearchClient(new RestClientTransport(client.getLowLevelClient(), + new JacksonJsonpMapper())); + return openSearchClient.indices().stats(request); + }, OS_READ_OP_METRIC_PREFIX + ); + } @Override public CreatePitResponse createPit(CreatePitRequest request) throws IOException { - return execute(OS_WRITE_OP_METRIC_PREFIX, () -> openSearchClient().createPit(request)); + return execute(() -> openSearchClient().createPit(request), OS_WRITE_OP_METRIC_PREFIX); } /** * Executes a given operation, tracks metrics, and handles exceptions. 
* - * @param metricNamePrefix the prefix for the metric name - * @param operation the operation to execute * @param the return type of the operation + * @param operation the operation to execute + * @param metricNamePrefixes array of prefixes for the metric name * @return the result of the operation * @throws IOException if an I/O exception occurs */ - private T execute(String metricNamePrefix, IOCallable operation) throws IOException { + private T execute(IOCallable operation, String... metricNamePrefixes) throws IOException { + long startTime = System.currentTimeMillis(); try { T result = operation.call(); - IRestHighLevelClient.recordOperationSuccess(metricNamePrefix); + eachPrefix(IRestHighLevelClient::recordOperationSuccess, metricNamePrefixes); return result; } catch (Exception e) { - IRestHighLevelClient.recordOperationFailure(metricNamePrefix, e); + eachPrefix(prefix -> IRestHighLevelClient.recordOperationFailure(prefix, e), metricNamePrefixes); throw e; + } finally { + long latency = System.currentTimeMillis() - startTime; + eachPrefix(prefix -> IRestHighLevelClient.recordLatency(prefix, latency), metricNamePrefixes); } } + private static void eachPrefix(Consumer fn, String... prefixes) { + Arrays.stream(prefixes).forEach(fn); + } + /** * Functional interface for operations that can throw IOException. * diff --git a/flint-core/src/main/java/org/opensearch/flint/core/metrics/HistoricGauge.java b/flint-core/src/main/java/org/opensearch/flint/core/metrics/HistoricGauge.java index 181bf8575..8e5288110 100644 --- a/flint-core/src/main/java/org/opensearch/flint/core/metrics/HistoricGauge.java +++ b/flint-core/src/main/java/org/opensearch/flint/core/metrics/HistoricGauge.java @@ -6,6 +6,7 @@ package org.opensearch.flint.core.metrics; import com.codahale.metrics.Gauge; +import com.google.common.annotations.VisibleForTesting; import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; @@ -60,4 +61,9 @@ public List pollDataPoints() { } return result; } + + @VisibleForTesting + public List getDataPoints() { + return dataPoints; + } } diff --git a/flint-core/src/main/java/org/opensearch/flint/core/metrics/MetricConstants.java b/flint-core/src/main/java/org/opensearch/flint/core/metrics/MetricConstants.java index 796186f20..ecf613f6c 100644 --- a/flint-core/src/main/java/org/opensearch/flint/core/metrics/MetricConstants.java +++ b/flint-core/src/main/java/org/opensearch/flint/core/metrics/MetricConstants.java @@ -22,6 +22,24 @@ public final class MetricConstants { */ public static final String OS_WRITE_OP_METRIC_PREFIX = "opensearch.write"; + /** + * Prefixes for OpenSearch API specific metrics + */ + public static final String OS_CREATE_OP_METRIC_PREFIX = "opensearch.create"; + public static final String OS_SEARCH_OP_METRIC_PREFIX = "opensearch.search"; + public static final String OS_BULK_OP_METRIC_PREFIX = "opensearch.bulk"; + + /** + * Metric name for request size of opensearch bulk request + */ + public static final String OPENSEARCH_BULK_SIZE_METRIC = "opensearch.bulk.size.count"; + + /** + * Metric name for opensearch bulk request retry count + */ + public static final String OPENSEARCH_BULK_RETRY_COUNT_METRIC = "opensearch.bulk.retry.count"; + public static final String OPENSEARCH_BULK_ALL_RETRY_FAILED_COUNT_METRIC = "opensearch.bulk.allRetryFailed.count"; + /** * Metric name for counting the errors encountered with Amazon S3 operations. 
*/ @@ -152,7 +170,12 @@ public final class MetricConstants { */ public static final String CREATE_MV_INDICES = "query.execution.index.mv"; + /** + * Metric for tracking the latency of checkpoint deletion + */ + public static final String CHECKPOINT_DELETE_TIME_METRIC = "checkpoint.delete.processingTime"; + private MetricConstants() { // Private constructor to prevent instantiation } -} \ No newline at end of file +} diff --git a/flint-core/src/main/java/org/opensearch/flint/core/metrics/MetricsUtil.java b/flint-core/src/main/java/org/opensearch/flint/core/metrics/MetricsUtil.java index 511c18664..5a0f0f5ad 100644 --- a/flint-core/src/main/java/org/opensearch/flint/core/metrics/MetricsUtil.java +++ b/flint-core/src/main/java/org/opensearch/flint/core/metrics/MetricsUtil.java @@ -9,6 +9,7 @@ import com.codahale.metrics.Gauge; import com.codahale.metrics.MetricRegistry; import com.codahale.metrics.Timer; +import java.util.function.Supplier; import org.apache.spark.SparkEnv; import org.apache.spark.metrics.source.FlintMetricSource; import org.apache.spark.metrics.source.FlintIndexMetricSource; @@ -133,6 +134,21 @@ public static void addHistoricGauge(String metricName, final long value) { } } + /** + * Automatically emit latency metric as Historic Gauge for the execution of supplier + * @param supplier the lambda to be metered + * @param metricName name of the metric + * @return value returned by supplier + */ + public static T withLatencyAsHistoricGauge(Supplier supplier, String metricName) { + long startTime = System.currentTimeMillis(); + try { + return supplier.get(); + } finally { + addHistoricGauge(metricName, System.currentTimeMillis() - startTime); + } + } + private static HistoricGauge getOrCreateHistoricGauge(String metricName) { MetricRegistry metricRegistry = getMetricRegistry(false); return metricRegistry != null ? 
metricRegistry.gauge(metricName, HistoricGauge::new) : null; diff --git a/flint-core/src/main/java/org/opensearch/flint/core/metrics/reporter/DimensionedName.java b/flint-core/src/main/java/org/opensearch/flint/core/metrics/reporter/DimensionedName.java index da3a446d4..e3029d61c 100644 --- a/flint-core/src/main/java/org/opensearch/flint/core/metrics/reporter/DimensionedName.java +++ b/flint-core/src/main/java/org/opensearch/flint/core/metrics/reporter/DimensionedName.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.flint.core.metrics.reporter; import com.amazonaws.services.cloudwatch.model.Dimension; diff --git a/flint-core/src/main/java/org/opensearch/flint/core/metrics/reporter/DimensionedNameBuilder.java b/flint-core/src/main/java/org/opensearch/flint/core/metrics/reporter/DimensionedNameBuilder.java index 603c58f95..dc4e6bc4d 100644 --- a/flint-core/src/main/java/org/opensearch/flint/core/metrics/reporter/DimensionedNameBuilder.java +++ b/flint-core/src/main/java/org/opensearch/flint/core/metrics/reporter/DimensionedNameBuilder.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.flint.core.metrics.reporter; import com.amazonaws.services.cloudwatch.model.Dimension; diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/metadata/log/DefaultOptimisticTransaction.java b/flint-core/src/main/scala/org/opensearch/flint/core/metadata/log/DefaultOptimisticTransaction.java index e6fed4126..ec1eabf14 100644 --- a/flint-core/src/main/scala/org/opensearch/flint/core/metadata/log/DefaultOptimisticTransaction.java +++ b/flint-core/src/main/scala/org/opensearch/flint/core/metadata/log/DefaultOptimisticTransaction.java @@ -86,6 +86,8 @@ public T commit(Function operation) { initialLog = initialLog.copy( initialLog.id(), initialLog.createTime(), + initialLog.lastRefreshStartTime(), + initialLog.lastRefreshCompleteTime(), initialLog.state(), latest.entryVersion(), initialLog.error(), diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiter.java b/flint-core/src/main/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiter.java index af298cc8f..797dc2d02 100644 --- a/flint-core/src/main/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiter.java +++ b/flint-core/src/main/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiter.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.flint.core.storage; import dev.failsafe.RateLimiter; diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiterHolder.java b/flint-core/src/main/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiterHolder.java index 0453c70c8..73fdb8843 100644 --- a/flint-core/src/main/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiterHolder.java +++ b/flint-core/src/main/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiterHolder.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.flint.core.storage; import org.opensearch.flint.core.FlintOptions; diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintMetadataLogEntryOpenSearchConverter.java b/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintMetadataLogEntryOpenSearchConverter.java index 
0b78304d2..f90dda9a0 100644 --- a/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintMetadataLogEntryOpenSearchConverter.java +++ b/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintMetadataLogEntryOpenSearchConverter.java @@ -101,7 +101,7 @@ public static String toJson(FlintMetadataLogEntry logEntry) throws JsonProcessin ObjectMapper mapper = new ObjectMapper(); ObjectNode json = mapper.createObjectNode(); - json.put("version", "1.0"); + json.put("version", "1.1"); json.put("latestId", logEntry.id()); json.put("type", "flintindexstate"); json.put("state", logEntry.state().toString()); @@ -109,6 +109,8 @@ public static String toJson(FlintMetadataLogEntry logEntry) throws JsonProcessin json.put("jobId", jobId); json.put("dataSourceName", logEntry.properties().get("dataSourceName").get().toString()); json.put("jobStartTime", logEntry.createTime()); + json.put("lastRefreshStartTime", logEntry.lastRefreshStartTime()); + json.put("lastRefreshCompleteTime", logEntry.lastRefreshCompleteTime()); json.put("lastUpdateTime", lastUpdateTime); json.put("error", logEntry.error()); @@ -138,6 +140,8 @@ public static FlintMetadataLogEntry constructLogEntry( id, /* sourceMap may use Integer or Long even though it's always long in index mapping */ ((Number) sourceMap.get("jobStartTime")).longValue(), + ((Number) sourceMap.get("lastRefreshStartTime")).longValue(), + ((Number) sourceMap.get("lastRefreshCompleteTime")).longValue(), FlintMetadataLogEntry.IndexState$.MODULE$.from((String) sourceMap.get("state")), Map.of("seqNo", seqNo, "primaryTerm", primaryTerm), (String) sourceMap.get("error"), diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintOpenSearchMetadataLog.java b/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintOpenSearchMetadataLog.java index 8c327b664..24c9df492 100644 --- a/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintOpenSearchMetadataLog.java +++ b/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintOpenSearchMetadataLog.java @@ -132,7 +132,9 @@ public void purge() { public FlintMetadataLogEntry emptyLogEntry() { return new FlintMetadataLogEntry( "", - 0L, + FlintMetadataLogEntry.EMPTY_TIMESTAMP(), + FlintMetadataLogEntry.EMPTY_TIMESTAMP(), + FlintMetadataLogEntry.EMPTY_TIMESTAMP(), FlintMetadataLogEntry.IndexState$.MODULE$.EMPTY(), Map.of("seqNo", UNASSIGNED_SEQ_NO, "primaryTerm", UNASSIGNED_PRIMARY_TERM), "", @@ -146,6 +148,8 @@ private FlintMetadataLogEntry createLogEntry(FlintMetadataLogEntry logEntry) { logEntry.copy( latestId, logEntry.createTime(), + logEntry.lastRefreshStartTime(), + logEntry.lastRefreshCompleteTime(), logEntry.state(), logEntry.entryVersion(), logEntry.error(), @@ -184,6 +188,8 @@ private FlintMetadataLogEntry writeLogEntry( logEntry = new FlintMetadataLogEntry( logEntry.id(), logEntry.createTime(), + logEntry.lastRefreshStartTime(), + logEntry.lastRefreshCompleteTime(), logEntry.state(), Map.of("seqNo", response.getSeqNo(), "primaryTerm", response.getPrimaryTerm()), logEntry.error(), diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/storage/OpenSearchBulkRetryWrapper.java b/flint-core/src/main/scala/org/opensearch/flint/core/storage/OpenSearchBulkRetryWrapper.java index 279c9b642..14e3b7099 100644 --- a/flint-core/src/main/scala/org/opensearch/flint/core/storage/OpenSearchBulkRetryWrapper.java +++ b/flint-core/src/main/scala/org/opensearch/flint/core/storage/OpenSearchBulkRetryWrapper.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * 
SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.flint.core.storage; import dev.failsafe.Failsafe; @@ -6,6 +11,7 @@ import dev.failsafe.function.CheckedPredicate; import java.util.Arrays; import java.util.List; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import java.util.logging.Logger; import org.opensearch.action.DocWriteRequest; @@ -15,6 +21,8 @@ import org.opensearch.client.RequestOptions; import org.opensearch.client.RestHighLevelClient; import org.opensearch.flint.core.http.FlintRetryOptions; +import org.opensearch.flint.core.metrics.MetricConstants; +import org.opensearch.flint.core.metrics.MetricsUtil; import org.opensearch.rest.RestStatus; public class OpenSearchBulkRetryWrapper { @@ -37,11 +45,19 @@ public OpenSearchBulkRetryWrapper(FlintRetryOptions retryOptions) { */ public BulkResponse bulkWithPartialRetry(RestHighLevelClient client, BulkRequest bulkRequest, RequestOptions options) { + final AtomicInteger requestCount = new AtomicInteger(0); try { final AtomicReference nextRequest = new AtomicReference<>(bulkRequest); - return Failsafe + BulkResponse res = Failsafe .with(retryPolicy) + .onFailure((event) -> { + if (event.isRetry()) { + MetricsUtil.addHistoricGauge( + MetricConstants.OPENSEARCH_BULK_ALL_RETRY_FAILED_COUNT_METRIC, 1); + } + }) .get(() -> { + requestCount.incrementAndGet(); BulkResponse response = client.bulk(nextRequest.get(), options); if (retryPolicy.getConfig().allowsRetries() && bulkItemRetryableResultPredicate.test( response)) { @@ -49,11 +65,15 @@ public BulkResponse bulkWithPartialRetry(RestHighLevelClient client, BulkRequest } return response; }); + return res; } catch (FailsafeException ex) { LOG.severe("Request failed permanently. Re-throwing original exception."); // unwrap original exception and throw throw new RuntimeException(ex.getCause()); + } finally { + MetricsUtil.addHistoricGauge(MetricConstants.OPENSEARCH_BULK_SIZE_METRIC, bulkRequest.estimatedSizeInBytes()); + MetricsUtil.addHistoricGauge(MetricConstants.OPENSEARCH_BULK_RETRY_COUNT_METRIC, requestCount.get() - 1); } } diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/storage/OpenSearchUpdater.java b/flint-core/src/main/scala/org/opensearch/flint/core/storage/OpenSearchUpdater.java index 0d84b4956..eb7264a84 100644 --- a/flint-core/src/main/scala/org/opensearch/flint/core/storage/OpenSearchUpdater.java +++ b/flint-core/src/main/scala/org/opensearch/flint/core/storage/OpenSearchUpdater.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.flint.core.storage; import org.opensearch.action.support.WriteRequest; diff --git a/flint-core/src/test/java/org/opensearch/flint/core/metrics/MetricsTestUtil.java b/flint-core/src/test/java/org/opensearch/flint/core/metrics/MetricsTestUtil.java new file mode 100644 index 000000000..05febb92b --- /dev/null +++ b/flint-core/src/test/java/org/opensearch/flint/core/metrics/MetricsTestUtil.java @@ -0,0 +1,79 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.core.metrics; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.RETURNS_DEEP_STUBS; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.mockStatic; +import static 
org.mockito.Mockito.when; + +import com.codahale.metrics.MetricRegistry; +import java.util.List; +import lombok.AllArgsConstructor; +import org.apache.spark.SparkEnv; +import org.apache.spark.metrics.source.Source; +import org.mockito.MockedStatic; +import org.opensearch.flint.core.metrics.HistoricGauge.DataPoint; + +/** + * Utility class for verifying metrics + */ +public class MetricsTestUtil { + @AllArgsConstructor + public static class MetricsVerifier { + + final MetricRegistry metricRegistry; + + public void assertMetricExist(String metricName) { + assertNotNull(metricRegistry.getMetrics().get(metricName)); + } + + public void assertMetricClass(String metricName, Class clazz) { + assertMetricExist(metricName); + assertEquals(clazz, metricRegistry.getMetrics().get(metricName).getClass()); + } + + public void assertHistoricGauge(String metricName, long... values) { + HistoricGauge historicGauge = getHistoricGauge(metricName); + List dataPoints = historicGauge.getDataPoints(); + for (int i = 0; i < values.length; i++) { + assertEquals(values[i], dataPoints.get(i).getValue().longValue()); + } + } + + private HistoricGauge getHistoricGauge(String metricName) { + assertMetricClass(metricName, HistoricGauge.class); + return (HistoricGauge) metricRegistry.getMetrics().get(metricName); + } + + public void assertMetricNotExist(String metricName) { + assertNull(metricRegistry.getMetrics().get(metricName)); + } + } + + @FunctionalInterface + public interface ThrowableConsumer { + void accept(T t) throws Exception; + } + + public static void withMetricEnv(ThrowableConsumer test) throws Exception { + try (MockedStatic sparkEnvMock = mockStatic(SparkEnv.class)) { + SparkEnv sparkEnv = mock(SparkEnv.class, RETURNS_DEEP_STUBS); + sparkEnvMock.when(SparkEnv::get).thenReturn(sparkEnv); + + Source metricSource = mock(Source.class); + MetricRegistry metricRegistry = new MetricRegistry(); + when(metricSource.metricRegistry()).thenReturn(metricRegistry); + when(sparkEnv.metricsSystem().getSourcesByName(any()).head()).thenReturn(metricSource); + + test.accept(new MetricsVerifier(metricRegistry)); + } + } +} diff --git a/flint-core/src/test/java/org/opensearch/flint/core/metrics/MetricsUtilTest.java b/flint-core/src/test/java/org/opensearch/flint/core/metrics/MetricsUtilTest.java index 70b51ed63..c586c729a 100644 --- a/flint-core/src/test/java/org/opensearch/flint/core/metrics/MetricsUtilTest.java +++ b/flint-core/src/test/java/org/opensearch/flint/core/metrics/MetricsUtilTest.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.flint.core.metrics; import com.codahale.metrics.Counter; diff --git a/flint-core/src/test/java/org/opensearch/flint/core/metrics/reporter/DimensionedNameTest.java b/flint-core/src/test/java/org/opensearch/flint/core/metrics/reporter/DimensionedNameTest.java index 6bc6a9c2d..c9f6d62f5 100644 --- a/flint-core/src/test/java/org/opensearch/flint/core/metrics/reporter/DimensionedNameTest.java +++ b/flint-core/src/test/java/org/opensearch/flint/core/metrics/reporter/DimensionedNameTest.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.flint.core.metrics.reporter; import static org.hamcrest.CoreMatchers.hasItems; diff --git a/flint-core/src/test/scala/org/opensearch/flint/core/auth/AWSRequestSigningApacheInterceptorTest.java b/flint-core/src/test/scala/org/opensearch/flint/core/auth/AWSRequestSigningApacheInterceptorTest.java index 
ae8fdfa9a..599ed107d 100644 --- a/flint-core/src/test/scala/org/opensearch/flint/core/auth/AWSRequestSigningApacheInterceptorTest.java +++ b/flint-core/src/test/scala/org/opensearch/flint/core/auth/AWSRequestSigningApacheInterceptorTest.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.flint.core.auth; import static org.junit.jupiter.api.Assertions.assertEquals; diff --git a/flint-core/src/test/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiterHolderTest.java b/flint-core/src/test/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiterHolderTest.java index f2f160973..b12c2c522 100644 --- a/flint-core/src/test/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiterHolderTest.java +++ b/flint-core/src/test/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiterHolderTest.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.flint.core.storage; import static org.junit.jupiter.api.Assertions.assertEquals; diff --git a/flint-core/src/test/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiterTest.java b/flint-core/src/test/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiterTest.java index d86f06d24..b87c9f797 100644 --- a/flint-core/src/test/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiterTest.java +++ b/flint-core/src/test/scala/org/opensearch/flint/core/storage/BulkRequestRateLimiterTest.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.flint.core.storage; diff --git a/flint-core/src/test/scala/org/opensearch/flint/core/storage/FlintMetadataLogEntryOpenSearchConverterSuite.scala b/flint-core/src/test/scala/org/opensearch/flint/core/storage/FlintMetadataLogEntryOpenSearchConverterSuite.scala index 577dfc5fc..2708d48e8 100644 --- a/flint-core/src/test/scala/org/opensearch/flint/core/storage/FlintMetadataLogEntryOpenSearchConverterSuite.scala +++ b/flint-core/src/test/scala/org/opensearch/flint/core/storage/FlintMetadataLogEntryOpenSearchConverterSuite.scala @@ -25,6 +25,10 @@ class FlintMetadataLogEntryOpenSearchConverterTest val sourceMap = JMap.of( "jobStartTime", 1234567890123L.asInstanceOf[Object], + "lastRefreshStartTime", + 1234567890123L.asInstanceOf[Object], + "lastRefreshCompleteTime", + 1234567890123L.asInstanceOf[Object], "state", "active".asInstanceOf[Object], "dataSourceName", @@ -36,6 +40,8 @@ class FlintMetadataLogEntryOpenSearchConverterTest when(mockLogEntry.id).thenReturn("id") when(mockLogEntry.state).thenReturn(FlintMetadataLogEntry.IndexState.ACTIVE) when(mockLogEntry.createTime).thenReturn(1234567890123L) + when(mockLogEntry.lastRefreshStartTime).thenReturn(1234567890123L) + when(mockLogEntry.lastRefreshCompleteTime).thenReturn(1234567890123L) when(mockLogEntry.error).thenReturn("") when(mockLogEntry.properties).thenReturn(Map("dataSourceName" -> "testDataSource")) } @@ -45,7 +51,7 @@ class FlintMetadataLogEntryOpenSearchConverterTest val expectedJsonWithoutLastUpdateTime = s""" |{ - | "version": "1.0", + | "version": "1.1", | "latestId": "id", | "type": "flintindexstate", | "state": "active", @@ -53,6 +59,8 @@ class FlintMetadataLogEntryOpenSearchConverterTest | "jobId": "unknown", | "dataSourceName": "testDataSource", | "jobStartTime": 1234567890123, + | "lastRefreshStartTime": 1234567890123, + | "lastRefreshCompleteTime": 1234567890123, | "error": "" |} 
|""".stripMargin @@ -67,15 +75,22 @@ class FlintMetadataLogEntryOpenSearchConverterTest logEntry shouldBe a[FlintMetadataLogEntry] logEntry.id shouldBe "id" logEntry.createTime shouldBe 1234567890123L + logEntry.lastRefreshStartTime shouldBe 1234567890123L + logEntry.lastRefreshCompleteTime shouldBe 1234567890123L logEntry.state shouldBe FlintMetadataLogEntry.IndexState.ACTIVE logEntry.error shouldBe "" logEntry.properties.get("dataSourceName").get shouldBe "testDataSource" } - it should "construct log entry with integer jobStartTime value" in { + it should "construct log entry with integer timestamp value" in { + // Use Integer instead of Long for timestamps val testSourceMap = JMap.of( "jobStartTime", - 1234567890.asInstanceOf[Object], // Integer instead of Long + 1234567890.asInstanceOf[Object], + "lastRefreshStartTime", + 1234567890.asInstanceOf[Object], + "lastRefreshCompleteTime", + 1234567890.asInstanceOf[Object], "state", "active".asInstanceOf[Object], "dataSourceName", @@ -87,6 +102,8 @@ class FlintMetadataLogEntryOpenSearchConverterTest logEntry shouldBe a[FlintMetadataLogEntry] logEntry.id shouldBe "id" logEntry.createTime shouldBe 1234567890 + logEntry.lastRefreshStartTime shouldBe 1234567890 + logEntry.lastRefreshCompleteTime shouldBe 1234567890 logEntry.state shouldBe FlintMetadataLogEntry.IndexState.ACTIVE logEntry.error shouldBe "" logEntry.properties.get("dataSourceName").get shouldBe "testDataSource" diff --git a/flint-core/src/test/scala/org/opensearch/flint/core/storage/OpenSearchBulkRetryWrapperTest.java b/flint-core/src/test/scala/org/opensearch/flint/core/storage/OpenSearchBulkRetryWrapperTest.java index fa57da842..43bd8d2b2 100644 --- a/flint-core/src/test/scala/org/opensearch/flint/core/storage/OpenSearchBulkRetryWrapperTest.java +++ b/flint-core/src/test/scala/org/opensearch/flint/core/storage/OpenSearchBulkRetryWrapperTest.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.flint.core.storage; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -24,11 +29,14 @@ import org.opensearch.client.RequestOptions; import org.opensearch.client.RestHighLevelClient; import org.opensearch.flint.core.http.FlintRetryOptions; +import org.opensearch.flint.core.metrics.MetricConstants; +import org.opensearch.flint.core.metrics.MetricsTestUtil; import org.opensearch.rest.RestStatus; @ExtendWith(MockitoExtension.class) class OpenSearchBulkRetryWrapperTest { + private static final long ESTIMATED_SIZE_IN_BYTES = 1000L; @Mock BulkRequest bulkRequest; @Mock @@ -45,12 +53,7 @@ class OpenSearchBulkRetryWrapperTest { DocWriteResponse docWriteResponse; @Mock IndexRequest indexRequest0, indexRequest1; - @Mock IndexRequest docWriteRequest2; -// BulkItemRequest[] bulkItemRequests = new BulkItemRequest[] { -// new BulkItemRequest(0, docWriteRequest0), -// new BulkItemRequest(1, docWriteRequest1), -// new BulkItemRequest(2, docWriteRequest2), -// }; + BulkItemResponse successItem = new BulkItemResponse(0, OpType.CREATE, docWriteResponse); BulkItemResponse failureItem = new BulkItemResponse(0, OpType.CREATE, new Failure("index", "id", null, @@ -65,87 +68,125 @@ class OpenSearchBulkRetryWrapperTest { @Test public void withRetryWhenCallSucceed() throws Exception { - OpenSearchBulkRetryWrapper bulkRetryWrapper = new OpenSearchBulkRetryWrapper( - retryOptionsWithRetry); - when(client.bulk(bulkRequest, options)).thenReturn(successResponse); - when(successResponse.hasFailures()).thenReturn(false); - - 
BulkResponse response = bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options); - - assertEquals(response, successResponse); - verify(client).bulk(bulkRequest, options); + MetricsTestUtil.withMetricEnv(verifier -> { + OpenSearchBulkRetryWrapper bulkRetryWrapper = new OpenSearchBulkRetryWrapper( + retryOptionsWithRetry); + when(client.bulk(bulkRequest, options)).thenReturn(successResponse); + when(successResponse.hasFailures()).thenReturn(false); + when(bulkRequest.estimatedSizeInBytes()).thenReturn(ESTIMATED_SIZE_IN_BYTES); + + BulkResponse response = bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options); + + assertEquals(response, successResponse); + verify(client).bulk(bulkRequest, options); + + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_SIZE_METRIC, ESTIMATED_SIZE_IN_BYTES); + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_RETRY_COUNT_METRIC, 0); + verifier.assertMetricNotExist(MetricConstants.OPENSEARCH_BULK_ALL_RETRY_FAILED_COUNT_METRIC); + }); } @Test public void withRetryWhenCallConflict() throws Exception { - OpenSearchBulkRetryWrapper bulkRetryWrapper = new OpenSearchBulkRetryWrapper( - retryOptionsWithRetry); - when(client.bulk(any(), eq(options))) - .thenReturn(conflictResponse); - mockConflictResponse(); - when(conflictResponse.hasFailures()).thenReturn(true); - - BulkResponse response = bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options); - - assertEquals(response, conflictResponse); - verify(client).bulk(bulkRequest, options); + MetricsTestUtil.withMetricEnv(verifier -> { + OpenSearchBulkRetryWrapper bulkRetryWrapper = new OpenSearchBulkRetryWrapper( + retryOptionsWithRetry); + when(client.bulk(any(), eq(options))) + .thenReturn(conflictResponse); + mockConflictResponse(); + when(conflictResponse.hasFailures()).thenReturn(true); + when(bulkRequest.estimatedSizeInBytes()).thenReturn(ESTIMATED_SIZE_IN_BYTES); + + BulkResponse response = bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options); + + assertEquals(response, conflictResponse); + verify(client).bulk(bulkRequest, options); + + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_SIZE_METRIC, ESTIMATED_SIZE_IN_BYTES); + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_RETRY_COUNT_METRIC, 0); + verifier.assertMetricNotExist(MetricConstants.OPENSEARCH_BULK_ALL_RETRY_FAILED_COUNT_METRIC); + }); } @Test public void withRetryWhenCallFailOnce() throws Exception { - OpenSearchBulkRetryWrapper bulkRetryWrapper = new OpenSearchBulkRetryWrapper( - retryOptionsWithRetry); - when(client.bulk(any(), eq(options))) - .thenReturn(failureResponse) - .thenReturn(successResponse); - mockFailureResponse(); - when(successResponse.hasFailures()).thenReturn(false); - when(bulkRequest.requests()).thenReturn(ImmutableList.of(indexRequest0, indexRequest1)); - - BulkResponse response = bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options); - - assertEquals(response, successResponse); - verify(client, times(2)).bulk(any(), eq(options)); + MetricsTestUtil.withMetricEnv(verifier -> { + OpenSearchBulkRetryWrapper bulkRetryWrapper = new OpenSearchBulkRetryWrapper( + retryOptionsWithRetry); + when(client.bulk(any(), eq(options))) + .thenReturn(failureResponse) + .thenReturn(successResponse); + mockFailureResponse(); + when(successResponse.hasFailures()).thenReturn(false); + when(bulkRequest.requests()).thenReturn(ImmutableList.of(indexRequest0, indexRequest1)); + 
when(bulkRequest.estimatedSizeInBytes()).thenReturn(ESTIMATED_SIZE_IN_BYTES); + + BulkResponse response = bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options); + + assertEquals(response, successResponse); + verify(client, times(2)).bulk(any(), eq(options)); + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_SIZE_METRIC, ESTIMATED_SIZE_IN_BYTES); + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_RETRY_COUNT_METRIC, 1); + verifier.assertMetricNotExist(MetricConstants.OPENSEARCH_BULK_ALL_RETRY_FAILED_COUNT_METRIC); + }); } @Test public void withRetryWhenAllCallFail() throws Exception { - OpenSearchBulkRetryWrapper bulkRetryWrapper = new OpenSearchBulkRetryWrapper( - retryOptionsWithRetry); - when(client.bulk(any(), eq(options))) - .thenReturn(failureResponse); - mockFailureResponse(); - - BulkResponse response = bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options); - - assertEquals(response, failureResponse); - verify(client, times(3)).bulk(any(), eq(options)); + MetricsTestUtil.withMetricEnv(verifier -> { + OpenSearchBulkRetryWrapper bulkRetryWrapper = new OpenSearchBulkRetryWrapper( + retryOptionsWithRetry); + when(client.bulk(any(), eq(options))) + .thenReturn(failureResponse); + when(bulkRequest.estimatedSizeInBytes()).thenReturn(ESTIMATED_SIZE_IN_BYTES); + mockFailureResponse(); + + BulkResponse response = bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options); + + assertEquals(response, failureResponse); + verify(client, times(3)).bulk(any(), eq(options)); + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_SIZE_METRIC, ESTIMATED_SIZE_IN_BYTES); + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_RETRY_COUNT_METRIC, 2); + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_ALL_RETRY_FAILED_COUNT_METRIC, 1); + }); } @Test public void withRetryWhenCallThrowsShouldNotRetry() throws Exception { - OpenSearchBulkRetryWrapper bulkRetryWrapper = new OpenSearchBulkRetryWrapper( - retryOptionsWithRetry); - when(client.bulk(bulkRequest, options)).thenThrow(new RuntimeException("test")); - - assertThrows(RuntimeException.class, - () -> bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options)); - - verify(client).bulk(bulkRequest, options); + MetricsTestUtil.withMetricEnv(verifier -> { + OpenSearchBulkRetryWrapper bulkRetryWrapper = new OpenSearchBulkRetryWrapper( + retryOptionsWithRetry); + when(client.bulk(bulkRequest, options)).thenThrow(new RuntimeException("test")); + when(bulkRequest.estimatedSizeInBytes()).thenReturn(ESTIMATED_SIZE_IN_BYTES); + + assertThrows(RuntimeException.class, + () -> bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options)); + + verify(client).bulk(bulkRequest, options); + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_SIZE_METRIC, ESTIMATED_SIZE_IN_BYTES); + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_RETRY_COUNT_METRIC, 0); + verifier.assertMetricNotExist(MetricConstants.OPENSEARCH_BULK_ALL_RETRY_FAILED_COUNT_METRIC); + }); } @Test public void withoutRetryWhenCallFail() throws Exception { - OpenSearchBulkRetryWrapper bulkRetryWrapper = new OpenSearchBulkRetryWrapper( - retryOptionsWithoutRetry); - when(client.bulk(bulkRequest, options)) - .thenReturn(failureResponse); - mockFailureResponse(); - - BulkResponse response = bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options); - - assertEquals(response, failureResponse); - verify(client).bulk(bulkRequest, options); + MetricsTestUtil.withMetricEnv(verifier 
-> { + OpenSearchBulkRetryWrapper bulkRetryWrapper = new OpenSearchBulkRetryWrapper( + retryOptionsWithoutRetry); + when(client.bulk(bulkRequest, options)) + .thenReturn(failureResponse); + when(bulkRequest.estimatedSizeInBytes()).thenReturn(ESTIMATED_SIZE_IN_BYTES); + mockFailureResponse(); + + BulkResponse response = bulkRetryWrapper.bulkWithPartialRetry(client, bulkRequest, options); + + assertEquals(response, failureResponse); + verify(client).bulk(bulkRequest, options); + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_SIZE_METRIC, ESTIMATED_SIZE_IN_BYTES); + verifier.assertHistoricGauge(MetricConstants.OPENSEARCH_BULK_RETRY_COUNT_METRIC, 0); + verifier.assertMetricNotExist(MetricConstants.OPENSEARCH_BULK_ALL_RETRY_FAILED_COUNT_METRIC); + }); } private void mockFailureResponse() { diff --git a/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala b/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala index 68721d235..bdcc120c0 100644 --- a/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala +++ b/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala @@ -253,6 +253,10 @@ object FlintSparkConf { FlintConfig("spark.metadata.accessAWSCredentialsProvider") .doc("AWS credentials provider for metadata access permission") .createOptional() + val METADATA_CACHE_WRITE = FlintConfig("spark.flint.metadataCacheWrite.enabled") + .doc("Enable Flint metadata cache write to Flint index mappings") + .createWithDefault("false") + val CUSTOM_SESSION_MANAGER = FlintConfig("spark.flint.job.customSessionManager") .createOptional() @@ -309,6 +313,8 @@ case class FlintSparkConf(properties: JMap[String, String]) extends Serializable def monitorMaxErrorCount(): Int = MONITOR_MAX_ERROR_COUNT.readFrom(reader).toInt + def isMetadataCacheWriteEnabled: Boolean = METADATA_CACHE_WRITE.readFrom(reader).toBoolean + /** * spark.sql.session.timeZone */ diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala index 532bd8e60..b4412a3d4 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala @@ -20,6 +20,7 @@ import org.opensearch.flint.core.metadata.log.FlintMetadataLogServiceBuilder import org.opensearch.flint.spark.FlintSparkIndex.ID_COLUMN import org.opensearch.flint.spark.FlintSparkIndexOptions.OptionName._ import org.opensearch.flint.spark.covering.FlintSparkCoveringIndex +import org.opensearch.flint.spark.metadatacache.FlintMetadataCacheWriterBuilder import org.opensearch.flint.spark.mv.FlintSparkMaterializedView import org.opensearch.flint.spark.refresh.FlintSparkIndexRefresh import org.opensearch.flint.spark.refresh.FlintSparkIndexRefresh.RefreshMode._ @@ -56,6 +57,8 @@ class FlintSpark(val spark: SparkSession) extends FlintSparkTransactionSupport w FlintIndexMetadataServiceBuilder.build(flintSparkConf.flintOptions()) } + private val flintMetadataCacheWriter = FlintMetadataCacheWriterBuilder.build(flintSparkConf) + private val flintAsyncQueryScheduler: AsyncQueryScheduler = { AsyncQuerySchedulerBuilder.build(flintSparkConf.flintOptions()) } @@ -117,7 +120,6 @@ class FlintSpark(val spark: SparkSession) extends FlintSparkTransactionSupport w throw new IllegalStateException(s"Flint index 
$indexName already exists") } } else { - val metadata = index.metadata() val jobSchedulingService = FlintSparkJobSchedulingService.create( index, spark, @@ -129,15 +131,18 @@ class FlintSpark(val spark: SparkSession) extends FlintSparkTransactionSupport w .transientLog(latest => latest.copy(state = CREATING)) .finalLog(latest => latest.copy(state = ACTIVE)) .commit(latest => { - if (latest == null) { // in case transaction capability is disabled - flintClient.createIndex(indexName, metadata) - flintIndexMetadataService.updateIndexMetadata(indexName, metadata) - } else { - logInfo(s"Creating index with metadata log entry ID ${latest.id}") - flintClient.createIndex(indexName, metadata.copy(latestId = Some(latest.id))) - flintIndexMetadataService - .updateIndexMetadata(indexName, metadata.copy(latestId = Some(latest.id))) + val metadata = latest match { + case null => // in case transaction capability is disabled + index.metadata() + case latestEntry => + logInfo(s"Creating index with metadata log entry ID ${latestEntry.id}") + index + .metadata() + .copy(latestId = Some(latestEntry.id), latestLogEntry = Some(latest)) } + flintClient.createIndex(indexName, metadata) + flintIndexMetadataService.updateIndexMetadata(indexName, metadata) + flintMetadataCacheWriter.updateMetadataCache(indexName, metadata) jobSchedulingService.handleJob(index, AsyncQuerySchedulerAction.SCHEDULE) }) } @@ -156,22 +161,10 @@ class FlintSpark(val spark: SparkSession) extends FlintSparkTransactionSupport w val index = describeIndex(indexName) .getOrElse(throw new IllegalStateException(s"Index $indexName doesn't exist")) val indexRefresh = FlintSparkIndexRefresh.create(indexName, index) - tx - .initialLog(latest => latest.state == ACTIVE) - .transientLog(latest => - latest.copy(state = REFRESHING, createTime = System.currentTimeMillis())) - .finalLog(latest => { - // Change state to active if full, otherwise update index state regularly - if (indexRefresh.refreshMode == AUTO) { - logInfo("Scheduling index state monitor") - flintIndexMonitor.startMonitor(indexName) - latest - } else { - logInfo("Updating index state to active") - latest.copy(state = ACTIVE) - } - }) - .commit(_ => indexRefresh.start(spark, flintSparkConf)) + indexRefresh.refreshMode match { + case AUTO => refreshIndexAuto(index, indexRefresh, tx) + case FULL | INCREMENTAL => refreshIndexManual(index, indexRefresh, tx) + } }.flatten /** @@ -520,6 +513,63 @@ class FlintSpark(val spark: SparkSession) extends FlintSparkTransactionSupport w updatedOptions.isExternalSchedulerEnabled() != originalOptions.isExternalSchedulerEnabled() } + /** + * Handles refresh for refresh mode AUTO, which is used exclusively by auto refresh index with + * internal scheduler. Refresh start time and complete time aren't tracked for streaming job. 
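For context (not part of this patch), a minimal sketch of how a user might turn on the new cache write path before creating Flint indices; the app name is illustrative, and the only assumption is the `spark.flint.metadataCacheWrite.enabled` key added in `FlintSparkConf` above:

```scala
import org.apache.spark.sql.SparkSession

// Hedged sketch: enable the (non-production) metadata cache write for this session.
// The app name is hypothetical; the config key is the one introduced by this change.
val spark = SparkSession
  .builder()
  .appName("flint-metadata-cache-demo")
  .config("spark.flint.metadataCacheWrite.enabled", "true")
  .getOrCreate()
```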
+ * TODO: in future, track MicroBatchExecution time for streaming job and update as well + */ + private def refreshIndexAuto( + index: FlintSparkIndex, + indexRefresh: FlintSparkIndexRefresh, + tx: OptimisticTransaction[Option[String]]): Option[String] = { + val indexName = index.name + tx + .initialLog(latest => latest.state == ACTIVE) + .transientLog(latest => + latest.copy(state = REFRESHING, createTime = System.currentTimeMillis())) + .finalLog(latest => { + logInfo("Scheduling index state monitor") + flintIndexMonitor.startMonitor(indexName) + latest + }) + .commit(_ => indexRefresh.start(spark, flintSparkConf)) + } + + /** + * Handles refresh for refresh mode FULL and INCREMENTAL, which is used by full refresh index, + * incremental refresh index, and auto refresh index with external scheduler. Stores refresh + * start time and complete time. + */ + private def refreshIndexManual( + index: FlintSparkIndex, + indexRefresh: FlintSparkIndexRefresh, + tx: OptimisticTransaction[Option[String]]): Option[String] = { + val indexName = index.name + tx + .initialLog(latest => latest.state == ACTIVE) + .transientLog(latest => { + val currentTime = System.currentTimeMillis() + val updatedLatest = latest + .copy(state = REFRESHING, createTime = currentTime, lastRefreshStartTime = currentTime) + flintMetadataCacheWriter + .updateMetadataCache( + indexName, + index.metadata.copy(latestLogEntry = Some(updatedLatest))) + updatedLatest + }) + .finalLog(latest => { + logInfo("Updating index state to active") + val updatedLatest = + latest.copy(state = ACTIVE, lastRefreshCompleteTime = System.currentTimeMillis()) + flintMetadataCacheWriter + .updateMetadataCache( + indexName, + index.metadata.copy(latestLogEntry = Some(updatedLatest))) + updatedLatest + }) + .commit(_ => indexRefresh.start(spark, flintSparkConf)) + } + private def updateIndexAutoToManual( index: FlintSparkIndex, tx: OptimisticTransaction[Option[String]]): Option[String] = { @@ -539,8 +589,10 @@ class FlintSpark(val spark: SparkSession) extends FlintSparkTransactionSupport w .transientLog(latest => latest.copy(state = UPDATING)) .finalLog(latest => latest.copy(state = jobSchedulingService.stateTransitions.finalStateForUnschedule)) - .commit(_ => { + .commit(latest => { flintIndexMetadataService.updateIndexMetadata(indexName, index.metadata) + flintMetadataCacheWriter + .updateMetadataCache(indexName, index.metadata.copy(latestLogEntry = Some(latest))) logInfo("Update index options complete") jobSchedulingService.handleJob(index, AsyncQuerySchedulerAction.UNSCHEDULE) None @@ -566,8 +618,10 @@ class FlintSpark(val spark: SparkSession) extends FlintSparkTransactionSupport w .finalLog(latest => { latest.copy(state = jobSchedulingService.stateTransitions.finalStateForUpdate) }) - .commit(_ => { + .commit(latest => { flintIndexMetadataService.updateIndexMetadata(indexName, index.metadata) + flintMetadataCacheWriter + .updateMetadataCache(indexName, index.metadata.copy(latestLogEntry = Some(latest))) logInfo("Update index options complete") jobSchedulingService.handleJob(index, AsyncQuerySchedulerAction.UPDATE) }) diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSparkCheckpoint.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSparkCheckpoint.scala index 4c18fea77..6ae55bd33 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSparkCheckpoint.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSparkCheckpoint.scala @@ 
-8,6 +8,7 @@ package org.opensearch.flint.spark import java.util.UUID import org.apache.hadoop.fs.{FSDataOutputStream, Path} +import org.opensearch.flint.core.metrics.{MetricConstants, MetricsUtil} import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession @@ -75,7 +76,9 @@ class FlintSparkCheckpoint(spark: SparkSession, val checkpointLocation: String) */ def delete(): Unit = { try { - checkpointManager.delete(checkpointRootDir) + MetricsUtil.withLatencyAsHistoricGauge( + () => checkpointManager.delete(checkpointRootDir), + MetricConstants.CHECKPOINT_DELETE_TIME_METRIC) logInfo(s"Checkpoint directory $checkpointRootDir deleted") } catch { case e: Exception => diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintDisabledMetadataCacheWriter.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintDisabledMetadataCacheWriter.scala new file mode 100644 index 000000000..4099da3ff --- /dev/null +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintDisabledMetadataCacheWriter.scala @@ -0,0 +1,17 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.metadatacache + +import org.opensearch.flint.common.metadata.FlintMetadata + +/** + * Default implementation of {@link FlintMetadataCacheWriter} that does nothing + */ +class FlintDisabledMetadataCacheWriter extends FlintMetadataCacheWriter { + override def updateMetadataCache(indexName: String, metadata: FlintMetadata): Unit = { + // Do nothing + } +} diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintMetadataCache.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintMetadataCache.scala new file mode 100644 index 000000000..e8a91e1be --- /dev/null +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintMetadataCache.scala @@ -0,0 +1,74 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.metadatacache + +import scala.collection.JavaConverters.mapAsScalaMapConverter + +import org.opensearch.flint.common.metadata.FlintMetadata +import org.opensearch.flint.common.metadata.log.FlintMetadataLogEntry +import org.opensearch.flint.spark.FlintSparkIndexOptions +import org.opensearch.flint.spark.scheduler.util.IntervalSchedulerParser + +/** + * Flint metadata cache defines metadata required to store in read cache for frontend user to + * access. + */ +case class FlintMetadataCache( + metadataCacheVersion: String, + /** Refresh interval for Flint index with auto refresh. Unit: seconds */ + refreshInterval: Option[Int], + /** Source table names for building the Flint index. */ + sourceTables: Array[String], + /** Timestamp when Flint index is last refreshed. Unit: milliseconds */ + lastRefreshTime: Option[Long]) { + + /** + * Convert FlintMetadataCache to a map. Skips a field if its value is not defined. 
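To make the field-skipping behavior concrete, here is a small usage sketch (not part of the patch) based only on the case class fields defined above; the literal values are illustrative:

```scala
// Hedged sketch: optional fields that are None are dropped from the resulting map.
val cache = FlintMetadataCache(
  metadataCacheVersion = "1.0",
  refreshInterval = Some(600), // seconds
  sourceTables = Array("catalog.default.source_tbl"), // illustrative table name
  lastRefreshTime = None) // index never refreshed

// Contains metadataCacheVersion, refreshInterval and sourceTables,
// but no "lastRefreshTime" entry because its value is None.
val cacheMap: Map[String, AnyRef] = cache.toMap
```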
+ */ + def toMap: Map[String, AnyRef] = { + val fieldNames = getClass.getDeclaredFields.map(_.getName) + val fieldValues = productIterator.toList + + fieldNames + .zip(fieldValues) + .flatMap { + case (_, None) => List.empty + case (name, Some(value)) => List((name, value)) + case (name, value) => List((name, value)) + } + .toMap + .mapValues(_.asInstanceOf[AnyRef]) + } +} + +object FlintMetadataCache { + + // TODO: constant for version + val mockTableName = + "dataSourceName.default.logGroups(logGroupIdentifier:['arn:aws:logs:us-east-1:123456:test-llt-xa', 'arn:aws:logs:us-east-1:123456:sample-lg-1'])" + + def apply(metadata: FlintMetadata): FlintMetadataCache = { + val indexOptions = FlintSparkIndexOptions( + metadata.options.asScala.mapValues(_.asInstanceOf[String]).toMap) + val refreshInterval = if (indexOptions.autoRefresh()) { + indexOptions + .refreshInterval() + .map(IntervalSchedulerParser.parseAndConvertToMillis) + .map(millis => (millis / 1000).toInt) // convert to seconds + } else { + None + } + val lastRefreshTime: Option[Long] = metadata.latestLogEntry.flatMap { entry => + entry.lastRefreshCompleteTime match { + case FlintMetadataLogEntry.EMPTY_TIMESTAMP => None + case timestamp => Some(timestamp) + } + } + + // TODO: get source tables from metadata + FlintMetadataCache("1.0", refreshInterval, Array(mockTableName), lastRefreshTime) + } +} diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintMetadataCacheWriter.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintMetadataCacheWriter.scala new file mode 100644 index 000000000..c256463c3 --- /dev/null +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintMetadataCacheWriter.scala @@ -0,0 +1,27 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.metadatacache + +import org.opensearch.flint.common.metadata.{FlintIndexMetadataService, FlintMetadata} + +/** + * Writes {@link FlintMetadataCache} to a storage of choice. This is different from {@link + * FlintIndexMetadataService} which persists the full index metadata to a storage for single + * source of truth. + */ +trait FlintMetadataCacheWriter { + + /** + * Update metadata cache for a Flint index. 
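As an illustration of the trait contract (hypothetical, not part of the patch), an alternative storage only needs to implement the single update method; `LoggingMetadataCacheWriter` is an invented name:

```scala
import org.apache.spark.internal.Logging
import org.opensearch.flint.common.metadata.FlintMetadata

// Hypothetical writer that only logs the cache content instead of persisting it,
// sketching what a non-OpenSearch implementation of FlintMetadataCacheWriter could look like.
class LoggingMetadataCacheWriter extends FlintMetadataCacheWriter with Logging {
  override def updateMetadataCache(indexName: String, metadata: FlintMetadata): Unit =
    logInfo(s"Metadata cache for $indexName: ${FlintMetadataCache(metadata).toMap}")
}
```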
+ * + * @param indexName + * index name + * @param metadata + * index metadata to update the cache + */ + def updateMetadataCache(indexName: String, metadata: FlintMetadata): Unit + +} diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintMetadataCacheWriterBuilder.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintMetadataCacheWriterBuilder.scala new file mode 100644 index 000000000..be821ae25 --- /dev/null +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintMetadataCacheWriterBuilder.scala @@ -0,0 +1,18 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.metadatacache + +import org.apache.spark.sql.flint.config.FlintSparkConf + +object FlintMetadataCacheWriterBuilder { + def build(flintSparkConf: FlintSparkConf): FlintMetadataCacheWriter = { + if (flintSparkConf.isMetadataCacheWriteEnabled) { + new FlintOpenSearchMetadataCacheWriter(flintSparkConf.flintOptions()) + } else { + new FlintDisabledMetadataCacheWriter + } + } +} diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintOpenSearchMetadataCacheWriter.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintOpenSearchMetadataCacheWriter.scala new file mode 100644 index 000000000..2bc373792 --- /dev/null +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/metadatacache/FlintOpenSearchMetadataCacheWriter.scala @@ -0,0 +1,106 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.metadatacache + +import java.util + +import scala.collection.JavaConverters._ + +import org.opensearch.client.RequestOptions +import org.opensearch.client.indices.PutMappingRequest +import org.opensearch.common.xcontent.XContentType +import org.opensearch.flint.common.metadata.{FlintIndexMetadataService, FlintMetadata} +import org.opensearch.flint.core.{FlintOptions, IRestHighLevelClient} +import org.opensearch.flint.core.metadata.FlintIndexMetadataServiceBuilder +import org.opensearch.flint.core.metadata.FlintJsonHelper._ +import org.opensearch.flint.core.storage.{FlintOpenSearchIndexMetadataService, OpenSearchClientUtils} + +import org.apache.spark.internal.Logging + +/** + * Writes {@link FlintMetadataCache} to index mappings `_meta` field for frontend user to access. + */ +class FlintOpenSearchMetadataCacheWriter(options: FlintOptions) + extends FlintMetadataCacheWriter + with Logging { + + /** + * Since metadata cache shares the index mappings _meta field with OpenSearch index metadata + * storage, this flag is to allow for preserving index metadata that is already stored in _meta + * when updating metadata cache. 
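For orientation, the cache written by this class lands under `_meta.properties` of the index mapping, roughly in the shape below; this is a hedged sketch with illustrative values, mirroring the structure the integration tests later in this change assert against:

```scala
// Hedged sketch of the _meta.properties payload for an auto refresh index; values are
// illustrative, and sourceTables currently carries the hardcoded mock table name (see TODO above).
val expectedCacheProperties: String =
  s"""
     |{
     |  "metadataCacheVersion": "1.0",
     |  "refreshInterval": 600,
     |  "sourceTables": ["${FlintMetadataCache.mockTableName}"],
     |  "lastRefreshTime": 1234567890123
     |}
     |""".stripMargin
```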
+ */ + private val includeSpec: Boolean = + FlintIndexMetadataServiceBuilder + .build(options) + .isInstanceOf[FlintOpenSearchIndexMetadataService] + + override def updateMetadataCache(indexName: String, metadata: FlintMetadata): Unit = { + logInfo(s"Updating metadata cache for $indexName"); + val osIndexName = OpenSearchClientUtils.sanitizeIndexName(indexName) + var client: IRestHighLevelClient = null + try { + client = OpenSearchClientUtils.createClient(options) + val request = new PutMappingRequest(osIndexName) + request.source(serialize(metadata), XContentType.JSON) + client.updateIndexMapping(request, RequestOptions.DEFAULT) + } catch { + case e: Exception => + throw new IllegalStateException( + s"Failed to update metadata cache for Flint index $osIndexName", + e) + } finally + if (client != null) { + client.close() + } + } + + /** + * Serialize FlintMetadataCache from FlintMetadata. Modified from {@link + * FlintOpenSearchIndexMetadataService} + */ + private[metadatacache] def serialize(metadata: FlintMetadata): String = { + try { + buildJson(builder => { + objectField(builder, "_meta") { + // If _meta is used as index metadata storage, preserve them. + if (includeSpec) { + builder + .field("version", metadata.version.version) + .field("name", metadata.name) + .field("kind", metadata.kind) + .field("source", metadata.source) + .field("indexedColumns", metadata.indexedColumns) + + if (metadata.latestId.isDefined) { + builder.field("latestId", metadata.latestId.get) + } + optionalObjectField(builder, "options", metadata.options) + } + + optionalObjectField(builder, "properties", buildPropertiesMap(metadata)) + } + builder.field("properties", metadata.schema) + }) + } catch { + case e: Exception => + throw new IllegalStateException("Failed to jsonify cache metadata", e) + } + } + + /** + * Since _meta.properties is shared by both index metadata and metadata cache, here we merge the + * two maps. + */ + private def buildPropertiesMap(metadata: FlintMetadata): util.Map[String, AnyRef] = { + val metadataCacheProperties = FlintMetadataCache(metadata).toMap + + if (includeSpec) { + (metadataCacheProperties ++ metadata.properties.asScala).asJava + } else { + metadataCacheProperties.asJava + } + } +} diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/scheduler/util/IntervalSchedulerParser.java b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/scheduler/util/IntervalSchedulerParser.java index 8745681b9..9622b4c64 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/scheduler/util/IntervalSchedulerParser.java +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/scheduler/util/IntervalSchedulerParser.java @@ -18,21 +18,30 @@ public class IntervalSchedulerParser { /** - * Parses a schedule string into an IntervalSchedule. + * Parses a schedule string into an integer in milliseconds. * * @param scheduleStr the schedule string to parse - * @return the parsed IntervalSchedule + * @return the parsed integer * @throws IllegalArgumentException if the schedule string is invalid */ - public static IntervalSchedule parse(String scheduleStr) { + public static Long parseAndConvertToMillis(String scheduleStr) { if (Strings.isNullOrEmpty(scheduleStr)) { throw new IllegalArgumentException("Schedule string must not be null or empty."); } - Long millis = Triggers.convert(scheduleStr); + return Triggers.convert(scheduleStr); + } + /** + * Parses a schedule string into an IntervalSchedule. 
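A small usage sketch (not part of the patch) tying the new parser entry point to how `FlintMetadataCache` derives its refresh interval in seconds:

```scala
// Hedged sketch: "10 Minutes" parses to 600000 ms, which FlintMetadataCache stores as 600 seconds.
val millis: Long = IntervalSchedulerParser.parseAndConvertToMillis("10 Minutes")
val refreshIntervalSeconds: Int = (millis / 1000).toInt // 600
```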
+ * + * @param scheduleStr the schedule string to parse + * @return the parsed IntervalSchedule + * @throws IllegalArgumentException if the schedule string is invalid + */ + public static IntervalSchedule parse(String scheduleStr) { // Convert milliseconds to minutes (rounding down) - int minutes = (int) (millis / (60 * 1000)); + int minutes = (int) (parseAndConvertToMillis(scheduleStr) / (60 * 1000)); // Use the current time as the start time Instant startTime = Instant.now(); diff --git a/flint-spark-integration/src/test/java/org/opensearch/flint/core/scheduler/util/IntervalSchedulerParserTest.java b/flint-spark-integration/src/test/java/org/opensearch/flint/core/scheduler/util/IntervalSchedulerParserTest.java index 2ad1fea9c..731e1ae5c 100644 --- a/flint-spark-integration/src/test/java/org/opensearch/flint/core/scheduler/util/IntervalSchedulerParserTest.java +++ b/flint-spark-integration/src/test/java/org/opensearch/flint/core/scheduler/util/IntervalSchedulerParserTest.java @@ -23,53 +23,92 @@ public void testParseNull() { IntervalSchedulerParser.parse(null); } + @Test(expected = IllegalArgumentException.class) + public void testParseMillisNull() { + IntervalSchedulerParser.parseAndConvertToMillis(null); + } + @Test(expected = IllegalArgumentException.class) public void testParseEmptyString() { IntervalSchedulerParser.parse(""); } + @Test(expected = IllegalArgumentException.class) + public void testParseMillisEmptyString() { + IntervalSchedulerParser.parseAndConvertToMillis(""); + } + @Test public void testParseString() { - Schedule result = IntervalSchedulerParser.parse("10 minutes"); - assertTrue(result instanceof IntervalSchedule); - IntervalSchedule intervalSchedule = (IntervalSchedule) result; + Schedule schedule = IntervalSchedulerParser.parse("10 minutes"); + assertTrue(schedule instanceof IntervalSchedule); + IntervalSchedule intervalSchedule = (IntervalSchedule) schedule; assertEquals(10, intervalSchedule.getInterval()); assertEquals(ChronoUnit.MINUTES, intervalSchedule.getUnit()); } + @Test + public void testParseMillisString() { + Long millis = IntervalSchedulerParser.parseAndConvertToMillis("10 minutes"); + assertEquals(600000, millis.longValue()); + } + @Test(expected = IllegalArgumentException.class) public void testParseInvalidFormat() { IntervalSchedulerParser.parse("invalid format"); } + @Test(expected = IllegalArgumentException.class) + public void testParseMillisInvalidFormat() { + IntervalSchedulerParser.parseAndConvertToMillis("invalid format"); + } + @Test public void testParseStringScheduleMinutes() { - IntervalSchedule result = IntervalSchedulerParser.parse("5 minutes"); - assertEquals(5, result.getInterval()); - assertEquals(ChronoUnit.MINUTES, result.getUnit()); + IntervalSchedule schedule = IntervalSchedulerParser.parse("5 minutes"); + assertEquals(5, schedule.getInterval()); + assertEquals(ChronoUnit.MINUTES, schedule.getUnit()); + } + + @Test + public void testParseMillisStringScheduleMinutes() { + Long millis = IntervalSchedulerParser.parseAndConvertToMillis("5 minutes"); + assertEquals(300000, millis.longValue()); } @Test public void testParseStringScheduleHours() { - IntervalSchedule result = IntervalSchedulerParser.parse("2 hours"); - assertEquals(120, result.getInterval()); - assertEquals(ChronoUnit.MINUTES, result.getUnit()); + IntervalSchedule schedule = IntervalSchedulerParser.parse("2 hours"); + assertEquals(120, schedule.getInterval()); + assertEquals(ChronoUnit.MINUTES, schedule.getUnit()); + } + + @Test + public void 
testParseMillisStringScheduleHours() { + Long millis = IntervalSchedulerParser.parseAndConvertToMillis("2 hours"); + assertEquals(7200000, millis.longValue()); } @Test public void testParseStringScheduleDays() { - IntervalSchedule result = IntervalSchedulerParser.parse("1 day"); - assertEquals(1440, result.getInterval()); - assertEquals(ChronoUnit.MINUTES, result.getUnit()); + IntervalSchedule schedule = IntervalSchedulerParser.parse("1 day"); + assertEquals(1440, schedule.getInterval()); + assertEquals(ChronoUnit.MINUTES, schedule.getUnit()); + } + + @Test + public void testParseMillisStringScheduleDays() { + Long millis = IntervalSchedulerParser.parseAndConvertToMillis("1 day"); + assertEquals(86400000, millis.longValue()); } @Test public void testParseStringScheduleStartTime() { Instant before = Instant.now(); - IntervalSchedule result = IntervalSchedulerParser.parse("30 minutes"); + IntervalSchedule schedule = IntervalSchedulerParser.parse("30 minutes"); Instant after = Instant.now(); - assertTrue(result.getStartTime().isAfter(before) || result.getStartTime().equals(before)); - assertTrue(result.getStartTime().isBefore(after) || result.getStartTime().equals(after)); + assertTrue(schedule.getStartTime().isAfter(before) || schedule.getStartTime().equals(before)); + assertTrue(schedule.getStartTime().isBefore(after) || schedule.getStartTime().equals(after)); } } \ No newline at end of file diff --git a/flint-spark-integration/src/test/scala/org/apache/spark/FlintSuite.scala b/flint-spark-integration/src/test/scala/org/apache/spark/FlintSuite.scala index ee8a52d96..e43b0c52c 100644 --- a/flint-spark-integration/src/test/scala/org/apache/spark/FlintSuite.scala +++ b/flint-spark-integration/src/test/scala/org/apache/spark/FlintSuite.scala @@ -10,7 +10,7 @@ import org.opensearch.flint.spark.FlintSparkExtensions import org.apache.spark.sql.catalyst.expressions.CodegenObjectFactoryMode import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.flint.config.FlintConfigEntry -import org.apache.spark.sql.flint.config.FlintSparkConf.HYBRID_SCAN_ENABLED +import org.apache.spark.sql.flint.config.FlintSparkConf.{EXTERNAL_SCHEDULER_ENABLED, HYBRID_SCAN_ENABLED, METADATA_CACHE_WRITE} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSparkSession @@ -44,4 +44,22 @@ trait FlintSuite extends SharedSparkSession { setFlintSparkConf(HYBRID_SCAN_ENABLED, "false") } } + + protected def withExternalSchedulerEnabled(block: => Unit): Unit = { + setFlintSparkConf(EXTERNAL_SCHEDULER_ENABLED, "true") + try { + block + } finally { + setFlintSparkConf(EXTERNAL_SCHEDULER_ENABLED, "false") + } + } + + protected def withMetadataCacheWriteEnabled(block: => Unit): Unit = { + setFlintSparkConf(METADATA_CACHE_WRITE, "true") + try { + block + } finally { + setFlintSparkConf(METADATA_CACHE_WRITE, "false") + } + } } diff --git a/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/covering/ApplyFlintSparkCoveringIndexSuite.scala b/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/covering/ApplyFlintSparkCoveringIndexSuite.scala index 2c5518778..96c71d94b 100644 --- a/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/covering/ApplyFlintSparkCoveringIndexSuite.scala +++ b/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/covering/ApplyFlintSparkCoveringIndexSuite.scala @@ -246,6 +246,8 @@ class ApplyFlintSparkCoveringIndexSuite extends FlintSuite with Matchers { new FlintMetadataLogEntry( "id", 0, + 
0, + 0, state, Map("seqNo" -> 0, "primaryTerm" -> 0), "", diff --git a/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/metadatacache/FlintMetadataCacheSuite.scala b/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/metadatacache/FlintMetadataCacheSuite.scala new file mode 100644 index 000000000..c6d2cf12a --- /dev/null +++ b/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/metadatacache/FlintMetadataCacheSuite.scala @@ -0,0 +1,79 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.metadatacache + +import org.opensearch.flint.common.metadata.log.FlintMetadataLogEntry +import org.opensearch.flint.core.storage.FlintOpenSearchIndexMetadataService +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +class FlintMetadataCacheSuite extends AnyFlatSpec with Matchers { + val flintMetadataLogEntry = FlintMetadataLogEntry( + "id", + 0L, + 0L, + 1234567890123L, + FlintMetadataLogEntry.IndexState.ACTIVE, + Map.empty[String, Any], + "", + Map.empty[String, Any]) + + it should "construct from FlintMetadata" in { + val content = + """ { + | "_meta": { + | "kind": "test_kind", + | "options": { + | "auto_refresh": "true", + | "refresh_interval": "10 Minutes" + | } + | }, + | "properties": { + | "age": { + | "type": "integer" + | } + | } + | } + |""".stripMargin + val metadata = FlintOpenSearchIndexMetadataService + .deserialize(content) + .copy(latestLogEntry = Some(flintMetadataLogEntry)) + + val metadataCache = FlintMetadataCache(metadata) + metadataCache.metadataCacheVersion shouldBe "1.0" + metadataCache.refreshInterval.get shouldBe 600 + metadataCache.sourceTables shouldBe Array(FlintMetadataCache.mockTableName) + metadataCache.lastRefreshTime.get shouldBe 1234567890123L + } + + it should "construct from FlintMetadata excluding invalid fields" in { + // Set auto_refresh = false and lastRefreshCompleteTime = 0 + val content = + """ { + | "_meta": { + | "kind": "test_kind", + | "options": { + | "refresh_interval": "10 Minutes" + | } + | }, + | "properties": { + | "age": { + | "type": "integer" + | } + | } + | } + |""".stripMargin + val metadata = FlintOpenSearchIndexMetadataService + .deserialize(content) + .copy(latestLogEntry = Some(flintMetadataLogEntry.copy(lastRefreshCompleteTime = 0L))) + + val metadataCache = FlintMetadataCache(metadata) + metadataCache.metadataCacheVersion shouldBe "1.0" + metadataCache.refreshInterval shouldBe empty + metadataCache.sourceTables shouldBe Array(FlintMetadataCache.mockTableName) + metadataCache.lastRefreshTime shouldBe empty + } +} diff --git a/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/skipping/ApplyFlintSparkSkippingIndexSuite.scala b/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/skipping/ApplyFlintSparkSkippingIndexSuite.scala index f03116de9..d56c4e66f 100644 --- a/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/skipping/ApplyFlintSparkSkippingIndexSuite.scala +++ b/flint-spark-integration/src/test/scala/org/opensearch/flint/spark/skipping/ApplyFlintSparkSkippingIndexSuite.scala @@ -132,6 +132,8 @@ class ApplyFlintSparkSkippingIndexSuite extends FlintSuite with Matchers { new FlintMetadataLogEntry( "id", 0L, + 0L, + 0L, indexState, Map.empty[String, Any], "", diff --git a/integ-test/src/integration/scala/org/opensearch/flint/core/FlintMetadataLogITSuite.scala 
b/integ-test/src/integration/scala/org/opensearch/flint/core/FlintMetadataLogITSuite.scala index 9aeba7512..33702f23f 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/core/FlintMetadataLogITSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/core/FlintMetadataLogITSuite.scala @@ -25,10 +25,12 @@ class FlintMetadataLogITSuite extends OpenSearchTransactionSuite with Matchers { val testFlintIndex = "flint_test_index" val testLatestId: String = Base64.getEncoder.encodeToString(testFlintIndex.getBytes) - val testCreateTime = 1234567890123L + val testTimestamp = 1234567890123L val flintMetadataLogEntry = FlintMetadataLogEntry( testLatestId, - testCreateTime, + testTimestamp, + testTimestamp, + testTimestamp, ACTIVE, Map("seqNo" -> UNASSIGNED_SEQ_NO, "primaryTerm" -> UNASSIGNED_PRIMARY_TERM), "", @@ -85,8 +87,10 @@ class FlintMetadataLogITSuite extends OpenSearchTransactionSuite with Matchers { val latest = metadataLog.get.getLatest latest.isPresent shouldBe true latest.get.id shouldBe testLatestId - latest.get.createTime shouldBe testCreateTime + latest.get.createTime shouldBe testTimestamp latest.get.error shouldBe "" + latest.get.lastRefreshStartTime shouldBe testTimestamp + latest.get.lastRefreshCompleteTime shouldBe testTimestamp latest.get.properties.get("dataSourceName").get shouldBe testDataSourceName } diff --git a/integ-test/src/integration/scala/org/opensearch/flint/core/FlintTransactionITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/core/FlintTransactionITSuite.scala index 605e8e7fd..df5f4eec2 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/core/FlintTransactionITSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/core/FlintTransactionITSuite.scala @@ -24,6 +24,7 @@ class FlintTransactionITSuite extends OpenSearchTransactionSuite with Matchers { val testFlintIndex = "flint_test_index" val testLatestId: String = Base64.getEncoder.encodeToString(testFlintIndex.getBytes) + val testTimestamp = 1234567890123L var flintMetadataLogService: FlintMetadataLogService = _ override def beforeAll(): Unit = { @@ -40,6 +41,8 @@ class FlintTransactionITSuite extends OpenSearchTransactionSuite with Matchers { latest.id shouldBe testLatestId latest.state shouldBe EMPTY latest.createTime shouldBe 0L + latest.lastRefreshStartTime shouldBe 0L + latest.lastRefreshCompleteTime shouldBe 0L latest.error shouldBe "" latest.properties.get("dataSourceName").get shouldBe testDataSourceName true @@ -49,11 +52,12 @@ class FlintTransactionITSuite extends OpenSearchTransactionSuite with Matchers { } test("should preserve original values when transition") { - val testCreateTime = 1234567890123L createLatestLogEntry( FlintMetadataLogEntry( id = testLatestId, - createTime = testCreateTime, + createTime = testTimestamp, + lastRefreshStartTime = testTimestamp, + lastRefreshCompleteTime = testTimestamp, state = ACTIVE, Map("seqNo" -> UNASSIGNED_SEQ_NO, "primaryTerm" -> UNASSIGNED_PRIMARY_TERM), error = "", @@ -63,8 +67,10 @@ class FlintTransactionITSuite extends OpenSearchTransactionSuite with Matchers { .startTransaction(testFlintIndex) .initialLog(latest => { latest.id shouldBe testLatestId - latest.createTime shouldBe testCreateTime + latest.createTime shouldBe testTimestamp latest.error shouldBe "" + latest.lastRefreshStartTime shouldBe testTimestamp + latest.lastRefreshCompleteTime shouldBe testTimestamp latest.properties.get("dataSourceName").get shouldBe testDataSourceName true }) @@ -72,8 +78,10 @@ class 
FlintTransactionITSuite extends OpenSearchTransactionSuite with Matchers { .finalLog(latest => latest.copy(state = DELETED)) .commit(latest => { latest.id shouldBe testLatestId - latest.createTime shouldBe testCreateTime + latest.createTime shouldBe testTimestamp latest.error shouldBe "" + latest.lastRefreshStartTime shouldBe testTimestamp + latest.lastRefreshCompleteTime shouldBe testTimestamp latest.properties.get("dataSourceName").get shouldBe testDataSourceName }) @@ -112,7 +120,9 @@ class FlintTransactionITSuite extends OpenSearchTransactionSuite with Matchers { createLatestLogEntry( FlintMetadataLogEntry( id = testLatestId, - createTime = 1234567890123L, + createTime = testTimestamp, + lastRefreshStartTime = testTimestamp, + lastRefreshCompleteTime = testTimestamp, state = ACTIVE, Map("seqNo" -> UNASSIGNED_SEQ_NO, "primaryTerm" -> UNASSIGNED_PRIMARY_TERM), error = "", @@ -198,7 +208,9 @@ class FlintTransactionITSuite extends OpenSearchTransactionSuite with Matchers { createLatestLogEntry( FlintMetadataLogEntry( id = testLatestId, - createTime = 1234567890123L, + createTime = testTimestamp, + lastRefreshStartTime = testTimestamp, + lastRefreshCompleteTime = testTimestamp, state = ACTIVE, Map("seqNo" -> UNASSIGNED_SEQ_NO, "primaryTerm" -> UNASSIGNED_PRIMARY_TERM), error = "", @@ -240,6 +252,8 @@ class FlintTransactionITSuite extends OpenSearchTransactionSuite with Matchers { latest.id shouldBe testLatestId latest.state shouldBe EMPTY latest.createTime shouldBe 0L + latest.lastRefreshStartTime shouldBe 0L + latest.lastRefreshCompleteTime shouldBe 0L latest.error shouldBe "" latest.properties.get("dataSourceName").get shouldBe testDataSourceName true diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkTransactionITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkTransactionITSuite.scala index debb95370..e16d40f2a 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkTransactionITSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkTransactionITSuite.scala @@ -55,6 +55,8 @@ class FlintSparkTransactionITSuite extends OpenSearchTransactionSuite with Match latestLogEntry(testLatestId) should (contain("latestId" -> testLatestId) and contain("state" -> "active") and contain("jobStartTime" -> 0) + and contain("lastRefreshStartTime" -> 0) + and contain("lastRefreshCompleteTime" -> 0) and contain("dataSourceName" -> testDataSourceName)) implicit val formats: Formats = Serialization.formats(NoTypeHints) @@ -77,9 +79,25 @@ class FlintSparkTransactionITSuite extends OpenSearchTransactionSuite with Match .create() flint.refreshIndex(testFlintIndex) - val latest = latestLogEntry(testLatestId) + var latest = latestLogEntry(testLatestId) + val prevJobStartTime = latest("jobStartTime").asInstanceOf[Number].longValue() + val prevLastRefreshStartTime = latest("lastRefreshStartTime").asInstanceOf[Number].longValue() + val prevLastRefreshCompleteTime = + latest("lastRefreshCompleteTime").asInstanceOf[Number].longValue() latest should contain("state" -> "active") - latest("jobStartTime").asInstanceOf[Number].longValue() should be > 0L + prevJobStartTime should be > 0L + prevLastRefreshStartTime should be > 0L + prevLastRefreshCompleteTime should be > prevLastRefreshStartTime + + flint.refreshIndex(testFlintIndex) + latest = latestLogEntry(testLatestId) + val jobStartTime = latest("jobStartTime").asInstanceOf[Number].longValue() + val lastRefreshStartTime = 
latest("lastRefreshStartTime").asInstanceOf[Number].longValue() + val lastRefreshCompleteTime = + latest("lastRefreshCompleteTime").asInstanceOf[Number].longValue() + jobStartTime should be > prevLastRefreshCompleteTime + lastRefreshStartTime should be > prevLastRefreshCompleteTime + lastRefreshCompleteTime should be > lastRefreshStartTime } test("incremental refresh index") { @@ -97,9 +115,26 @@ class FlintSparkTransactionITSuite extends OpenSearchTransactionSuite with Match .create() flint.refreshIndex(testFlintIndex) - val latest = latestLogEntry(testLatestId) + var latest = latestLogEntry(testLatestId) + val prevJobStartTime = latest("jobStartTime").asInstanceOf[Number].longValue() + val prevLastRefreshStartTime = + latest("lastRefreshStartTime").asInstanceOf[Number].longValue() + val prevLastRefreshCompleteTime = + latest("lastRefreshCompleteTime").asInstanceOf[Number].longValue() latest should contain("state" -> "active") - latest("jobStartTime").asInstanceOf[Number].longValue() should be > 0L + prevJobStartTime should be > 0L + prevLastRefreshStartTime should be > 0L + prevLastRefreshCompleteTime should be > prevLastRefreshStartTime + + flint.refreshIndex(testFlintIndex) + latest = latestLogEntry(testLatestId) + val jobStartTime = latest("jobStartTime").asInstanceOf[Number].longValue() + val lastRefreshStartTime = latest("lastRefreshStartTime").asInstanceOf[Number].longValue() + val lastRefreshCompleteTime = + latest("lastRefreshCompleteTime").asInstanceOf[Number].longValue() + jobStartTime should be > prevLastRefreshCompleteTime + lastRefreshStartTime should be > prevLastRefreshCompleteTime + lastRefreshCompleteTime should be > lastRefreshStartTime } } @@ -142,6 +177,8 @@ class FlintSparkTransactionITSuite extends OpenSearchTransactionSuite with Match val latest = latestLogEntry(testLatestId) latest should contain("state" -> "refreshing") latest("jobStartTime").asInstanceOf[Number].longValue() should be > 0L + latest("lastRefreshStartTime").asInstanceOf[Number].longValue() shouldBe 0L + latest("lastRefreshCompleteTime").asInstanceOf[Number].longValue() shouldBe 0L } test("update auto refresh index to full refresh index") { @@ -153,13 +190,24 @@ class FlintSparkTransactionITSuite extends OpenSearchTransactionSuite with Match .create() flint.refreshIndex(testFlintIndex) + var latest = latestLogEntry(testLatestId) + val prevLastRefreshStartTime = latest("lastRefreshStartTime").asInstanceOf[Number].longValue() + val prevLastRefreshCompleteTime = + latest("lastRefreshCompleteTime").asInstanceOf[Number].longValue() + val index = flint.describeIndex(testFlintIndex).get val updatedIndex = flint .skippingIndex() .copyWithUpdate(index, FlintSparkIndexOptions(Map("auto_refresh" -> "false"))) flint.updateIndex(updatedIndex) - val latest = latestLogEntry(testLatestId) + latest = latestLogEntry(testLatestId) latest should contain("state" -> "active") + latest("lastRefreshStartTime") + .asInstanceOf[Number] + .longValue() shouldBe prevLastRefreshStartTime + latest("lastRefreshCompleteTime") + .asInstanceOf[Number] + .longValue() shouldBe prevLastRefreshCompleteTime } test("delete and vacuum index") { diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/metadatacache/FlintOpenSearchMetadataCacheWriterITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/metadatacache/FlintOpenSearchMetadataCacheWriterITSuite.scala new file mode 100644 index 000000000..c04209f06 --- /dev/null +++ 
b/integ-test/src/integration/scala/org/opensearch/flint/spark/metadatacache/FlintOpenSearchMetadataCacheWriterITSuite.scala @@ -0,0 +1,407 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.metadatacache + +import java.util.{Base64, List} + +import scala.collection.JavaConverters._ + +import com.stephenn.scalatest.jsonassert.JsonMatchers.matchJson +import org.json4s.native.JsonMethods._ +import org.opensearch.flint.common.FlintVersion.current +import org.opensearch.flint.common.metadata.FlintMetadata +import org.opensearch.flint.common.metadata.log.FlintMetadataLogEntry +import org.opensearch.flint.core.FlintOptions +import org.opensearch.flint.core.storage.{FlintOpenSearchClient, FlintOpenSearchIndexMetadataService} +import org.opensearch.flint.spark.{FlintSparkIndexOptions, FlintSparkSuite} +import org.opensearch.flint.spark.skipping.FlintSparkSkippingIndex.getSkippingIndexName +import org.scalatest.Entry +import org.scalatest.matchers.should.Matchers + +import org.apache.spark.sql.flint.config.FlintSparkConf + +class FlintOpenSearchMetadataCacheWriterITSuite extends FlintSparkSuite with Matchers { + + /** Lazy initialize after container started. */ + lazy val options = new FlintOptions(openSearchOptions.asJava) + lazy val flintClient = new FlintOpenSearchClient(options) + lazy val flintMetadataCacheWriter = new FlintOpenSearchMetadataCacheWriter(options) + lazy val flintIndexMetadataService = new FlintOpenSearchIndexMetadataService(options) + + private val testTable = "spark_catalog.default.metadatacache_test" + private val testFlintIndex = getSkippingIndexName(testTable) + private val testLatestId: String = Base64.getEncoder.encodeToString(testFlintIndex.getBytes) + private val testLastRefreshCompleteTime = 1234567890123L + private val flintMetadataLogEntry = FlintMetadataLogEntry( + testLatestId, + 0L, + 0L, + testLastRefreshCompleteTime, + FlintMetadataLogEntry.IndexState.ACTIVE, + Map.empty[String, Any], + "", + Map.empty[String, Any]) + + override def beforeAll(): Unit = { + super.beforeAll() + createPartitionedMultiRowAddressTable(testTable) + } + + override def afterAll(): Unit = { + sql(s"DROP TABLE $testTable") + super.afterAll() + } + + override def afterEach(): Unit = { + deleteTestIndex(testFlintIndex) + super.afterEach() + } + + test("build disabled metadata cache writer") { + FlintMetadataCacheWriterBuilder + .build(FlintSparkConf()) shouldBe a[FlintDisabledMetadataCacheWriter] + } + + test("build opensearch metadata cache writer") { + setFlintSparkConf(FlintSparkConf.METADATA_CACHE_WRITE, "true") + withMetadataCacheWriteEnabled { + FlintMetadataCacheWriterBuilder + .build(FlintSparkConf()) shouldBe a[FlintOpenSearchMetadataCacheWriter] + } + } + + test("serialize metadata cache to JSON") { + val expectedMetadataJson: String = s""" + | { + | "_meta": { + | "version": "${current()}", + | "name": "${testFlintIndex}", + | "kind": "test_kind", + | "source": "test_source_table", + | "indexedColumns": [ + | { + | "test_field": "spark_type" + | }], + | "options": { + | "auto_refresh": "true", + | "refresh_interval": "10 Minutes" + | }, + | "properties": { + | "metadataCacheVersion": "1.0", + | "refreshInterval": 600, + | "sourceTables": ["${FlintMetadataCache.mockTableName}"], + | "lastRefreshTime": ${testLastRefreshCompleteTime} + | }, + | "latestId": "${testLatestId}" + | }, + | "properties": { + | "test_field": { + | "type": "os_type" + | } + | } + | } + |""".stripMargin + val builder = new 
FlintMetadata.Builder + builder.name(testFlintIndex) + builder.kind("test_kind") + builder.source("test_source_table") + builder.addIndexedColumn(Map[String, AnyRef]("test_field" -> "spark_type").asJava) + builder.options( + Map("auto_refresh" -> "true", "refresh_interval" -> "10 Minutes") + .mapValues(_.asInstanceOf[AnyRef]) + .asJava) + builder.schema(Map[String, AnyRef]("test_field" -> Map("type" -> "os_type").asJava).asJava) + builder.latestLogEntry(flintMetadataLogEntry) + + val metadata = builder.build() + flintMetadataCacheWriter.serialize(metadata) should matchJson(expectedMetadataJson) + } + + test("write metadata cache to index mappings") { + val metadata = FlintOpenSearchIndexMetadataService + .deserialize("{}") + .copy(latestLogEntry = Some(flintMetadataLogEntry)) + flintClient.createIndex(testFlintIndex, metadata) + flintMetadataCacheWriter.updateMetadataCache(testFlintIndex, metadata) + + val properties = flintIndexMetadataService.getIndexMetadata(testFlintIndex).properties + properties should have size 3 + properties should contain allOf (Entry("metadataCacheVersion", "1.0"), + Entry("lastRefreshTime", testLastRefreshCompleteTime)) + properties + .get("sourceTables") + .asInstanceOf[List[String]] + .toArray should contain theSameElementsAs Array(FlintMetadataCache.mockTableName) + } + + test("write metadata cache to index mappings with refresh interval") { + val content = + """ { + | "_meta": { + | "kind": "test_kind", + | "options": { + | "auto_refresh": "true", + | "refresh_interval": "10 Minutes" + | } + | }, + | "properties": { + | "age": { + | "type": "integer" + | } + | } + | } + |""".stripMargin + val metadata = FlintOpenSearchIndexMetadataService + .deserialize(content) + .copy(latestLogEntry = Some(flintMetadataLogEntry)) + flintClient.createIndex(testFlintIndex, metadata) + flintMetadataCacheWriter.updateMetadataCache(testFlintIndex, metadata) + + val properties = flintIndexMetadataService.getIndexMetadata(testFlintIndex).properties + properties should have size 4 + properties should contain allOf (Entry("metadataCacheVersion", "1.0"), + Entry("refreshInterval", 600), + Entry("lastRefreshTime", testLastRefreshCompleteTime)) + properties + .get("sourceTables") + .asInstanceOf[List[String]] + .toArray should contain theSameElementsAs Array(FlintMetadataCache.mockTableName) + } + + test("exclude refresh interval in metadata cache when auto refresh is false") { + val content = + """ { + | "_meta": { + | "kind": "test_kind", + | "options": { + | "refresh_interval": "10 Minutes" + | } + | }, + | "properties": { + | "age": { + | "type": "integer" + | } + | } + | } + |""".stripMargin + val metadata = FlintOpenSearchIndexMetadataService + .deserialize(content) + .copy(latestLogEntry = Some(flintMetadataLogEntry)) + flintClient.createIndex(testFlintIndex, metadata) + flintMetadataCacheWriter.updateMetadataCache(testFlintIndex, metadata) + + val properties = flintIndexMetadataService.getIndexMetadata(testFlintIndex).properties + properties should have size 3 + properties should contain allOf (Entry("metadataCacheVersion", "1.0"), + Entry("lastRefreshTime", testLastRefreshCompleteTime)) + properties + .get("sourceTables") + .asInstanceOf[List[String]] + .toArray should contain theSameElementsAs Array(FlintMetadataCache.mockTableName) + } + + test("exclude last refresh time in metadata cache when index has not been refreshed") { + val metadata = FlintOpenSearchIndexMetadataService + .deserialize("{}") + .copy(latestLogEntry = 
Some(flintMetadataLogEntry.copy(lastRefreshCompleteTime = 0L))) + flintClient.createIndex(testFlintIndex, metadata) + flintMetadataCacheWriter.updateMetadataCache(testFlintIndex, metadata) + + val properties = flintIndexMetadataService.getIndexMetadata(testFlintIndex).properties + properties should have size 2 + properties should contain(Entry("metadataCacheVersion", "1.0")) + properties + .get("sourceTables") + .asInstanceOf[List[String]] + .toArray should contain theSameElementsAs Array(FlintMetadataCache.mockTableName) + } + + test("write metadata cache to index mappings and preserve other index metadata") { + val content = + """ { + | "_meta": { + | "kind": "test_kind" + | }, + | "properties": { + | "age": { + | "type": "integer" + | } + | } + | } + |""".stripMargin + + val metadata = FlintOpenSearchIndexMetadataService + .deserialize(content) + .copy(latestLogEntry = Some(flintMetadataLogEntry)) + flintClient.createIndex(testFlintIndex, metadata) + + flintIndexMetadataService.updateIndexMetadata(testFlintIndex, metadata) + flintMetadataCacheWriter.updateMetadataCache(testFlintIndex, metadata) + + flintIndexMetadataService.getIndexMetadata(testFlintIndex).kind shouldBe "test_kind" + flintIndexMetadataService.getIndexMetadata(testFlintIndex).name shouldBe empty + flintIndexMetadataService.getIndexMetadata(testFlintIndex).schema should have size 1 + var properties = flintIndexMetadataService.getIndexMetadata(testFlintIndex).properties + properties should have size 3 + properties should contain allOf (Entry("metadataCacheVersion", "1.0"), + Entry("lastRefreshTime", testLastRefreshCompleteTime)) + properties + .get("sourceTables") + .asInstanceOf[List[String]] + .toArray should contain theSameElementsAs Array(FlintMetadataCache.mockTableName) + + val newContent = + """ { + | "_meta": { + | "kind": "test_kind", + | "name": "test_name" + | }, + | "properties": { + | "age": { + | "type": "integer" + | } + | } + | } + |""".stripMargin + + val newMetadata = FlintOpenSearchIndexMetadataService + .deserialize(newContent) + .copy(latestLogEntry = Some(flintMetadataLogEntry)) + flintIndexMetadataService.updateIndexMetadata(testFlintIndex, newMetadata) + flintMetadataCacheWriter.updateMetadataCache(testFlintIndex, newMetadata) + + flintIndexMetadataService.getIndexMetadata(testFlintIndex).kind shouldBe "test_kind" + flintIndexMetadataService.getIndexMetadata(testFlintIndex).name shouldBe "test_name" + flintIndexMetadataService.getIndexMetadata(testFlintIndex).schema should have size 1 + properties = flintIndexMetadataService.getIndexMetadata(testFlintIndex).properties + properties should have size 3 + properties should contain allOf (Entry("metadataCacheVersion", "1.0"), + Entry("lastRefreshTime", testLastRefreshCompleteTime)) + properties + .get("sourceTables") + .asInstanceOf[List[String]] + .toArray should contain theSameElementsAs Array(FlintMetadataCache.mockTableName) + } + + Seq( + ( + "auto refresh index with external scheduler", + Map( + "auto_refresh" -> "true", + "scheduler_mode" -> "external", + "refresh_interval" -> "10 Minute", + "checkpoint_location" -> "s3a://test/"), + s""" + | { + | "metadataCacheVersion": "1.0", + | "refreshInterval": 600, + | "sourceTables": ["${FlintMetadataCache.mockTableName}"] + | } + |""".stripMargin), + ( + "full refresh index", + Map.empty[String, String], + s""" + | { + | "metadataCacheVersion": "1.0", + | "sourceTables": ["${FlintMetadataCache.mockTableName}"] + | } + |""".stripMargin), + ( + "incremental refresh index", + Map("incremental_refresh" -> 
"true", "checkpoint_location" -> "s3a://test/"), + s""" + | { + | "metadataCacheVersion": "1.0", + | "sourceTables": ["${FlintMetadataCache.mockTableName}"] + | } + |""".stripMargin)).foreach { case (refreshMode, optionsMap, expectedJson) => + test(s"write metadata cache for $refreshMode") { + withExternalSchedulerEnabled { + withMetadataCacheWriteEnabled { + withTempDir { checkpointDir => + // update checkpoint_location if available in optionsMap + val indexOptions = FlintSparkIndexOptions( + optionsMap + .get("checkpoint_location") + .map(_ => + optionsMap.updated("checkpoint_location", checkpointDir.getAbsolutePath)) + .getOrElse(optionsMap)) + + flint + .skippingIndex() + .onTable(testTable) + .addMinMax("age") + .options(indexOptions, testFlintIndex) + .create() + + var index = flint.describeIndex(testFlintIndex) + index shouldBe defined + val propertiesJson = + compact( + render( + parse( + flintMetadataCacheWriter.serialize( + index.get.metadata())) \ "_meta" \ "properties")) + propertiesJson should matchJson(expectedJson) + + flint.refreshIndex(testFlintIndex) + index = flint.describeIndex(testFlintIndex) + index shouldBe defined + val lastRefreshTime = + compact( + render( + parse( + flintMetadataCacheWriter.serialize( + index.get.metadata())) \ "_meta" \ "properties" \ "lastRefreshTime")).toLong + lastRefreshTime should be > 0L + } + } + } + } + } + + test("write metadata cache for auto refresh index with internal scheduler") { + withMetadataCacheWriteEnabled { + withTempDir { checkpointDir => + flint + .skippingIndex() + .onTable(testTable) + .addMinMax("age") + .options( + FlintSparkIndexOptions( + Map( + "auto_refresh" -> "true", + "scheduler_mode" -> "internal", + "refresh_interval" -> "10 Minute", + "checkpoint_location" -> checkpointDir.getAbsolutePath)), + testFlintIndex) + .create() + + var index = flint.describeIndex(testFlintIndex) + index shouldBe defined + val propertiesJson = + compact( + render(parse( + flintMetadataCacheWriter.serialize(index.get.metadata())) \ "_meta" \ "properties")) + propertiesJson should matchJson(s""" + | { + | "metadataCacheVersion": "1.0", + | "refreshInterval": 600, + | "sourceTables": ["${FlintMetadataCache.mockTableName}"] + | } + |""".stripMargin) + + flint.refreshIndex(testFlintIndex) + index = flint.describeIndex(testFlintIndex) + index shouldBe defined + compact(render(parse( + flintMetadataCacheWriter.serialize( + index.get.metadata())) \ "_meta" \ "properties")) should not include "lastRefreshTime" + } + } + } +} diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLAggregationsITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLAggregationsITSuite.scala index 55d3d0709..bcfe22764 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLAggregationsITSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLAggregationsITSuite.scala @@ -1122,4 +1122,166 @@ class FlintSparkPPLAggregationsITSuite comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) } + + test("test count() at the first of stats clause") { + val frame = sql(s""" + | source = $testTable | eval a = 1 | stats count() as cnt, sum(a) as sum, avg(a) as avg + | """.stripMargin) + assertSameRows(Seq(Row(4, 4, 1.0)), frame) + + val logicalPlan: LogicalPlan = frame.queryExecution.logical + val table = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test")) + val eval = Project(Seq(UnresolvedStar(None), 
Alias(Literal(1), "a")()), table) + val sum = + Alias( + UnresolvedFunction(Seq("SUM"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "sum")() + val avg = + Alias( + UnresolvedFunction(Seq("AVG"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "avg")() + val count = + Alias( + UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedStar(None)), isDistinct = false), + "cnt")() + val aggregate = Aggregate(Seq.empty, Seq(count, sum, avg), eval) + val expectedPlan = Project(Seq(UnresolvedStar(None)), aggregate) + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test count() in the middle of stats clause") { + val frame = sql(s""" + | source = $testTable | eval a = 1 | stats sum(a) as sum, count() as cnt, avg(a) as avg + | """.stripMargin) + assertSameRows(Seq(Row(4, 4, 1.0)), frame) + + val logicalPlan: LogicalPlan = frame.queryExecution.logical + val table = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test")) + val eval = Project(Seq(UnresolvedStar(None), Alias(Literal(1), "a")()), table) + val sum = + Alias( + UnresolvedFunction(Seq("SUM"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "sum")() + val avg = + Alias( + UnresolvedFunction(Seq("AVG"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "avg")() + val count = + Alias( + UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedStar(None)), isDistinct = false), + "cnt")() + val aggregate = Aggregate(Seq.empty, Seq(sum, count, avg), eval) + val expectedPlan = Project(Seq(UnresolvedStar(None)), aggregate) + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test count() at the end of stats clause") { + val frame = sql(s""" + | source = $testTable | eval a = 1 | stats sum(a) as sum, avg(a) as avg, count() as cnt + | """.stripMargin) + assertSameRows(Seq(Row(4, 1.0, 4)), frame) + + val logicalPlan: LogicalPlan = frame.queryExecution.logical + val table = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test")) + val eval = Project(Seq(UnresolvedStar(None), Alias(Literal(1), "a")()), table) + val sum = + Alias( + UnresolvedFunction(Seq("SUM"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "sum")() + val avg = + Alias( + UnresolvedFunction(Seq("AVG"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "avg")() + val count = + Alias( + UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedStar(None)), isDistinct = false), + "cnt")() + val aggregate = Aggregate(Seq.empty, Seq(sum, avg, count), eval) + val expectedPlan = Project(Seq(UnresolvedStar(None)), aggregate) + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test count() at the first of stats by clause") { + val frame = sql(s""" + | source = $testTable | eval a = 1 | stats count() as cnt, sum(a) as sum, avg(a) as avg by country + | """.stripMargin) + assertSameRows(Seq(Row(2, 2, 1.0, "Canada"), Row(2, 2, 1.0, "USA")), frame) + + val logicalPlan: LogicalPlan = frame.queryExecution.logical + val table = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test")) + val eval = Project(Seq(UnresolvedStar(None), Alias(Literal(1), "a")()), table) + val sum = + Alias( + UnresolvedFunction(Seq("SUM"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "sum")() + val avg = + Alias( + UnresolvedFunction(Seq("AVG"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "avg")() + val count = + Alias( + UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedStar(None)), isDistinct = false), + "cnt")() + val grouping = + Alias(UnresolvedAttribute("country"), "country")() 
+ val aggregate = Aggregate(Seq(grouping), Seq(count, sum, avg, grouping), eval) + val expectedPlan = Project(Seq(UnresolvedStar(None)), aggregate) + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test count() in the middle of stats by clause") { + val frame = sql(s""" + | source = $testTable | eval a = 1 | stats sum(a) as sum, count() as cnt, avg(a) as avg by country + | """.stripMargin) + assertSameRows(Seq(Row(2, 2, 1.0, "Canada"), Row(2, 2, 1.0, "USA")), frame) + + val logicalPlan: LogicalPlan = frame.queryExecution.logical + val table = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test")) + val eval = Project(Seq(UnresolvedStar(None), Alias(Literal(1), "a")()), table) + val sum = + Alias( + UnresolvedFunction(Seq("SUM"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "sum")() + val avg = + Alias( + UnresolvedFunction(Seq("AVG"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "avg")() + val count = + Alias( + UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedStar(None)), isDistinct = false), + "cnt")() + val grouping = + Alias(UnresolvedAttribute("country"), "country")() + val aggregate = Aggregate(Seq(grouping), Seq(sum, count, avg, grouping), eval) + val expectedPlan = Project(Seq(UnresolvedStar(None)), aggregate) + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } + + test("test count() at the end of stats by clause") { + val frame = sql(s""" + | source = $testTable | eval a = 1 | stats sum(a) as sum, avg(a) as avg, count() as cnt by country + | """.stripMargin) + assertSameRows(Seq(Row(2, 1.0, 2, "Canada"), Row(2, 1.0, 2, "USA")), frame) + + val logicalPlan: LogicalPlan = frame.queryExecution.logical + val table = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test")) + val eval = Project(Seq(UnresolvedStar(None), Alias(Literal(1), "a")()), table) + val sum = + Alias( + UnresolvedFunction(Seq("SUM"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "sum")() + val avg = + Alias( + UnresolvedFunction(Seq("AVG"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "avg")() + val count = + Alias( + UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedStar(None)), isDistinct = false), + "cnt")() + val grouping = + Alias(UnresolvedAttribute("country"), "country")() + val aggregate = Aggregate(Seq(grouping), Seq(sum, avg, count, grouping), eval) + val expectedPlan = Project(Seq(UnresolvedStar(None)), aggregate) + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } } diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLBetweenITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLBetweenITSuite.scala new file mode 100644 index 000000000..ce0be1409 --- /dev/null +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLBetweenITSuite.scala @@ -0,0 +1,136 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.ppl + +import java.sql.Timestamp + +import org.apache.spark.sql.QueryTest +import org.apache.spark.sql.streaming.StreamTest + +class FlintSparkPPLBetweenITSuite + extends QueryTest + with LogicalPlanTestUtils + with FlintPPLSuite + with StreamTest { + + /** Test table and index name */ + private val testTable = "spark_catalog.default.flint_ppl_test" + private val timeSeriesTestTable = "spark_catalog.default.flint_ppl_timeseries_test" + + override def beforeAll(): Unit = { + super.beforeAll() + + // Create test 
tables + createPartitionedStateCountryTable(testTable) + createTimeSeriesTable(timeSeriesTestTable) + } + + protected override def afterEach(): Unit = { + super.afterEach() + // Stop all streaming jobs if any + spark.streams.active.foreach { job => + job.stop() + job.awaitTermination() + } + } + + test("test between should return records between two integer values") { + val frame = sql(s""" + | source = $testTable | where age between 20 and 30 + | """.stripMargin) + + val results = frame.collect() + assert(results.length == 3) + assert(frame.columns.length == 6) + + results.foreach(row => { + val age = row.getAs[Int]("age") + assert(age >= 20 && age <= 30, s"Age $age is not between 20 and 30") + }) + } + + test("test between should return records between two integer computed values") { + val frame = sql(s""" + | source = $testTable | where age between 20 + 1 and 30 - 1 + | """.stripMargin) + + val results = frame.collect() + assert(results.length == 1) + assert(frame.columns.length == 6) + + results.foreach(row => { + val age = row.getAs[Int]("age") + assert(age >= 21 && age <= 29, s"Age $age is not between 21 and 29") + }) + } + + test("test between should return records NOT between two integer values") { + val frame = sql(s""" + | source = $testTable | where age NOT between 20 and 30 + | """.stripMargin) + + val results = frame.collect() + assert(results.length == 1) + assert(frame.columns.length == 6) + + results.foreach(row => { + val age = row.getAs[Int]("age") + assert(age < 20 || age > 30, s"Age $age is not between 20 and 30") + }) + } + + test("test between should return records where NOT between two integer values") { + val frame = sql(s""" + | source = $testTable | where NOT age between 20 and 30 + | """.stripMargin) + + val results = frame.collect() + assert(results.length == 1) + assert(frame.columns.length == 6) + + results.foreach(row => { + val age = row.getAs[Int]("age") + assert(age < 20 || age > 30, s"Age $age is not between 20 and 30") + }) + } + + test("test between should return records between two date values") { + val frame = sql(s""" + | source = $timeSeriesTestTable | where time between '2023-10-01 00:01:00' and '2023-10-01 00:10:00' + | """.stripMargin) + + val results = frame.collect() + assert(results.length == 2) + assert(frame.columns.length == 4) + + results.foreach(row => { + val ts = row.getAs[Timestamp]("time") + assert( + !ts.before(Timestamp.valueOf("2023-10-01 00:01:00")) || !ts.after( + Timestamp.valueOf("2023-10-01 00:01:00")), + s"Timestamp $ts is not between '2023-10-01 00:01:00' and '2023-10-01 00:10:00'") + }) + } + + test("test between should return records NOT between two date values") { + val frame = sql(s""" + | source = $timeSeriesTestTable | where time NOT between '2023-10-01 00:01:00' and '2023-10-01 00:10:00' + | """.stripMargin) + + val results = frame.collect() + assert(results.length == 3) + assert(frame.columns.length == 4) + + results.foreach(row => { + val ts = row.getAs[Timestamp]("time") + assert( + ts.before(Timestamp.valueOf("2023-10-01 00:01:00")) || ts.after( + Timestamp.valueOf("2023-10-01 00:01:00")), + s"Timestamp $ts is not between '2023-10-01 00:01:00' and '2023-10-01 00:10:00'") + }) + + } +} diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLEvalITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLEvalITSuite.scala index e10b2e2a6..c3dd1d533 100644 --- 
a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLEvalITSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLEvalITSuite.scala @@ -9,7 +9,7 @@ import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq import org.apache.spark.sql.{AnalysisException, QueryTest, Row} import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} -import org.apache.spark.sql.catalyst.expressions.{Alias, And, Ascending, CaseWhen, Descending, EqualTo, GreaterThanOrEqual, LessThan, Literal, SortOrder} +import org.apache.spark.sql.catalyst.expressions.{Alias, And, Ascending, CaseWhen, Descending, EqualTo, GreaterThanOrEqual, In, LessThan, Literal, SortOrder} import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, GlobalLimit, LocalLimit, LogicalPlan, Project, Sort} import org.apache.spark.sql.streaming.StreamTest @@ -688,4 +688,20 @@ class FlintSparkPPLEvalITSuite implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, String](_.getAs[String](0)) assert(results.sorted.sameElements(expectedResults.sorted)) } + + test("test IN expr in eval") { + val frame = sql(s""" + | source = $testTable | eval in = state in ('California', 'New York') | fields in + | """.stripMargin) + assertSameRows(Seq(Row(true), Row(true), Row(false), Row(false)), frame) + + val logicalPlan: LogicalPlan = frame.queryExecution.logical + val table = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test")) + val in = Alias( + In(UnresolvedAttribute("state"), Seq(Literal("California"), Literal("New York"))), + "in")() + val eval = Project(Seq(UnresolvedStar(None), in), table) + val expectedPlan = Project(Seq(UnresolvedAttribute("in")), eval) + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } } diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFiltersITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFiltersITSuite.scala index 14ef7ccc4..f2d7ee844 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFiltersITSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFiltersITSuite.scala @@ -7,7 +7,7 @@ package org.opensearch.flint.spark.ppl import org.apache.spark.sql.{QueryTest, Row} import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} -import org.apache.spark.sql.catalyst.expressions.{Alias, And, Ascending, CaseWhen, Descending, Divide, EqualTo, Floor, GreaterThan, LessThan, LessThanOrEqual, Literal, Multiply, Not, Or, SortOrder} +import org.apache.spark.sql.catalyst.expressions.{Alias, And, Ascending, CaseWhen, Descending, Divide, EqualTo, Floor, GreaterThan, In, LessThan, LessThanOrEqual, Literal, Multiply, Not, Or, SortOrder} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.streaming.StreamTest @@ -453,4 +453,18 @@ class FlintSparkPPLFiltersITSuite // Compare the two plans comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) } + + test("test NOT IN expr in filter") { + val frame = sql(s""" + | source = $testTable | where state not in ('California', 'New York') | fields state + | """.stripMargin) + assertSameRows(Seq(Row("Ontario"), Row("Quebec")), frame) + + val logicalPlan: LogicalPlan = frame.queryExecution.logical + val table = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test")) + 
val in = In(UnresolvedAttribute("state"), Seq(Literal("California"), Literal("New York"))) + val filter = Filter(Not(in), table) + val expectedPlan = Project(Seq(UnresolvedAttribute("state")), filter) + comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) + } } diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 index fc92b0a14..82fdeb42f 100644 --- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 +++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 @@ -74,13 +74,9 @@ INDEX: 'INDEX'; D: 'D'; DESC: 'DESC'; DATASOURCES: 'DATASOURCES'; -VALUE: 'VALUE'; USING: 'USING'; WITH: 'WITH'; -// CLAUSE KEYWORDS -SORTBY: 'SORTBY'; - // FIELD KEYWORDS AUTO: 'AUTO'; STR: 'STR'; @@ -396,6 +392,7 @@ LIKE: 'LIKE'; ISNULL: 'ISNULL'; ISNOTNULL: 'ISNOTNULL'; ISPRESENT: 'ISPRESENT'; +BETWEEN: 'BETWEEN'; // FLOWCONTROL FUNCTIONS IFNULL: 'IFNULL'; diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 index 2d9357ec5..48984b3a5 100644 --- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 +++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 @@ -55,6 +55,35 @@ commands | fieldsummaryCommand ; +commandName + : SEARCH + | DESCRIBE + | SHOW + | AD + | ML + | KMEANS + | WHERE + | CORRELATE + | JOIN + | FIELDS + | STATS + | EVENTSTATS + | DEDUP + | EXPLAIN + | SORT + | HEAD + | TOP + | RARE + | EVAL + | GROK + | PARSE + | PATTERNS + | LOOKUP + | RENAME + | FILLNULL + | FIELDSUMMARY + ; + searchCommand : (SEARCH)? fromClause # searchFrom | (SEARCH)? fromClause logicalExpression # searchFromFilter @@ -360,14 +389,6 @@ statsFunctionName | STDDEV_POP ; -takeAggFunction - : TAKE LT_PRTHS fieldExpression (COMMA size = integerLiteral)? RT_PRTHS - ; - -percentileAggFunction - : PERCENTILE LESS value = integerLiteral GREATER LT_PRTHS aggField = fieldExpression RT_PRTHS - ; - // expressions expression : logicalExpression @@ -385,7 +406,8 @@ logicalExpression comparisonExpression : left = valueExpression comparisonOperator right = valueExpression # compareExpr - | valueExpression IN valueList # inExpr + | valueExpression NOT? IN valueList # inExpr + | expr1 = functionArg NOT? 
BETWEEN expr2 = functionArg AND expr3 = functionArg # between ; valueExpressionList @@ -999,45 +1021,37 @@ keywordsCanBeId | mathematicalFunctionName | positionFunctionName | cryptographicFunctionName - // commands - | SEARCH - | DESCRIBE - | SHOW - | FROM - | WHERE - | CORRELATE - | FIELDS - | RENAME - | STATS - | DEDUP - | SORT - | EVAL - | HEAD - | TOP - | RARE - | PARSE - | METHOD - | REGEX - | PUNCT - | GROK - | PATTERN - | PATTERNS - | NEW_FIELD - | KMEANS - | AD - | ML - | EXPLAIN + | singleFieldRelevanceFunctionName + | multiFieldRelevanceFunctionName + | commandName + | comparisonOperator + | explainMode + | correlationType // commands assist keywords + | IN | SOURCE | INDEX | DESC | DATASOURCES - // CLAUSEKEYWORDS - | SORTBY - // FIELDKEYWORDSAUTO + | AUTO | STR | IP | NUM + | FROM + | PATTERN + | NEW_FIELD + | SCOPE + | MAPPING + | WITH + | USING + | CAST + | GET_FORMAT + | EXTRACT + | INTERVAL + | PLUS + | MINUS + | INCLUDEFIELDS + | NULLS // ARGUMENT KEYWORDS | KEEPEMPTY | CONSECUTIVE @@ -1060,27 +1074,21 @@ keywordsCanBeId | TRAINING_DATA_SIZE | ANOMALY_SCORE_THRESHOLD // AGGREGATIONS - | AVG - | COUNT + | statsFunctionName | DISTINCT_COUNT + | PERCENTILE + | PERCENTILE_APPROX | ESTDC | ESTDC_ERROR - | MAX | MEAN | MEDIAN - | MIN | MODE | RANGE | STDEV | STDEVP - | SUM | SUMSQ | VAR_SAMP | VAR_POP - | STDDEV_SAMP - | STDDEV_POP - | PERCENTILE - | PERCENTILE_APPROX | TAKE | FIRST | LAST @@ -1098,10 +1106,6 @@ keywordsCanBeId | SPARKLINE | C | DC - // FIELD SUMMARY - | FIELDSUMMARY - | INCLUDEFIELDS - | NULLS // JOIN TYPE | OUTER | INNER @@ -1111,4 +1115,5 @@ keywordsCanBeId | FULL | SEMI | ANTI + | BETWEEN ; diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/FieldsMapping.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/FieldsMapping.java index a53b1e130..6226a7f6b 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/FieldsMapping.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/FieldsMapping.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.sql.ast.expression; import lombok.Getter; diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/IsEmpty.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/IsEmpty.java index 0374d1c90..1691992a6 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/IsEmpty.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/IsEmpty.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.sql.ast.expression; import com.google.common.collect.ImmutableList; diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Scope.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Scope.java index fb18b8c1e..1ecea8779 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Scope.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Scope.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.sql.ast.expression; import lombok.EqualsAndHashCode; diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Correlation.java 
b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Correlation.java index f7bd8ad9a..cccc163f6 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Correlation.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Correlation.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.sql.ast.tree; import com.google.common.collect.ImmutableList; diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/FillNull.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/FillNull.java index 19bfea668..a1d591b9f 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/FillNull.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/FillNull.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.sql.ast.tree; import lombok.Getter; diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/common/antlr/Parser.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/common/antlr/Parser.java index 7962f53ef..02faf4f3d 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/common/antlr/Parser.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/common/antlr/Parser.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.sql.common.antlr; import org.antlr.v4.runtime.tree.ParseTree; diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java index 32ed2c92f..441287ddb 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java @@ -15,7 +15,10 @@ import org.apache.spark.sql.catalyst.expressions.Descending$; import org.apache.spark.sql.catalyst.expressions.Exists$; import org.apache.spark.sql.catalyst.expressions.Expression; +import org.apache.spark.sql.catalyst.expressions.In$; +import org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual; import org.apache.spark.sql.catalyst.expressions.InSubquery$; +import org.apache.spark.sql.catalyst.expressions.LessThanOrEqual; import org.apache.spark.sql.catalyst.expressions.ListQuery$; import org.apache.spark.sql.catalyst.expressions.NamedExpression; import org.apache.spark.sql.catalyst.expressions.Predicate; @@ -34,6 +37,7 @@ import org.opensearch.sql.ast.expression.AllFields; import org.opensearch.sql.ast.expression.And; import org.opensearch.sql.ast.expression.Argument; +import org.opensearch.sql.ast.expression.Between; import org.opensearch.sql.ast.expression.BinaryExpression; import org.opensearch.sql.ast.expression.Case; import org.opensearch.sql.ast.expression.Compare; @@ -697,11 +701,7 @@ public Expression visitCorrelationMapping(FieldsMapping node, CatalystPlanContex @Override public Expression visitAllFields(AllFields node, CatalystPlanContext context) { - // Case of aggregation step - no start projection can be added - if (context.getNamedParseExpressions().isEmpty()) { - // Create an UnresolvedStar for all-fields projection - context.getNamedParseExpressions().push(UnresolvedStar$.MODULE$.apply(Option.>empty())); - } + 
context.getNamedParseExpressions().push(UnresolvedStar$.MODULE$.apply(Option.>empty())); return context.getNamedParseExpressions().peek(); } @@ -769,7 +769,13 @@ public Expression visitDedupe(Dedupe node, CatalystPlanContext context) { @Override public Expression visitIn(In node, CatalystPlanContext context) { - throw new IllegalStateException("Not Supported operation : In"); + node.getField().accept(this, context); + Expression value = context.popNamedParseExpressions().get(); + List list = node.getValueList().stream().map( expression -> { + expression.accept(this, context); + return context.popNamedParseExpressions().get(); + }).collect(Collectors.toList()); + return context.getNamedParseExpressions().push(In$.MODULE$.apply(value, seq(list))); } @Override @@ -864,5 +870,14 @@ public Expression visitExistsSubquery(ExistsSubquery node, CatalystPlanContext c Option.empty()); return context.getNamedParseExpressions().push(existsSubQuery); } + + @Override + public Expression visitBetween(Between node, CatalystPlanContext context) { + Expression value = analyze(node.getValue(), context); + Expression lower = analyze(node.getLowerBound(), context); + Expression upper = analyze(node.getUpperBound(), context); + context.retainAllNamedParseExpressions(p -> p); + return context.getNamedParseExpressions().push(new org.apache.spark.sql.catalyst.expressions.And(new GreaterThanOrEqual(value, lower), new LessThanOrEqual(value, upper))); + } } } diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java index ea51ca7a1..6a0c80c16 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java @@ -17,13 +17,14 @@ import org.opensearch.sql.ast.expression.And; import org.opensearch.sql.ast.expression.Argument; import org.opensearch.sql.ast.expression.AttributeList; +import org.opensearch.sql.ast.expression.Between; import org.opensearch.sql.ast.expression.Case; import org.opensearch.sql.ast.expression.Compare; import org.opensearch.sql.ast.expression.DataType; import org.opensearch.sql.ast.expression.EqualTo; import org.opensearch.sql.ast.expression.Field; -import org.opensearch.sql.ast.expression.FieldList; import org.opensearch.sql.ast.expression.Function; +import org.opensearch.sql.ast.expression.In; import org.opensearch.sql.ast.expression.subquery.ExistsSubquery; import org.opensearch.sql.ast.expression.subquery.InSubquery; import org.opensearch.sql.ast.expression.Interval; @@ -41,7 +42,6 @@ import org.opensearch.sql.ast.expression.UnresolvedExpression; import org.opensearch.sql.ast.expression.When; import org.opensearch.sql.ast.expression.Xor; -import org.opensearch.sql.ast.tree.UnresolvedPlan; import org.opensearch.sql.common.utils.StringUtils; import org.opensearch.sql.ppl.utils.ArgumentFactory; @@ -53,8 +53,6 @@ import java.util.stream.IntStream; import java.util.stream.Stream; -import static org.opensearch.flint.spark.ppl.OpenSearchPPLParser.INCLUDEFIELDS; -import static org.opensearch.flint.spark.ppl.OpenSearchPPLParser.NULLS; import static org.opensearch.sql.expression.function.BuiltinFunctionName.EQUAL; import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_NOT_NULL; import static org.opensearch.sql.expression.function.BuiltinFunctionName.IS_NULL; @@ -80,7 +78,7 @@ public void 
setAstBuilder(AstBuilder astBuilder) { /** * The function name mapping between fronted and core engine. */ - private static Map FUNCTION_NAME_MAPPING = + private static final Map FUNCTION_NAME_MAPPING = new ImmutableMap.Builder() .put("isnull", IS_NULL.getName().getFunctionName()) .put("isnotnull", IS_NOT_NULL.getName().getFunctionName()) @@ -216,14 +214,6 @@ public UnresolvedExpression visitDistinctCountFunctionCall(OpenSearchPPLParser.D return new AggregateFunction("count", visit(ctx.valueExpression()), true); } - @Override - public UnresolvedExpression visitPercentileAggFunction(OpenSearchPPLParser.PercentileAggFunctionContext ctx) { - return new AggregateFunction( - ctx.PERCENTILE().getText(), - visit(ctx.aggField), - Collections.singletonList(new Argument("rank", (Literal) visit(ctx.value)))); - } - @Override public UnresolvedExpression visitPercentileFunctionCall(OpenSearchPPLParser.PercentileFunctionCallContext ctx) { return new AggregateFunction( @@ -288,6 +278,12 @@ public UnresolvedExpression visitConvertedDataType(OpenSearchPPLParser.Converted return new Literal(ctx.getText(), DataType.STRING); } + @Override + public UnresolvedExpression visitBetween(OpenSearchPPLParser.BetweenContext ctx) { + UnresolvedExpression betweenExpr = new Between(visit(ctx.expr1),visit(ctx.expr2),visit(ctx.expr3)); + return ctx.NOT() != null ? new Not(betweenExpr) : betweenExpr; + } + private Function buildFunction( String functionName, List args) { return new Function( @@ -418,6 +414,13 @@ public UnresolvedExpression visitExistsSubqueryExpr(OpenSearchPPLParser.ExistsSu return new ExistsSubquery(astBuilder.visitSubSearch(ctx.subSearch())); } + @Override + public UnresolvedExpression visitInExpr(OpenSearchPPLParser.InExprContext ctx) { + UnresolvedExpression expr = new In(visit(ctx.valueExpression()), + ctx.valueList().literalValue().stream().map(this::visit).collect(Collectors.toList())); + return ctx.NOT() != null ? 
new Not(expr) : expr; + } + private QualifiedName visitIdentifiers(List ctx) { return new QualifiedName( ctx.stream() diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/JoinSpecTransformer.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/JoinSpecTransformer.java index 8a6bafc53..f6f59c009 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/JoinSpecTransformer.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/JoinSpecTransformer.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.sql.ppl.utils; import org.apache.spark.sql.catalyst.expressions.Expression; diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ParseStrategy.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ParseStrategy.java index 6cdb2f6b2..8775d077b 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ParseStrategy.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ParseStrategy.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.sql.ppl.utils; import org.apache.spark.sql.catalyst.analysis.UnresolvedStar$; diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/RelationUtils.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/RelationUtils.java index 7be7f1f45..1dc7b9878 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/RelationUtils.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/RelationUtils.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.sql.ppl.utils; import org.apache.spark.sql.catalyst.TableIdentifier; diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanAggregationQueriesTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanAggregationQueriesTranslatorTestSuite.scala index 03d7f0ab0..9946bff6a 100644 --- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanAggregationQueriesTranslatorTestSuite.scala +++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanAggregationQueriesTranslatorTestSuite.scala @@ -959,4 +959,58 @@ class PPLLogicalPlanAggregationQueriesTranslatorTestSuite comparePlans(expectedPlan, logPlan, false) } + + test("test count() as the last aggregator in stats clause") { + val context = new CatalystPlanContext + val logPlan = planTransformer.visit( + plan( + pplParser, + "source = table | eval a = 1 | stats sum(a) as sum, avg(a) as avg, count() as cnt"), + context) + val tableRelation = UnresolvedRelation(Seq("table")) + val eval = Project(Seq(UnresolvedStar(None), Alias(Literal(1), "a")()), tableRelation) + val sum = + Alias( + UnresolvedFunction(Seq("SUM"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "sum")() + val avg = + Alias( + UnresolvedFunction(Seq("AVG"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "avg")() + val count = + Alias( + UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedStar(None)), isDistinct = false), + "cnt")() + val aggregate = Aggregate(Seq.empty, Seq(sum, avg, count), eval) + val expectedPlan = Project(Seq(UnresolvedStar(None)), aggregate) + comparePlans(expectedPlan, 
logPlan, checkAnalysis = false) + } + + test("test count() as the last aggregator in stats by clause") { + val context = new CatalystPlanContext + val logPlan = planTransformer.visit( + plan( + pplParser, + "source = table | eval a = 1 | stats sum(a) as sum, avg(a) as avg, count() as cnt by country"), + context) + val tableRelation = UnresolvedRelation(Seq("table")) + val eval = Project(Seq(UnresolvedStar(None), Alias(Literal(1), "a")()), tableRelation) + val sum = + Alias( + UnresolvedFunction(Seq("SUM"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "sum")() + val avg = + Alias( + UnresolvedFunction(Seq("AVG"), Seq(UnresolvedAttribute("a")), isDistinct = false), + "avg")() + val count = + Alias( + UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedStar(None)), isDistinct = false), + "cnt")() + val grouping = + Alias(UnresolvedAttribute("country"), "country")() + val aggregate = Aggregate(Seq(grouping), Seq(sum, avg, count, grouping), eval) + val expectedPlan = Project(Seq(UnresolvedStar(None)), aggregate) + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } } diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanBetweenExpressionTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanBetweenExpressionTranslatorTestSuite.scala new file mode 100644 index 000000000..6defcb766 --- /dev/null +++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanBetweenExpressionTranslatorTestSuite.scala @@ -0,0 +1,55 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.flint.spark.ppl + +import org.opensearch.flint.spark.ppl.PlaneUtils.plan +import org.opensearch.sql.ppl.{CatalystPlanContext, CatalystQueryPlanVisitor} +import org.scalatest.matchers.should.Matchers + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation, UnresolvedStar} +import org.apache.spark.sql.catalyst.expressions.{And, GreaterThanOrEqual, LessThanOrEqual, Literal} +import org.apache.spark.sql.catalyst.plans.PlanTest +import org.apache.spark.sql.catalyst.plans.logical._ + +class PPLLogicalPlanBetweenExpressionTranslatorTestSuite + extends SparkFunSuite + with PlanTest + with LogicalPlanTestUtils + with Matchers { + + private val planTransformer = new CatalystQueryPlanVisitor() + private val pplParser = new PPLSyntaxParser() + + test("test between expression") { + // if successful build ppl logical plan and translate to catalyst logical plan + val context = new CatalystPlanContext + val logPlan = { + planTransformer.visit( + plan( + pplParser, + "source = table | where datetime_field between '2024-09-10' and '2024-09-15'"), + context) + } + // SQL: SELECT * FROM table WHERE datetime_field BETWEEN '2024-09-10' AND '2024-09-15' + val star = Seq(UnresolvedStar(None)) + + val datetime_field = UnresolvedAttribute("datetime_field") + val tableRelation = UnresolvedRelation(Seq("table")) + + val lowerBound = Literal("2024-09-10") + val upperBound = Literal("2024-09-15") + val betweenCondition = And( + GreaterThanOrEqual(datetime_field, lowerBound), + LessThanOrEqual(datetime_field, upperBound)) + + val filterPlan = Filter(betweenCondition, tableRelation) + val expectedPlan = Project(star, filterPlan) + + comparePlans(expectedPlan, logPlan, false) + } + +} diff --git 
a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanEvalTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanEvalTranslatorTestSuite.scala index 3e2b3cc30..ba0d78670 100644 --- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanEvalTranslatorTestSuite.scala +++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanEvalTranslatorTestSuite.scala @@ -12,7 +12,7 @@ import org.scalatest.matchers.should.Matchers import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} -import org.apache.spark.sql.catalyst.expressions.{Alias, Descending, ExprId, Literal, NamedExpression, SortOrder} +import org.apache.spark.sql.catalyst.expressions.{Alias, Descending, ExprId, In, Literal, NamedExpression, SortOrder} import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{Project, Sort} @@ -200,4 +200,17 @@ class PPLLogicalPlanEvalTranslatorTestSuite val expectedPlan = Project(projectList, UnresolvedRelation(Seq("t"))) comparePlans(expectedPlan, logPlan, checkAnalysis = false) } + + test("test IN expr in eval") { + val context = new CatalystPlanContext + val logPlan = + planTransformer.visit( + plan(pplParser, "source=t | eval in = a in ('Hello', 'World') | fields in"), + context) + + val in = Alias(In(UnresolvedAttribute("a"), Seq(Literal("Hello"), Literal("World"))), "in")() + val eval = Project(Seq(UnresolvedStar(None), in), UnresolvedRelation(Seq("t"))) + val expectedPlan = Project(Seq(UnresolvedAttribute("in")), eval) + comparePlans(expectedPlan, logPlan, checkAnalysis = false) + } } diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFiltersTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFiltersTranslatorTestSuite.scala index 20809db95..fe9304e22 100644 --- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFiltersTranslatorTestSuite.scala +++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFiltersTranslatorTestSuite.scala @@ -11,7 +11,7 @@ import org.scalatest.matchers.should.Matchers import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation, UnresolvedStar} -import org.apache.spark.sql.catalyst.expressions.{And, Ascending, EqualTo, GreaterThan, GreaterThanOrEqual, LessThan, LessThanOrEqual, Literal, Not, Or, SortOrder} +import org.apache.spark.sql.catalyst.expressions.{And, Ascending, EqualTo, GreaterThan, GreaterThanOrEqual, In, LessThan, LessThanOrEqual, Literal, Not, Or, SortOrder} import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ @@ -233,4 +233,15 @@ class PPLLogicalPlanFiltersTranslatorTestSuite val expectedPlan = Project(Seq(UnresolvedStar(None)), filter) comparePlans(expectedPlan, logPlan, false) } + + test("test IN expr in filter") { + val context = new CatalystPlanContext + val logPlan = + planTransformer.visit(plan(pplParser, "source=t | where a in ('Hello', 'World')"), context) + + val in = In(UnresolvedAttribute("a"), Seq(Literal("Hello"), Literal("World"))) + val filter = Filter(in, UnresolvedRelation(Seq("t"))) + val expectedPlan = Project(Seq(UnresolvedStar(None)), filter) + comparePlans(expectedPlan, 
logPlan, checkAnalysis = false) + } } diff --git a/spark-sql-application/src/main/java/org/opensearch/sql/FlintDelegatingSessionCatalog.java b/spark-sql-application/src/main/java/org/opensearch/sql/FlintDelegatingSessionCatalog.java index 6ed9fa980..40f4aee46 100644 --- a/spark-sql-application/src/main/java/org/opensearch/sql/FlintDelegatingSessionCatalog.java +++ b/spark-sql-application/src/main/java/org/opensearch/sql/FlintDelegatingSessionCatalog.java @@ -1,3 +1,8 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + package org.opensearch.sql; import org.apache.spark.sql.SparkSession;
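
For readers following the new BETWEEN support end to end: the parser rule and `visitBetween` translation above rewrite `expr1 [NOT] BETWEEN expr2 AND expr3` into an inclusive pair of comparisons on the Catalyst side. Below is a minimal Scala sketch of that expected plan shape (attribute, literal, and table names are illustrative), mirroring what the new translator test suite asserts:

```scala
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation, UnresolvedStar}
import org.apache.spark.sql.catalyst.expressions.{And, GreaterThanOrEqual, LessThanOrEqual, Literal, Not}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, Project}

// `a between 1 and 4` is rewritten to a >= 1 AND a <= 4 (both bounds inclusive)
val between = And(
  GreaterThanOrEqual(UnresolvedAttribute("a"), Literal(1)),
  LessThanOrEqual(UnresolvedAttribute("a"), Literal(4)))

// `a not between 1 and 4` wraps the same conjunction in Not(...)
val notBetween = Not(between)

// plan shape for `source = table | where a between 1 and 4`
val expectedPlan = Project(
  Seq(UnresolvedStar(None)),
  Filter(between, UnresolvedRelation(Seq("table"))))
```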
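The `in` / `not in` additions follow the same pattern, mapping onto Catalyst's `In` predicate and wrapping it in `Not` for the negated form. A small sketch of the plan shapes asserted in the filter and eval test suites (field, value, and table names are illustrative):

```scala
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation, UnresolvedStar}
import org.apache.spark.sql.catalyst.expressions.{Alias, In, Literal, Not}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, Project}

val table = UnresolvedRelation(Seq("t"))
val in = In(UnresolvedAttribute("a"), Seq(Literal("Hello"), Literal("World")))

// `... | where a in ('Hello', 'World')` filters on the In predicate
val whereIn = Project(Seq(UnresolvedStar(None)), Filter(in, table))

// `... | where a not in ('Hello', 'World')` negates the same predicate
val whereNotIn = Project(Seq(UnresolvedStar(None)), Filter(Not(in), table))

// `... | eval f = a in ('Hello', 'World') | fields f` projects the predicate as a column
val evalIn = Project(
  Seq(UnresolvedAttribute("f")),
  Project(Seq(UnresolvedStar(None), Alias(in, "f")()), table))
```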
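The timestamp cases in `FlintSparkPPLBetweenITSuite` intend an inclusive-bounds range check on `java.sql.Timestamp`. As a standalone reference, that check can be expressed as in the sketch below (bounds taken from the test data, helper names are illustrative):

```scala
import java.sql.Timestamp

val lower = Timestamp.valueOf("2023-10-01 00:01:00")
val upper = Timestamp.valueOf("2023-10-01 00:10:00")

// ts lies in the inclusive range [lower, upper] iff it is neither before lower nor after upper
def withinRange(ts: Timestamp): Boolean = !ts.before(lower) && !ts.after(upper)

// the complement, used when asserting NOT BETWEEN results
def outsideRange(ts: Timestamp): Boolean = ts.before(lower) || ts.after(upper)
```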