diff --git a/.github/workflows/snapshot-publish.yml b/.github/workflows/snapshot-publish.yml
index c9a2efa7b..0ac112b62 100644
--- a/.github/workflows/snapshot-publish.yml
+++ b/.github/workflows/snapshot-publish.yml
@@ -27,9 +27,6 @@ jobs:
distribution: 'temurin'
java-version: 11
- - name: Set up SBT
- uses: sbt/setup-sbt@v1
-
- name: Publish to Local Maven
run: |
sbt standaloneCosmetic/publishM2
diff --git a/.github/workflows/test-and-build-workflow.yml b/.github/workflows/test-and-build-workflow.yml
index 216f8292d..e3b2b20f4 100644
--- a/.github/workflows/test-and-build-workflow.yml
+++ b/.github/workflows/test-and-build-workflow.yml
@@ -22,9 +22,6 @@ jobs:
distribution: 'temurin'
java-version: 11
- - name: Set up SBT
- uses: sbt/setup-sbt@v1
-
- name: Style check
run: sbt scalafmtCheckAll
diff --git a/README.md b/README.md
index db3790e64..2a3754e6c 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,8 @@ bin/spark-shell --packages "org.opensearch:opensearch-spark-standalone_2.12:0.7.
To build and run this PPL in Spark, you can run (requires Java 11):
```
-sbt clean sparkPPLCosmetic/publishM2
+
+
```
Then add org.opensearch:opensearch-spark-ppl_2.12 when run spark application, for example,
diff --git a/build.sbt b/build.sbt
index 365b88aa3..131fb2347 100644
--- a/build.sbt
+++ b/build.sbt
@@ -2,7 +2,8 @@
* Copyright OpenSearch Contributors
* SPDX-License-Identifier: Apache-2.0
*/
-import Dependencies.*
+import Dependencies._
+import sbtassembly.AssemblyPlugin.autoImport.ShadeRule
lazy val scala212 = "2.12.14"
lazy val sparkVersion = "3.5.1"
@@ -37,11 +38,6 @@ ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml"
*/
ThisBuild / Test / parallelExecution := false
-/**
- * Set the parallelism of forked tests to 4 to accelerate integration test
- */
-concurrentRestrictions in Global := Seq(Tags.limit(Tags.ForkedTestGroup, 4))
-
// Run as part of compile task.
lazy val compileScalastyle = taskKey[Unit]("compileScalastyle")
@@ -194,6 +190,9 @@ lazy val pplSparkIntegration = (project in file("ppl-spark-integration"))
"com.github.sbt" % "junit-interface" % "0.13.3" % "test",
"org.projectlombok" % "lombok" % "1.18.30",
"com.github.seancfoley" % "ipaddress" % "5.5.1",
+ "org.apache.commons" % "commons-lang3" % "3.17.0",
+ "org.apache.commons" % "commons-csv" % "1.12.0",
+ "com.fasterxml.jackson.core" % "jackson-annotations" % "2.14.2",
),
libraryDependencies ++= deps(sparkVersion),
// ANTLR settings
@@ -278,29 +277,13 @@ lazy val integtest = (project in file("integ-test"))
IntegrationTest / javaSource := baseDirectory.value / "src/integration/java",
IntegrationTest / scalaSource := baseDirectory.value / "src/integration/scala",
IntegrationTest / resourceDirectory := baseDirectory.value / "src/integration/resources",
- IntegrationTest / parallelExecution := true, // enable parallel execution
- IntegrationTest / testForkedParallel := false, // disable forked parallel execution to avoid duplicate spark context in the same JVM
+ IntegrationTest / parallelExecution := false,
IntegrationTest / fork := true,
- IntegrationTest / testGrouping := {
- val tests = (IntegrationTest / definedTests).value
- val forkOptions = ForkOptions()
- val groups = tests.grouped(tests.size / 4 + 1).zipWithIndex.map { case (group, index) =>
- val groupName = s"group-${index + 1}"
- new Tests.Group(
- name = groupName,
- tests = group,
- runPolicy = Tests.SubProcess(
- forkOptions.withRunJVMOptions(forkOptions.runJVMOptions ++
- Seq(s"-Djava.io.tmpdir=${baseDirectory.value}/integ-test/target/tmp/$groupName")))
- )
- }
- groups.toSeq
- }
)),
inConfig(AwsIntegrationTest)(Defaults.testSettings ++ Seq(
AwsIntegrationTest / javaSource := baseDirectory.value / "src/aws-integration/java",
AwsIntegrationTest / scalaSource := baseDirectory.value / "src/aws-integration/scala",
- AwsIntegrationTest / parallelExecution := true,
+ AwsIntegrationTest / parallelExecution := false,
AwsIntegrationTest / fork := true,
)),
libraryDependencies ++= Seq(
diff --git a/docker/apache-spark-sample/.env b/docker/apache-spark-sample/.env
deleted file mode 100644
index a047df5ba..000000000
--- a/docker/apache-spark-sample/.env
+++ /dev/null
@@ -1,4 +0,0 @@
-MASTER_UI_PORT=8080
-MASTER_PORT=7077
-UI_PORT=4040
-PPL_JAR=../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar
diff --git a/docker/apache-spark-sample/docker-compose.yml b/docker/apache-spark-sample/docker-compose.yml
deleted file mode 100644
index df2da6d52..000000000
--- a/docker/apache-spark-sample/docker-compose.yml
+++ /dev/null
@@ -1,41 +0,0 @@
-services:
- spark:
- image: bitnami/spark:3.5.3
- ports:
- - "${MASTER_UI_PORT:-8080}:8080"
- - "${MASTER_PORT:-7077}:7077"
- - "${UI_PORT:-4040}:4040"
- environment:
- - SPARK_MODE=master
- - SPARK_RPC_AUTHENTICATION_ENABLED=no
- - SPARK_RPC_ENCRYPTION_ENABLED=no
- - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
- - SPARK_SSL_ENABLED=no
- - SPARK_PUBLIC_DNS=localhost
- volumes:
- - type: bind
- source: ./spark-defaults.conf
- target: /opt/bitnami/spark/conf/spark-defaults.conf
- - type: bind
- source: $PPL_JAR
- target: /opt/bitnami/spark/jars/ppl-spark-integration.jar
-
- spark-worker:
- image: bitnami/spark:3.5.3
- environment:
- - SPARK_MODE=worker
- - SPARK_MASTER_URL=spark://spark:7077
- - SPARK_WORKER_MEMORY=${WORKER_MEMORY:-1G}
- - SPARK_WORKER_CORES=${WORKER_CORES:-1}
- - SPARK_RPC_AUTHENTICATION_ENABLED=no
- - SPARK_RPC_ENCRYPTION_ENABLED=no
- - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no
- - SPARK_SSL_ENABLED=no
- - SPARK_PUBLIC_DNS=localhost
- volumes:
- - type: bind
- source: ./spark-defaults.conf
- target: /opt/bitnami/spark/conf/spark-defaults.conf
- - type: bind
- source: $PPL_JAR
- target: /opt/bitnami/spark/jars/ppl-spark-integration.jar
diff --git a/docker/apache-spark-sample/spark-defaults.conf b/docker/apache-spark-sample/spark-defaults.conf
deleted file mode 100644
index 47fdaae03..000000000
--- a/docker/apache-spark-sample/spark-defaults.conf
+++ /dev/null
@@ -1,29 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# Default system properties included when running spark-submit.
-# This is useful for setting default environmental settings.
-
-# Example:
-# spark.master spark://master:7077
-# spark.eventLog.enabled true
-# spark.eventLog.dir hdfs://namenode:8021/directory
-# spark.serializer org.apache.spark.serializer.KryoSerializer
-# spark.driver.memory 5g
-# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
-spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions
-spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog
diff --git a/docker/spark-emr-sample/.env b/docker/spark-emr-sample/.env
deleted file mode 100644
index a717532a4..000000000
--- a/docker/spark-emr-sample/.env
+++ /dev/null
@@ -1 +0,0 @@
-PPL_JAR=../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar
diff --git a/docker/spark-emr-sample/docker-compose.yml b/docker/spark-emr-sample/docker-compose.yml
deleted file mode 100644
index d0da9f166..000000000
--- a/docker/spark-emr-sample/docker-compose.yml
+++ /dev/null
@@ -1,17 +0,0 @@
-services:
- spark-emr:
- image: public.ecr.aws/emr-serverless/spark/emr-7.5.0:20241125
- volumes:
- - type: bind
- source: ./logging-conf
- target: /var/loggingConfiguration/spark
- - type: bind
- source: ../spark-sample-app/target/scala-2.12
- target: /app
- - type: bind
- source: ./spark-conf
- target: /etc/spark/conf
- - type: bind
- source: ${PPL_JAR}
- target: /usr/lib/spark/jars/ppl-spark-integration.jar
- command: driver --class MyApp /app/myapp_2.12-1.0.jar
diff --git a/docker/spark-emr-sample/logging-conf/run-adot-collector.sh b/docker/spark-emr-sample/logging-conf/run-adot-collector.sh
deleted file mode 100644
index 0873413aa..000000000
--- a/docker/spark-emr-sample/logging-conf/run-adot-collector.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-# Do nothing as default logging is sufficient
diff --git a/docker/spark-emr-sample/logging-conf/run-fluentd-spark.sh b/docker/spark-emr-sample/logging-conf/run-fluentd-spark.sh
deleted file mode 100644
index 0873413aa..000000000
--- a/docker/spark-emr-sample/logging-conf/run-fluentd-spark.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#!/bin/bash
-
-# Do nothing as default logging is sufficient
diff --git a/docker/spark-emr-sample/spark-conf/hive-site.xml b/docker/spark-emr-sample/spark-conf/hive-site.xml
deleted file mode 100644
index f0dc50e1e..000000000
--- a/docker/spark-emr-sample/spark-conf/hive-site.xml
+++ /dev/null
@@ -1,25 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- hive.metastore.connect.retries
- 15
-
-
\ No newline at end of file
diff --git a/docker/spark-emr-sample/spark-conf/log4j2.properties b/docker/spark-emr-sample/spark-conf/log4j2.properties
deleted file mode 100644
index 27ff7047f..000000000
--- a/docker/spark-emr-sample/spark-conf/log4j2.properties
+++ /dev/null
@@ -1,74 +0,0 @@
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# This property will be overridden for JVMs running inside YARN containers.
-# Other log4j configurations may reference the property, for example, in order to
-# cause a log file to appear in the usual log directory for the YARN container,
-# so that LogPusher will upload it to S3. The following provides a default value
-# to be used for this property such that logs are still written to a valid location
-# even for Spark processes run *outside* of a YARN container (e.g., a Spark
-# driver run in client deploy-mode).
-spark.yarn.app.container.log.dir=/var/log/spark/user/${user.name}
-
-# Set everything to be logged to the console
-rootLogger.level = info
-rootLogger.appenderRef.stdout.ref = console
-
-appender.console.type = Console
-appender.console.name = console
-appender.console.target = SYSTEM_ERR
-appender.console.layout.type = PatternLayout
-appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
-
-# Set the default spark-shell/spark-sql log level to WARN. When running the
-# spark-shell/spark-sql, the log level for these classes is used to overwrite
-# the root logger's log level, so that the user can have different defaults
-# for the shell and regular Spark apps.
-logger.repl.name = org.apache.spark.repl.Main
-logger.repl.level = warn
-
-logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver
-logger.thriftserver.level = warn
-
-# Settings to quiet third party logs that are too verbose
-logger.jetty1.name = org.sparkproject.jetty
-logger.jetty1.level = warn
-logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle
-logger.jetty2.level = error
-logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper
-logger.replexprTyper.level = info
-logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter
-logger.replSparkILoopInterpreter.level = info
-logger.parquet1.name = org.apache.parquet
-logger.parquet1.level = error
-logger.parquet2.name = parquet
-logger.parquet2.level = error
-logger.hudi.name = org.apache.hudi
-logger.hudi.level = warn
-
-# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
-logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler
-logger.RetryingHMSHandler.level = fatal
-logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry
-logger.FunctionRegistry.level = error
-
-# For deploying Spark ThriftServer
-# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805
-appender.console.filter.1.type = RegexFilter
-appender.console.filter.1.regex = .*Thrift error occurred during processing of message.*
-appender.console.filter.1.onMatch = deny
-appender.console.filter.1.onMismatch = neutral
\ No newline at end of file
diff --git a/docker/spark-emr-sample/spark-conf/metrics.properties b/docker/spark-emr-sample/spark-conf/metrics.properties
deleted file mode 100644
index e69de29bb..000000000
diff --git a/docker/spark-emr-sample/spark-conf/spark-defaults.conf b/docker/spark-emr-sample/spark-conf/spark-defaults.conf
deleted file mode 100644
index 0a5dabe7d..000000000
--- a/docker/spark-emr-sample/spark-conf/spark-defaults.conf
+++ /dev/null
@@ -1,65 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-spark.driver.extraClassPath /usr/lib/livy/rsc-jars/*:/usr/lib/livy/repl_2.12-jars/*:/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/goodies/lib/emr-serverless-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/usr/share/aws/redshift/jdbc/RedshiftJDBC.jar:/usr/share/aws/redshift/spark-redshift/lib/*:/usr/share/aws/iceberg/lib/iceberg-emr-common.jar:/usr/share/aws/iceberg/lib/iceberg-spark3-runtime.jar
-spark.driver.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native
-spark.executor.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/goodies/lib/emr-serverless-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/usr/share/aws/redshift/jdbc/RedshiftJDBC.jar:/usr/share/aws/redshift/spark-redshift/lib/*:/usr/share/aws/iceberg/lib/iceberg-emr-common.jar:/usr/share/aws/iceberg/lib/iceberg-spark3-runtime.jar
-spark.executor.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native
-spark.eventLog.enabled true
-spark.eventLog.dir file:///var/log/spark/apps
-spark.history.fs.logDirectory file:///var/log/spark/apps
-spark.history.ui.port 18080
-spark.blacklist.decommissioning.enabled true
-spark.blacklist.decommissioning.timeout 1h
-spark.resourceManager.cleanupExpiredHost true
-spark.stage.attempt.ignoreOnDecommissionFetchFailure true
-spark.decommissioning.timeout.threshold 20
-spark.files.fetchFailure.unRegisterOutputOnHost true
-spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem 2
-spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem true
-spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds 2000
-spark.sql.parquet.output.committer.class com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter
-spark.sql.parquet.fs.optimized.committer.optimization-enabled true
-spark.sql.emr.internal.extensions com.amazonaws.emr.spark.EmrSparkSessionExtensions
-spark.executor.memory 14G
-spark.executor.cores 4
-spark.driver.memory 14G
-spark.driver.cores 4
-spark.executor.defaultJavaOptions -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70 -XX:OnOutOfMemoryError='kill -9 %p'
-spark.driver.defaultJavaOptions -XX:OnOutOfMemoryError='kill -9 %p'
-spark.hadoop.mapreduce.output.fs.optimized.committer.enabled true
-
-spark.master custom:emr-serverless
-spark.submit.deployMode client
-spark.submit.customResourceManager.submit.class org.apache.spark.deploy.emrserverless.submit.EmrServerlessClientApplication
-spark.hadoop.fs.defaultFS file:///
-spark.dynamicAllocation.enabled true
-spark.dynamicAllocation.shuffleTracking.enabled true
-spark.hadoop.fs.s3.customAWSCredentialsProvider com.amazonaws.auth.DefaultAWSCredentialsProviderChain
-spark.authenticate true
-spark.ui.enabled false
-spark.ui.custom.executor.log.url /logs/{{CONTAINER_ID}}/{{FILE_NAME}}.gz
-
-spark.emr-serverless.client.create.batch.size 100
-spark.emr-serverless.client.describe.batch.size 100
-spark.emr-serverless.client.release.batch.size 100
-spark.dynamicAllocation.initialExecutors 3
-spark.dynamicAllocation.minExecutors 0
-spark.executor.instances 3
-spark.hadoop.fs.s3a.aws.credentials.provider software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider
-spark.sql.hive.metastore.sharedPrefixes software.amazon.awssdk.services.dynamodb
-spark.sql.legacy.createHiveTableByDefault false
-spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions
-spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog
diff --git a/docker/spark-emr-sample/spark-conf/spark-env.sh b/docker/spark-emr-sample/spark-conf/spark-env.sh
deleted file mode 100644
index a40f294b6..000000000
--- a/docker/spark-emr-sample/spark-conf/spark-env.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env bash
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-export SPARK_HOME=${SPARK_HOME:-/usr/lib/spark}
-export SPARK_LOG_DIR=${SPARK_LOG_DIR:-/var/log/spark}
-export HADOOP_HOME=${HADOOP_HOME:-/usr/lib/hadoop}
-export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf}
-export HIVE_CONF_DIR=${HIVE_CONF_DIR:-/etc/hive/conf}
-
-export SPARK_MASTER_PORT=7077
-export SPARK_MASTER_IP=$STANDALONE_SPARK_MASTER_HOST
-export SPARK_MASTER_WEBUI_PORT=8080
-
-export SPARK_WORKER_DIR=${SPARK_WORKER_DIR:-/var/run/spark/work}
-export SPARK_WORKER_PORT=7078
-export SPARK_WORKER_WEBUI_PORT=8081
-
-export HIVE_SERVER2_THRIFT_BIND_HOST=0.0.0.0
-export HIVE_SERVER2_THRIFT_PORT=10001
-
-
-export SPARK_DAEMON_JAVA_OPTS="$SPARK_DAEMON_JAVA_OPTS -XX:OnOutOfMemoryError='kill -9 %p'"
-export PYSPARK_PYTHON=${PYSPARK_PYTHON:-/usr/bin/python3}
-export PYSPARK_DRIVER_PYTHON=${PYSPARK_DRIVER_PYTHON:-/usr/bin/python3}
-
-export AWS_STS_REGIONAL_ENDPOINTS=regional
diff --git a/docker/spark-sample-app/build.sbt b/docker/spark-sample-app/build.sbt
deleted file mode 100644
index ea49bfd20..000000000
--- a/docker/spark-sample-app/build.sbt
+++ /dev/null
@@ -1,8 +0,0 @@
-name := "MyApp"
-
-version := "1.0"
-
-scalaVersion := "2.12.20"
-
-libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.5.3"
-
diff --git a/docker/spark-sample-app/src/main/scala/MyApp.scala b/docker/spark-sample-app/src/main/scala/MyApp.scala
deleted file mode 100644
index 6e2171c41..000000000
--- a/docker/spark-sample-app/src/main/scala/MyApp.scala
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Copyright OpenSearch Contributors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-import org.apache.spark.sql.SparkSession
-
-object MyApp {
- def main(args: Array[String]): Unit = {
- var spark = SparkSession.builder()
- .master("local[1]")
- .appName("MyApp")
- .getOrCreate();
-
- println("APP Name :" + spark.sparkContext.appName);
- println("Deploy Mode :" + spark.sparkContext.deployMode);
- println("Master :" + spark.sparkContext.master);
-
- spark.sql("CREATE table foo (id int, name varchar(100))").show()
- println(">>> Table created")
- spark.sql("SELECT * FROM foo").show()
- println(">>> SQL query of table completed")
-
- spark.sql("source=foo | fields id").show()
- println(">>> PPL query of table completed")
- }
-}
diff --git a/docs/index.md b/docs/index.md
index abc801bde..82c147de2 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -546,7 +546,6 @@ In the index mapping, the `_meta` and `properties`field stores meta and schema i
- `spark.flint.index.checkpointLocation.rootDir`: default is None. Flint will create a default checkpoint location in format of '//' to isolate checkpoint data.
- `spark.flint.index.checkpoint.mandatory`: default is true.
- `spark.datasource.flint.socket_timeout_millis`: default value is 60000.
-- `spark.datasource.flint.request.completionDelayMillis`: Time to wait in milliseconds after request is complete. Applied after index creation. Default value is 2000 if using aoss service, otherwise 0.
- `spark.flint.monitor.initialDelaySeconds`: Initial delay in seconds before starting the monitoring task. Default value is 15.
- `spark.flint.monitor.intervalSeconds`: Interval in seconds for scheduling the monitoring task. Default value is 60.
- `spark.flint.monitor.maxErrorCount`: Maximum number of consecutive errors allowed before stopping the monitoring task. Default value is 5.
@@ -578,10 +577,6 @@ The following table define the data type mapping between Flint data type and Spa
* Spark data types VarcharType(length) and CharType(length) are both currently mapped to Flint data
type *keyword*, dropping their length property. On the other hand, Flint data type *keyword* only
maps to StringType.
-* Spark data type MapType is mapped to an empty OpenSearch object. The inner fields then rely on
- dynamic mapping. On the other hand, Flint data type *object* only maps to StructType.
-* Spark data type DecimalType is mapped to an OpenSearch double. On the other hand, Flint data type
- *double* only maps to DoubleType.
Unsupported Spark data types:
* DecimalType
diff --git a/docs/load_geoip_data.scala b/docs/load_geoip_data.scala
deleted file mode 100644
index 1540dbfb1..000000000
--- a/docs/load_geoip_data.scala
+++ /dev/null
@@ -1,440 +0,0 @@
-/*
- * Copyright OpenSearch Contributors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-import java.io.BufferedReader
-import java.io.FileReader
-import java.io.PrintStream
-import java.math.BigInteger
-import scala.collection.mutable.ListBuffer
-
-var ipv4NodeCount = 0
-var ipv6NodeCount = 0
-var ipv4NodeOutputCount = 0
-var ipv6NodeOutputCount = 0
-
-/* Create a binary tree based on the bits of the start IP address of the subnets. Only use the
- first bits needed for the netmask. For example with a subnet of "192.168.2.0/24", only use the
- first 24 bits.
-
- If a node for a subnet has children, then there is an overlap that must be corrected. To correct
- an overlap, make sure that both children of the node exist and remove the subnet for the current
- node. Finally check the child nodes for overlapping subnets and continue.
- */
-class TreeNode(var ipAddressBytes: Array[Byte], var netmask: Int, var isIPv4: Boolean, var lineRemainder: String) {
- var falseChild: TreeNode = null
- var trueChild: TreeNode = null
-
- def maxNetmask: Integer = if (isIPv4) 32 else 128
-
- // Add a new node to the tree in the correct position
- def addNode(nodeToAdd: TreeNode): Unit = {
- if (netmask >= nodeToAdd.netmask || netmask == maxNetmask) {
- return
- }
-
- var byteIndex = netmask / 8
- var bitValue = (nodeToAdd.ipAddressBytes(byteIndex) & (1 << (7 - (netmask % 8)))) > 0
-
- if (netmask + 1 == nodeToAdd.netmask) {
- if (bitValue) {
- trueChild = nodeToAdd
- } else {
- falseChild = nodeToAdd
- }
- } else {
- var nextChild: TreeNode = null
- if (bitValue) {
- nextChild = trueChild
- if (trueChild == null) {
- nextChild = new TreeNode(null, netmask + 1, isIPv4, null)
- trueChild = nextChild
- }
- } else {
- nextChild = falseChild
- if (falseChild == null) {
- nextChild = new TreeNode(null, netmask + 1, isIPv4, null)
- falseChild = nextChild
- }
- }
-
- nextChild.addNode(nodeToAdd)
- }
-
- return
- }
-
- def haveOverlap(): Boolean = falseChild != null || trueChild != null
-
- // Convert the IP address to a string. For IPv6, this is more complicated, since it may
- // need to be reduced.
- def ipAddressString(): String = {
- if (isIPv4) {
- return ipAddressBytes.map(v => 255 & v).mkString(".")
- } else {
- var allZeroes = true
- for (b <- ipAddressBytes) {
- if (b != 0) {
- allZeroes = false
- }
- }
-
- if (allZeroes) {
- return "::"
- }
-
- var zeroes: ListBuffer[(Int, Int)] = ListBuffer()
- var zeroesStart = -1
- var zeroesStartIndex = -1
- for (i <- 0 to 7) {
- if (ipAddressBytes(i * 2) == 0 && ipAddressBytes(i * 2 + 1) == 0) {
- if (zeroesStart == -1) {
- zeroesStart = i
- zeroesStartIndex = zeroes.length
- zeroes = zeroes :+ (i, 1)
- } else {
- var existingTuple = zeroes(zeroesStartIndex)
- zeroes.update(zeroesStartIndex, (existingTuple._1, 1 + existingTuple._2))
- }
- } else {
- zeroesStart = -1
- zeroesStartIndex = -1
- }
- }
-
- var longestZeroesIndex = -1
- var longestZeroesLength = 0
- for (v <- zeroes) {
- if (v._2 >= longestZeroesLength) {
- longestZeroesLength = v._2
- longestZeroesIndex = v._1
- }
- }
-
- var fullIpAddress: Array[String] = Array.fill(8){null}
- for (i <- 0 to 7) {
- var strValue = (((255 & ipAddressBytes(i * 2)) << 8) + (255 & ipAddressBytes(i * 2 + 1))).toHexString
- fullIpAddress(i) = strValue
- }
-
- if (longestZeroesIndex == -1) {
- return fullIpAddress.mkString(":")
- } else {
- var ipPartsStart = fullIpAddress.slice(0, longestZeroesIndex)
- var ipPartsEnd = fullIpAddress.slice(longestZeroesIndex + longestZeroesLength, 8)
- return ipPartsStart.mkString(":") + "::" + ipPartsEnd.mkString(":")
- }
- }
- }
-
- def getStart(): BigInteger = new BigInteger(ipAddressBytes)
-
- def getEnd(): BigInteger = {
- var valueToAdd = new BigInteger(Array.fill(maxNetmask / 8){0.toByte})
- if (netmask < maxNetmask) {
- valueToAdd = valueToAdd.flipBit(maxNetmask - netmask)
- valueToAdd = valueToAdd.subtract(new BigInteger("1"))
- }
- return getStart().add(valueToAdd)
- }
-
- def valueToByteArray(value: BigInteger): Array[Byte] = {
- var fullArray = Array.fill(maxNetmask / 8){0.toByte}
- var valueArray = value.toByteArray()
- valueArray.copyToArray(fullArray, (maxNetmask / 8) - valueArray.length, valueArray.length)
- return fullArray
- }
-
- def incrementNodeCount(): Unit = {
- if (isIPv4) {
- ipv4NodeCount += ipv4NodeCount
- } else {
- ipv6NodeCount += ipv6NodeCount
- }
- }
-
- // Split a node. Make sure that both children exist and remove the subnet for the current node.
- def split(): Unit = {
- if (ipAddressBytes == null) {
- return
- }
-
- var ipAddressStr = ipAddressString()
- println(s">>> Splitting IP: $ipAddressStr")
-
- if (falseChild == null) {
- falseChild = new TreeNode(ipAddressBytes, netmask + 1, isIPv4, lineRemainder)
- } else if (falseChild.ipAddressBytes == null) {
- falseChild.ipAddressBytes = ipAddressBytes
- falseChild.lineRemainder = lineRemainder
- }
-
- if (trueChild == null) {
- var valueStart = falseChild.getEnd().add(new BigInteger("1"))
- var startArray = valueToByteArray(valueStart)
- trueChild = new TreeNode(startArray, netmask + 1, isIPv4, lineRemainder)
- } else if (trueChild.ipAddressBytes == null) {
- var valueStart = falseChild.getEnd().add(new BigInteger("1"))
- var startArray = valueToByteArray(valueStart)
- trueChild.ipAddressBytes = startArray
- trueChild.lineRemainder = lineRemainder
- }
-
- ipAddressBytes = null
- lineRemainder = null
-
- return
- }
-
- def fixTree(): Unit = {
- if (haveOverlap()) {
- split()
- }
-
- if (falseChild != null) {
- falseChild.fixTree()
- }
-
- if (trueChild != null) {
- trueChild.fixTree()
- }
- }
-
- def printTree(outStream: PrintStream, tenPercentCount: Int): Unit = {
- if (ipAddressBytes != null) {
- outStream.print(ipAddressString())
- outStream.print("/")
- outStream.print(netmask.toString)
- outStream.print(",")
- outStream.print(lineRemainder)
- outStream.print(",")
- outStream.print(getStart().toString())
- outStream.print(",")
- outStream.print(getEnd().toString())
- outStream.print(",")
- outStream.println(isIPv4.toString)
-
- var currentNodeCount = if (isIPv4) ipv4NodeOutputCount else ipv6NodeOutputCount
- if (currentNodeCount % tenPercentCount == 0) {
- print((currentNodeCount * 10 / tenPercentCount).toString + "%..")
- }
-
- if (isIPv4) {
- ipv4NodeOutputCount += 1
- } else {
- ipv6NodeOutputCount += 1
- }
- }
-
- if (falseChild != null) {
- falseChild.printTree(outStream, tenPercentCount)
- }
- if (trueChild != null) {
- trueChild.printTree(outStream, tenPercentCount)
- }
- }
-}
-
-// Create a node for an IPv4 entry
-def createIPv4TreeNode(fullLine: String): TreeNode = {
- var charIndex = fullLine.indexOf(",")
- var subnet = fullLine.substring(0, charIndex)
- var lineRemainder = fullLine.substring(charIndex + 1)
-
- charIndex = subnet.indexOf("/")
- var ipAddressStr = subnet.substring(0, charIndex)
- var netmask = subnet.substring(charIndex + 1).toInt
-
- var addrParts = ipAddressStr.split("\\.")
- var bytes = Array[Byte](
- addrParts(0).toInt.toByte,
- addrParts(1).toInt.toByte,
- addrParts(2).toInt.toByte,
- addrParts(3).toInt.toByte
- )
-
- return new TreeNode(bytes, netmask, true, lineRemainder)
-}
-
-// Create a node for an IPv6 entry
-def createIPv6TreeNode(fullLine: String): TreeNode = {
- var charIndex = fullLine.indexOf(",")
- var subnet = fullLine.substring(0, charIndex)
- var lineRemainder = fullLine.substring(charIndex + 1)
-
- charIndex = subnet.indexOf("/")
- var ipAddressStr = subnet.substring(0, charIndex)
- var netmask = subnet.substring(charIndex + 1).toInt
-
- var bytes: Array[Byte] = null
- charIndex = ipAddressStr.indexOf("::")
-
- if (charIndex == -1) {
- var values = ipAddressStr.split(":").map(x => Integer.parseInt(x, 16))
- bytes = Array.fill(16){0.toByte}
- for (i <- 0 to 7) {
- bytes(i * 2) = (values(i) >> 8).toByte
- bytes(i * 2 + 1) = (values(i) & 255).toByte
- }
- } else if ("::" == ipAddressStr) {
- bytes = Array.fill(16){0.toByte}
- } else {
- if (charIndex == 0) {
- var values = ipAddressStr.substring(2).split(":").map(x => Integer.parseInt(x, 16))
- bytes = Array.fill(16){0.toByte}
- for (i <- 8 - values.length to 7) {
- var valuesIndex = i - 8 + values.length
- bytes(i * 2) = (values(valuesIndex) >> 8).toByte
- bytes(i * 2 + 1) = (values(valuesIndex) & 255).toByte
- }
- } else if (charIndex == ipAddressStr.length - 2) {
- var values = ipAddressStr.substring(0, ipAddressStr.length - 2).split(":").map(x => Integer.parseInt(x, 16))
- bytes = Array.fill(16){0.toByte}
- for (i <- 0 to values.length - 1) {
- bytes(i * 2) = (values(i) >> 8).toByte
- bytes(i * 2 + 1) = (values(i) & 255).toByte
- }
- } else {
- var startValues = ipAddressStr.substring(0, charIndex).split(":").map(x => Integer.parseInt(x, 16))
- var endValues = ipAddressStr.substring(charIndex + 2).split(":").map(x => Integer.parseInt(x, 16))
- bytes = Array.fill(16){0.toByte}
- for (i <- 0 to startValues.length - 1) {
- bytes(i * 2) = (startValues(i) >> 8).toByte
- bytes(i * 2 + 1) = (startValues(i) & 255).toByte
- }
- for (i <- 8 - endValues.length to 7) {
- var valuesIndex = i - 8 + endValues.length
- bytes(i * 2) = (endValues(valuesIndex) >> 8).toByte
- bytes(i * 2 + 1) = (endValues(valuesIndex) & 255).toByte
- }
- }
- }
-
- return new TreeNode(bytes, netmask, false, lineRemainder)
-}
-
-def createTreeNode(fullLine: String): TreeNode = {
- var charIndex = fullLine.indexOf(",")
- var subnet = fullLine.substring(0, charIndex)
- if (subnet.indexOf(':') > -1) {
- return createIPv6TreeNode(fullLine)
- } else {
- return createIPv4TreeNode(fullLine)
- }
-}
-
-var header: String = null
-def readSubnets(fileName: String, ipv4Root: TreeNode, ipv6Root: TreeNode): Unit = {
- var reader = new BufferedReader(new FileReader(fileName))
- header = reader.readLine()
-
- var line = reader.readLine()
- while (line != null) {
- var newNode = createTreeNode(line)
- if (newNode.isIPv4) {
- ipv4Root.addNode(newNode)
- ipv4NodeCount += 1
- } else {
- ipv6Root.addNode(newNode)
- ipv6NodeCount += 1
- }
-
- line = reader.readLine()
- }
-
- reader.close()
-}
-
-def writeSubnets(fileName: String, ipv4Root: TreeNode, ipv6Root: TreeNode): Unit = {
- var outStream = new PrintStream(fileName)
- outStream.print(header)
- outStream.print(",ip_range_start,ip_range_end,ipv4")
- outStream.print("\r\n")
-
- println("Writing IPv4 data")
- ipv4NodeOutputCount = 0
- ipv4Root.printTree(outStream, (ipv4NodeCount / 10).floor.toInt)
- println()
-
- println("Writing IPv6 data")
- ipv6NodeOutputCount = 0
- ipv6Root.printTree(outStream, (ipv6NodeCount / 10).floor.toInt)
- println()
-
- outStream.close()
-}
-
-// Create the table in Spark
-def createTable(fileName: String, tableName: String): Unit = {
- try {
- var sparkSessionClass = Class.forName("org.apache.spark.sql.SparkSession")
- var activeSessionMethod = sparkSessionClass.getMethod("active")
- var sparkSession = activeSessionMethod.invoke(sparkSessionClass)
-
- var readMethod = sparkSessionClass.getMethod("read")
- var dataFrameReader = readMethod.invoke(sparkSession)
-
- var dataFrameReaderClass = Class.forName("org.apache.spark.sql.DataFrameReader")
- var formatMethod = dataFrameReaderClass.getMethod("format", classOf[java.lang.String])
- dataFrameReader = formatMethod.invoke(dataFrameReader, "csv")
-
- var optionMethod = dataFrameReaderClass.getMethod("option", classOf[java.lang.String], classOf[java.lang.String])
- dataFrameReader = optionMethod.invoke(dataFrameReader, "inferSchema", "true")
- dataFrameReader = optionMethod.invoke(dataFrameReader, "header", "true")
-
- var loadMethod = dataFrameReaderClass.getMethod("load", classOf[java.lang.String])
- var dataset = loadMethod.invoke(dataFrameReader, fileName)
-
- var datasetClass = Class.forName("org.apache.spark.sql.Dataset")
- var writeMethod = datasetClass.getMethod("write")
- var dataFrameWriter = writeMethod.invoke(dataset)
-
- var dataFrameWriterClass = Class.forName("org.apache.spark.sql.DataFrameWriter")
- var saveAsTableMethod = dataFrameWriterClass.getMethod("saveAsTable", classOf[java.lang.String])
- saveAsTableMethod.invoke(dataFrameWriter, tableName)
- } catch {
- case e: Exception => {
- println("Unable to load data into table")
- e.printStackTrace()
- }
- }
-}
-
-// Sanitize the data and import it into a Spark table
-def cleanAndImport(inputFile: String, outputFile: String, tableName: String): Unit = {
- if (tableName != null) {
- try {
- Class.forName("org.apache.spark.sql.SparkSession")
- } catch {
- case e: ClassNotFoundException => {
- println("Must run in Spark CLI to create the Spark table")
- return
- }
- }
- }
-
- println("Loading data")
- var ipv4Root = new TreeNode(null, 0, true, null)
- var ipv6Root = new TreeNode(null, 0, false, null)
- readSubnets(inputFile, ipv4Root, ipv6Root)
-
- println("Fixing overlapping subnets")
- ipv4Root.fixTree()
- ipv6Root.fixTree()
-
- println("Writing data to file")
- writeSubnets(outputFile, ipv4Root, ipv6Root)
-
- if (tableName != null) {
- println("Creating and populating Spark table")
- createTable(outputFile, tableName)
- }
-
- println("Done")
-}
-
-var FILE_PATH_TO_INPUT_CSV: String = "/replace/this/value"
-var FILE_PATH_TO_OUTPUT_CSV: String = "/replace/this/value"
-var TABLE_NAME: String = null
-var result = cleanAndImport(FILE_PATH_TO_INPUT_CSV, FILE_PATH_TO_OUTPUT_CSV, TABLE_NAME)
diff --git a/docs/opensearch-geoip.md b/docs/opensearch-geoip.md
deleted file mode 100644
index cd262e187..000000000
--- a/docs/opensearch-geoip.md
+++ /dev/null
@@ -1,90 +0,0 @@
-# OpenSearch Geographic IP Location Data
-
-## Overview
-
-OpenSearch has PPL functions for looking up the geographic location of IP addresses. In order
-to use these functions, a table needs to be created containing the geographic location
-information.
-
-## How to Create Geographic Location Index
-
-A script has been created that can cleanup and augment a CSV file that contains geographic
-location information for IP addresses ranges. The CSV file is expected to have the following
-columns:
-
-| Column Name | Description |
-|------------------|---------------------------------------------------------------------------------------------------------|
-| cidr | IP address subnet in format `IP_ADDRESS/NETMASK` (ex. `192.168.0.0/24`). IP address can be IPv4 or IPv6 |
-| country_iso_code | ISO code of the country where the IP address subnet is located |
-| country_name | Name of the country where the IP address subnet is located |
-| continent_name | Name of the continent where the IP address subent is located |
-| region_iso_code | ISO code of the region where the IP address subnet is located |
-| region_name | Name of the region where the IP address subnet is located |
-| city_name | Name of the city where the IP address subnet is located |
-| time_zone | Time zone where the IP address subnet is located |
-| location | Latitude and longitude where the IP address subnet is located |
-
-The script will cleanup the data by splitting IP address subnets so that an IP address can only be in at most one subnet.
-
-The data is augmented by adding 3 fields.
-
-| Column Name | Description |
-|----------------|--------------------------------------------------------------------|
-| ip_range_start | An integer value used to determine if an IP address is in a subnet |
-| ip_range_end | An integer value used to determine if an IP address is in a subnet |
-| ipv4 | A boolean value, `true` if the IP address subnet is in IPv4 format |
-
-## Run the Script
-
-1. Create a copy of the scala file `load_geoip_data.scala`
-2. Edit the copy of the file `load_geoip_data.scala`
- There are three variables that need to be updated.
- 1. `FILE_PATH_TO_INPUT_CSV` - the full path to the CSV file to load
- 2. `FILE_PATH_TO_OUTPUT_CSV` - the full path of the CSV file to write the sanitized data to
- 3. `TABLE_NAME` - name of the index to create in OpenSearch. No table is created if this is null
-4. Save the file
-5. Run the Apache Spark CLI and connect to the database
-6. Load the Scala script
- ```scala
- :load FILENAME
- ```
- Replace `FILENAME` with the full path to the Scala script.
-
-## Notes for EMR
-
-With EMR it is necessary to load the data from an S3 object. Follow the instructions for
-**Run the Script**, but make sure that `TABLE_NAME` is set to `null`. Upload the
-`FILE_PATH_TO_OUTPUT_CSV` to S3.
-
-## End-to-End
-
-How to download a sample data GeoIP location data set, clean it up and import it into a
-Spark table.
-
-1. Use a web browser to download the [data set Zip file](https://geoip.maps.opensearch.org/v1/geolite2-city/data/geolite2-city_1732905911000.zip)
-2. Extract the Zip file
-3. Copy the file `geolite2-City.csv` to the computer where you run `spark-shell`
-4. Copy the file file `load_geoip_data.scala` to the computer where you run `spark-shell`
-5. Connect to the computer where you run `spark-shell`
-6. Change to the directory containing `geolite2-City.csv` and `load_geoip_data.scala`
-7. Update the `load_geoip_data.scala` file to specify the CSV files to read and write. Also update
- it to specify the Spark table to create (`geo_ip_data` in this case).
- ```
- sed -i \
- -e "s#^var FILE_PATH_TO_INPUT_CSV: String =.*#var FILE_PATH_TO_INPUT_CSV: String = \"${PWD}/geolite2-City.csv\"#" \
- load_geoip_data.scala
- sed -i \
- -e "s#^var FILE_PATH_TO_OUTPUT_CSV: String = .*#var FILE_PATH_TO_OUTPUT_CSV: String = \"${PWD}/geolite2-City-fixed.csv\"#" \
- load_geoip_data.scala
- sed -i \
- -e 's#^var TABLE_NAME: String = .*#var TABLE_NAME: String = "geo_ip_data"#' \
- load_geoip_data.scala
- ```
-8. Run `spark-shell`
- ```
- spark-shell
- ```
-9. Load and run the `load_geoip_data.scala` script
- ```
- :load load_geoip_data.scala
- ```
diff --git a/docs/ppl-lang/PPL-Example-Commands.md b/docs/ppl-lang/PPL-Example-Commands.md
index 5a61992de..7766c3b50 100644
--- a/docs/ppl-lang/PPL-Example-Commands.md
+++ b/docs/ppl-lang/PPL-Example-Commands.md
@@ -118,7 +118,6 @@ Assumptions: `a`, `b`, `c` are existing fields in `table`
- `source = table | eval r = coalesce(a, b, c) | fields r`
- `source = table | eval e = isempty(a) | fields e`
- `source = table | eval e = isblank(a) | fields e`
-- `source = table | eval e = cast(a as timestamp) | fields e`
- `source = table | eval f = case(a = 0, 'zero', a = 1, 'one', a = 2, 'two', a = 3, 'three', a = 4, 'four', a = 5, 'five', a = 6, 'six', a = 7, 'se7en', a = 8, 'eight', a = 9, 'nine')`
- `source = table | eval f = case(a = 0, 'zero', a = 1, 'one' else 'unknown')`
- `source = table | eval f = case(a = 0, 'zero', a = 1, 'one' else concat(a, ' is an incorrect binary digit'))`
@@ -141,7 +140,6 @@ Assumptions: `a`, `b`, `c`, `d`, `e` are existing fields in `table`
Assumptions: `bridges`, `coor` are existing fields in `table`, and the field's types are `struct,?>` or `array>`
- `source = table | flatten bridges`
- `source = table | flatten coor`
-- `source = table | flatten coor as (altitude, latitude, longitude)`
- `source = table | flatten bridges | flatten coor`
- `source = table | fields bridges | flatten bridges`
- `source = table | fields country, bridges | flatten bridges | fields country, length | stats avg(length) as avg by country`
@@ -487,11 +485,4 @@ _- **Limitation: another command usage of (relation) subquery is in `appendcols`
> ppl-correlation-command is an experimental command - it may be removed in future versions
-#### **Cast**
-[See additional command details](functions/ppl-conversion.md)
-- `source = table | eval int_to_string = cast(1 as string) | fields int_to_string`
-- `source = table | eval int_to_string = cast(int_col as string), string_to_int = cast(string_col as integer) | fields int_to_string, string_to_int`
-- `source = table | eval cdate = CAST('2012-08-07' as date), ctime = cast('2012-08-07T08:07:06' as timestamp) | fields cdate, ctime`
-- `source = table | eval chained_cast = cast(cast("true" as boolean) as integer) | fields chained_cast`
-
---
diff --git a/docs/ppl-lang/functions/ppl-conversion.md b/docs/ppl-lang/functions/ppl-conversion.md
index 7d3535936..48e4106ca 100644
--- a/docs/ppl-lang/functions/ppl-conversion.md
+++ b/docs/ppl-lang/functions/ppl-conversion.md
@@ -7,21 +7,22 @@
`cast(expr as dateType)` cast the expr to dataType. return the value of dataType. The following conversion rules are used:
```
-+------------+--------+--------+---------+-------------+--------+
-| Src/Target | STRING | NUMBER | BOOLEAN | TIMESTAMP | DATE |
-+------------+--------+--------+---------+-------------+--------+
-| STRING | | Note1 | Note1 | TIMESTAMP() | DATE() |
-+------------+--------+--------+---------+-------------+--------+
-| NUMBER | Note1 | | v!=0 | N/A | N/A |
-+------------+--------+--------+---------+-------------+--------+
-| BOOLEAN | Note1 | v?1:0 | | N/A | N/A |
-+------------+--------+--------+---------+-------------+--------+
-| TIMESTAMP | Note1 | N/A | N/A | | DATE() |
-+------------+--------+--------+---------+-------------+--------+
-| DATE | Note1 | N/A | N/A | N/A | |
-+------------+--------+--------+---------+-------------+--------+
++------------+--------+--------+---------+-------------+--------+--------+
+| Src/Target | STRING | NUMBER | BOOLEAN | TIMESTAMP | DATE | TIME |
++------------+--------+--------+---------+-------------+--------+--------+
+| STRING | | Note1 | Note1 | TIMESTAMP() | DATE() | TIME() |
++------------+--------+--------+---------+-------------+--------+--------+
+| NUMBER | Note1 | | v!=0 | N/A | N/A | N/A |
++------------+--------+--------+---------+-------------+--------+--------+
+| BOOLEAN | Note1 | v?1:0 | | N/A | N/A | N/A |
++------------+--------+--------+---------+-------------+--------+--------+
+| TIMESTAMP | Note1 | N/A | N/A | | DATE() | TIME() |
++------------+--------+--------+---------+-------------+--------+--------+
+| DATE | Note1 | N/A | N/A | N/A | | N/A |
++------------+--------+--------+---------+-------------+--------+--------+
+| TIME | Note1 | N/A | N/A | N/A | N/A | |
++------------+--------+--------+---------+-------------+--------+--------+
```
-- `NUMBER` includes `INTEGER`, `LONG`, `FLOAT`, `DOUBLE`.
Cast to **string** example:
@@ -35,7 +36,7 @@ Cast to **string** example:
Cast to **number** example:
- os> source=people | eval `cbool` = CAST(true as integer), `cstring` = CAST('1' as integer) | fields `cbool`, `cstring`
+ os> source=people | eval `cbool` = CAST(true as int), `cstring` = CAST('1' as int) | fields `cbool`, `cstring`
fetched rows / total rows = 1/1
+---------+-----------+
| cbool | cstring |
@@ -45,13 +46,13 @@ Cast to **number** example:
Cast to **date** example:
- os> source=people | eval `cdate` = CAST('2012-08-07' as date), `ctimestamp` = CAST('2012-08-07 01:01:01' as timestamp) | fields `cdate`, `ctimestamp`
+ os> source=people | eval `cdate` = CAST('2012-08-07' as date), `ctime` = CAST('01:01:01' as time), `ctimestamp` = CAST('2012-08-07 01:01:01' as timestamp) | fields `cdate`, `ctime`, `ctimestamp`
fetched rows / total rows = 1/1
- +------------+---------------------+
- | cdate | ctimestamp |
- |------------+---------------------|
- | 2012-08-07 | 2012-08-07 01:01:01 |
- +------------+---------------------+
+ +------------+----------+---------------------+
+ | cdate | ctime | ctimestamp |
+ |------------+----------+---------------------|
+ | 2012-08-07 | 01:01:01 | 2012-08-07 01:01:01 |
+ +------------+----------+---------------------+
Cast function can be **chained**:
diff --git a/docs/ppl-lang/planning/ppl-geoip.md b/docs/ppl-lang/planning/ppl-geoip.md
new file mode 100644
index 000000000..f6bef8f34
--- /dev/null
+++ b/docs/ppl-lang/planning/ppl-geoip.md
@@ -0,0 +1,39 @@
+## geoip syntax proposal
+
+geoip function to add information about the geographical location of an IPv4 or IPv6 address
+
+1. **Proposed syntax**
+ - `... | eval geoinfo = geoip([datasource,] ipAddress [,properties])`
+ - generic syntax
+ - `... | eval geoinfo = geoip(ipAddress)`
+ - use the default geoip datasource
+ - `... | eval geoinfo = geoip("abc", ipAddress)`
+ - use the "abc" geoip datasource
+ - `... | eval geoinfo = geoip(ipAddress, "city,lat,lon")`
+ - use the default geoip datasource, retrieve only city, lat and lon
+ - `... | eval geoinfo = geoip("abc", ipAddress, "city,lat,lon")`
+ - use the "abc" geoip datasource, retrieve only city, lat and lon
+
+
+2. **Proposed wiring with the geoip database**
+ - Leverage the functionality of the ip2geo processor
+ - ip2geo processor configuration, functionality and code will be used
+ - Prerequisite for the geoip is that ip2geo processor is configured properly
+ - See https://opensearch.org/docs/latest/ingest-pipelines/processors/ip2geo/
+
+
+### New syntax definition in ANTLR
+
+```ANTLR
+
+// functions
+evalFunctionCall
+ : evalFunctionName LT_PRTHS functionArgs RT_PRTHS
+ | geoipFunction
+ ;
+
+geoipFunction
+ : GEOIP LT_PRTHS (datasource = functionArg COMMA)? ipAddress = functionArg (COMMA properties = stringLiteral)? RT_PRTHS
+ ;
+```
+
diff --git a/docs/ppl-lang/ppl-fillnull-command.md b/docs/ppl-lang/ppl-fillnull-command.md
index f204a5969..00064849c 100644
--- a/docs/ppl-lang/ppl-fillnull-command.md
+++ b/docs/ppl-lang/ppl-fillnull-command.md
@@ -17,7 +17,7 @@ The example show fillnull one field.
PPL query:
- os> source=logs | fields status_code | eval input=status_code | fillnull with 0 in status_code;
+ os> source=logs | fields status_code | eval input=status_code | fillnull value = 0 status_code;
| input | status_code |
|-------|-------------|
| 403 | 403 |
@@ -43,7 +43,7 @@ The example show fillnull applied to multiple fields.
PPL query:
- os> source=logs | fields request_path, timestamp | eval input_request_path=request_path, input_timestamp = timestamp | fillnull with '???' in request_path, timestamp;
+ os> source=logs | fields request_path, timestamp | eval input_request_path=request_path, input_timestamp = timestamp | fillnull value = '???' request_path, timestamp;
| input_request_path | input_timestamp | request_path | timestamp |
|--------------------|-----------------------|--------------|------------------------|
| /contact | NULL | /contact | ??? |
@@ -89,4 +89,4 @@ PPL query:
| /services | NULL | /services | 1970-01-01 00:00:00 |
| /home | 2023-10-01 10:45:00 | /home | 2023-10-01 10:45:00 |
| /services | 2023-10-01 11:00:00 | /services | 2023-10-01 11:00:00 |
-| NULL | 2023-10-01 10:35:00 | /error | 2023-10-01 10:35:00 |
+| NULL | 2023-10-01 10:35:00 | /error | 2023-10-01 10:35:00 |
\ No newline at end of file
diff --git a/docs/ppl-lang/ppl-flatten-command.md b/docs/ppl-lang/ppl-flatten-command.md
index 68b03e82e..4c1ae5d0d 100644
--- a/docs/ppl-lang/ppl-flatten-command.md
+++ b/docs/ppl-lang/ppl-flatten-command.md
@@ -7,10 +7,9 @@ Using `flatten` command to flatten a field of type:
### Syntax
-`flatten [As aliasSequence]`
+`flatten `
* field: to be flattened. The field must be of supported type.
-* aliasSequence: to be used as aliasSequence for the flattened-output fields. Better to put the aliasSequence in brace if there is more than one field.
### Test table
#### Schema
@@ -88,18 +87,4 @@ PPL query:
| 2024-09-13T12:00:00 | Prague | Czech Republic| 343 | Legion Bridge | 200 | 50.0755| 14.4378|
| 2024-09-13T12:00:00 | Budapest| Hungary | 375 | Chain Bridge | 96 | 47.4979| 19.0402|
| 2024-09-13T12:00:00 | Budapest| Hungary | 333 | Liberty Bridge | 96 | 47.4979| 19.0402|
-| 1990-09-13T12:00:00 | Warsaw | Poland | NULL | NULL | NULL | NULL | NULL |
-
-### Example 4: flatten with aliasSequence
-This example shows how to flatten with aliasSequence.
-PPL query:
- - `source=table | flatten coor as (altitude, latitude, longitude)`
-
-| \_time | bridges | city | country | altitude | latitude | longtitude |
-|---------------------|----------------------------------------------|---------|---------------|----------|----------|------------|
-| 2024-09-13T12:00:00 | [{801, Tower Bridge}, {928, London Bridge}] | London | England | 35 | 51.5074 | -0.1278 |
-| 2024-09-13T12:00:00 | [{232, Pont Neuf}, {160, Pont Alexandre III}]| Paris | France | 35 | 48.8566 | 2.3522 |
-| 2024-09-13T12:00:00 | [{48, Rialto Bridge}, {11, Bridge of Sighs}] | Venice | Italy | 2 | 45.4408 | 12.3155 |
-| 2024-09-13T12:00:00 | [{516, Charles Bridge}, {343, Legion Bridge}]| Prague | Czech Republic| 200 | 50.0755 | 14.4378 |
-| 2024-09-13T12:00:00 | [{375, Chain Bridge}, {333, Liberty Bridge}] | Budapest| Hungary | 96 | 47.4979 | 19.0402 |
-| 1990-09-13T12:00:00 | NULL | Warsaw | Poland | NULL | NULL | NULL |
+| 1990-09-13T12:00:00 | Warsaw | Poland | NULL | NULL | NULL | NULL | NULL |
\ No newline at end of file
diff --git a/docs/spark-docker.md b/docs/spark-docker.md
deleted file mode 100644
index d1200e2b3..000000000
--- a/docs/spark-docker.md
+++ /dev/null
@@ -1,164 +0,0 @@
-# Running Queries with Apache Spark in Docker
-
-There are [Bitnami Apache Spark docker images](https://hub.docker.com/r/bitnami/spark). These
-can be modified to be able to include the OpenSearch Spark PPL extension. With the OpenSearch
-Spark PPL extension, the docker image can be used to test PPL commands.
-
-The Bitnami Apache Spark image can be used to run a Spark cluster and also to run
-`spark-shell` for running queries.
-
-## Prepare OpenSearch Spark PPL Extension
-
-Create a local build or copy of the OpenSearch Spark PPL extension. Make a note of the
-location of the Jar file as well as the name of the Jar file.
-
-From the root of this repository, build the OpenSearch Spark PPL extension with:
-
-```
-sbt clean
-sbt assembly
-```
-
-Refer to the [Developer Guide](../DEVELOPER_GUIDE.md) for more information.
-
-## Using Docker Compose
-
-There are sample files in this repository at `docker/apache-spark-sample` They can be used to
-start up both nodes with the command:
-
-```
-docker compose up -d
-```
-
-The cluster can be stopped with:
-
-```
-docker compose down
-```
-
-### Configuration
-
-There is a file `docker/apache-spark-sample/.env` that can be edited to change some settings.
-
-| Variable Name | Description |
-|----------------|---------------------------------------------------|
-| MASTER_UI_PORT | Host port to bind to port 8080 of the master node |
-| MASTER_PORT | Host port to bind to port 7077 of the master node |
-| UI_PORT | Host port to bind to port 4040 of the master node |
-| PPL_JAR | Path to the PPL Jar file |
-
-## Running Spark Shell
-
-Can run `spark-shell` on the master node.
-
-```
-docker exec -it apache-spark-sample-spark-1 /opt/bitnami/spark/bin/spark-shell
-```
-
-Within the Spark Shell, you can submit queries, including PPL queries. For example a sample
-table can be created, populated and finally queried using PPL.
-
-```
-spark.sql("CREATE TABLE test_table(id int, name varchar(100))")
-spark.sql("INSERT INTO test_table (id, name) VALUES(1, 'Foo')")
-spark.sql("INSERT INTO test_table (id, name) VALUES(2, 'Bar')")
-spark.sql("source=test_table | eval x = id + 5 | fields x, name").show()
-```
-
-For further information, see the [Spark PPL Test Instructions](ppl-lang/local-spark-ppl-test-instruction.md)
-
-## Manual Setup
-
-### spark-conf
-
-Contains the Apache Spark configuration. Need to add three lines to the `spark-defaults.conf`
-file:
-```
-spark.sql.legacy.createHiveTableByDefault false
-spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions
-spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog
-```
-
-An example file available in this repository at `docker/apache-spark-sample/spark-defaults.conf`
-
-## Prepare OpenSearch Spark PPL Extension
-
-Create a local build or copy of the OpenSearch Spark PPL extension. Make a note of the
-location of the Jar file as well as the name of the Jar file.
-
-## Run the Spark Cluster
-
-Need to run a master node and a worker node. For these to communicate, first create a network
-for them to use.
-
-```
-docker network create spark-network
-```
-
-### Master Node
-
-The master node can be run with the following command:
-```
-docker run \
- -d \
- --name spark \
- --network spark-network \
- -p 8080:8080 \
- -p 7077:7077 \
- -p 4040:4040 \
- -e SPARK_MODE=master \
- -e SPARK_RPC_AUTHENTICATION_ENABLED=no \
- -e SPARK_RPC_ENCRYPTION_ENABLED=no \
- -e SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no \
- -e SPARK_SSL_ENABLED=no \
- -e SPARK_PUBLIC_DNS=localhost \
- -v :/opt/bitnami/spark/conf/spark-defaults.conf \
- -v /:/opt/bitnami/spark/jars/ \
- bitnami/spark:3.5.3
-```
-
-* `-d`
- Run the container in the background and return to the shell
-* `--name spark`
- Name the docker container `spark`
-* ``
- Replace with the path to the Spark configuration file.
-* ``
- Replace with the path to the directory containing the OpenSearch Spark PPL extension
- Jar file.
-* ``
- Replace with the filename of the OpenSearch Spark PPL extension Jar file.
-
-### Worker Node
-
-The worker node can be run with the following command:
-```
-docker run \
- -d \
- --name spark-worker \
- --network spark-network \
- -e SPARK_MODE=worker \
- -e SPARK_MASTER_URL=spark://spark:7077 \
- -e SPARK_WORKER_MEMORY=1G \
- -e SPARK_WORKER_CORES=1 \
- -e SPARK_RPC_AUTHENTICATION_ENABLED=no \
- -e SPARK_RPC_ENCRYPTION_ENABLED=no \
- -e SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no \
- -e SPARK_SSL_ENABLED=no \
- -e SPARK_PUBLIC_DNS=localhost \
- -v :/opt/bitnami/spark/conf/spark-defaults.conf \
- -v /:/opt/bitnami/spark/jars/ \
- bitnami/spark:3.5.3
-```
-
-* `-d`
- Run the container in the background and return to the shell
-* `--name spark-worker`
- Name the docker container `spark-worker`
-* ``
- Replace with the path to the Spark configuration file.
-* ``
- Replace with the path to the directory containing the OpenSearch Spark PPL extension
- Jar file.
-* ``
- Replace with the filename of the OpenSearch Spark PPL extension Jar file.
diff --git a/docs/spark-emr-docker.md b/docs/spark-emr-docker.md
deleted file mode 100644
index 7eef4d250..000000000
--- a/docs/spark-emr-docker.md
+++ /dev/null
@@ -1,147 +0,0 @@
-# Running Queries with Spark EMR in Docker
-
-Spark EMR images are available on the Amazon ECR Public Gallery. These can be modified to
-be able to include the OpenSearch Spark PPL extension. With the OpenSearch Spark PPL
-extension, the docker image can be used to test PPL commands.
-
-The Spark EMR image will run an Apache Spark app if one was specified and then shutdown.
-
-## Prepare OpenSearch Spark PPL Extension
-
-Create a local build or copy of the OpenSearch Spark PPL extension. Make a note of the
-location of the Jar file as well as the name of the Jar file.
-
-From the root of this repository, build the OpenSearch Spark PPL extension with:
-
-```
-sbt clean
-sbt assembly
-```
-
-Refer to the [Developer Guide](../DEVELOPER_GUIDE.md) for more information.
-
-## Using Docker Compose
-
-There are sample files in this repository at `docker/spark-emr-sample` They can be used to
-run the Spark EMR container:
-
-```
-docker compose up
-```
-
-Remove the docker resources afterwards with:
-
-```
-docker compose down
-```
-
-### Configuration
-
-There is a file `docker/spark-emr-sample/.env` that can be edited to change some settings.
-
-| Variable Name | Description |
-|----------------|---------------------------------------------------|
-| PPL_JAR | Path to the PPL Jar file |
-
-## Logs
-
-The logs are available in `/var/log/spark` in the docker container.
-
-STDERR for the app run is available in `/var/log/spark/user/stderr`.
-
-STDOUT for the app
-run is available in `/var/log/spark/user/stdout`.
-
-## Manual Setup
-
-Need to create two directories. These directories will be bound to the directories in the
-image.
-
-Look in `docker/spark-emr-sample` in this repository for samples of the directories
-described below.
-
-### logging-conf
-Contains two shell scripts that are run during startup to configure logging.
-* `run-adot-collector.sh`
-* `run-fluentd-spark.sh`
-
-Unless you need to make changes to the logging in the docker image, these can both be
-empty shell scripts.
-
-### spark-conf
-
-Contains the Apache Spark configuration. Need to add three lines to the `spark-defaults.conf`
-file:
-```
-spark.sql.legacy.createHiveTableByDefault false
-spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions
-spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog
-```
-
-## Create a Spark App
-
-An Apache Spark app is needed to provide queries to be run on the Spark EMR instance.
-The image has been tested with an app written in Scala.
-
-An example app is available in this repository in `docker/spark-sample--app`.
-
-### Bulid the Example App
-
-The example app can be built using [SBT](https://www.scala-sbt.org/).
-```
-cd docker/spark-sample-app
-sbt clean package
-```
-
-This will produce a Jar file in `docker/spark-sample-app/target/scala-2.12`
-that can be used with the Spark EMR image.
-
-## Prepare OpenSearch Spark PPL Extension
-
-Create a local build or copy of the OpenSearch Spark PPL extension. Make a note of the
-location of the Jar file as well as the name of the Jar file.
-
-## Run the Spark EMR Image
-
-The Spark EMR image can be run with the following command from the root of this repository:
-```
-docker run \
- --name spark-emr \
- -v ./docker/spark-emr-sample/logging-conf:/var/loggingConfiguration/spark \
- -v ./docker/spark-sample-app/target/scala-2.12:/app \
- -v ./docker/spark-emr-sample/spark-conf:/etc/spark/conf \
- -v /:/usr/lib/spark/jars/ \
- public.ecr.aws/emr-serverless/spark/emr-7.5.0:20241125 \
- driver \
- --class MyApp \
- /app/myapp_2.12-1.0.jar
-```
-
-* `--name spark-emr`
- Name the docker container `spark-emr`
-* `-v ./docker/spark-emr-sample/logging-conf:/var/loggingConfiguration/spark`
-
- Bind the directory containing logging shell scripts to the docker image. Needs to bind
- to `/var/loggingConfiguration/spark` in the image.
-* `-v ./docker/spark-sample-app/target/scala-2.12:/app`
-
- Bind the directory containing the Apache Spark app Jar file to a location in the
- docker image. The directory in the docker image must match the path used in the final
- argument.
-* `-v ./docker/spark-emr-sample/spark-conf:/etc/spark/conf`
-
- Bind the directory containing the Apache Spark configuration. Needs to bind to
- `/etc/spark/conf` in the image.
-* ``
- Replace with the path to the directory containing the OpenSearch Spark PPL extension
- Jar file.
-* ``
- Replace with the filename of the OpenSearch Spark PPL extension Jar file.
-* `driver`
- Start the Spark EMR container as a driver. This will run `spark-submit` to run an
- app.
-* `--class MyApp`
- The main class of the Spark App to run.
-* `/app/myapp_2.12-1.0.jar`
- The full path within the docker container where the Jar file of the Spark app is
- located.
diff --git a/flint-core/src/main/java/org/opensearch/flint/core/IRestHighLevelClient.java b/flint-core/src/main/java/org/opensearch/flint/core/IRestHighLevelClient.java
index 721685c38..9facd89ef 100644
--- a/flint-core/src/main/java/org/opensearch/flint/core/IRestHighLevelClient.java
+++ b/flint-core/src/main/java/org/opensearch/flint/core/IRestHighLevelClient.java
@@ -98,15 +98,15 @@ static void recordLatency(String metricNamePrefix, long latencyMilliseconds) {
* Otherwise, it increments a general failure metric counter based on the status code category (e.g., 4xx, 5xx).
*
* @param metricNamePrefix the prefix for the metric name which is used to construct the full metric name for failure
- * @param t the exception encountered during the operation, used to determine the type of failure
+ * @param e the exception encountered during the operation, used to determine the type of failure
*/
- static void recordOperationFailure(String metricNamePrefix, Throwable t) {
- OpenSearchException openSearchException = extractOpenSearchException(t);
+ static void recordOperationFailure(String metricNamePrefix, Exception e) {
+ OpenSearchException openSearchException = extractOpenSearchException(e);
int statusCode = openSearchException != null ? openSearchException.status().getStatus() : 500;
if (openSearchException != null) {
CustomLogging.logError(new OperationMessage("OpenSearch Operation failed.", statusCode), openSearchException);
} else {
- CustomLogging.logError("OpenSearch Operation failed with an exception.", t);
+ CustomLogging.logError("OpenSearch Operation failed with an exception.", e);
}
if (statusCode == 403) {
String forbiddenErrorMetricName = metricNamePrefix + ".403.count";
diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/FlintOptions.java b/flint-core/src/main/scala/org/opensearch/flint/core/FlintOptions.java
index f9d181b70..6ddc6ae9c 100644
--- a/flint-core/src/main/scala/org/opensearch/flint/core/FlintOptions.java
+++ b/flint-core/src/main/scala/org/opensearch/flint/core/FlintOptions.java
@@ -88,11 +88,7 @@ public class FlintOptions implements Serializable {
public static final int DEFAULT_SOCKET_TIMEOUT_MILLIS = 60000;
public static final int DEFAULT_INACTIVITY_LIMIT_MILLIS = 3 * 60 * 1000;
-
- public static final String REQUEST_COMPLETION_DELAY_MILLIS = "request.completionDelayMillis";
- public static final int DEFAULT_REQUEST_COMPLETION_DELAY_MILLIS = 0;
- public static final int DEFAULT_AOSS_REQUEST_COMPLETION_DELAY_MILLIS = 2000;
-
+
public static final String DATA_SOURCE_NAME = "spark.flint.datasource.name";
public static final String BATCH_BYTES = "write.batch_bytes";
@@ -182,13 +178,6 @@ public int getSocketTimeoutMillis() {
return Integer.parseInt(options.getOrDefault(SOCKET_TIMEOUT_MILLIS, String.valueOf(DEFAULT_SOCKET_TIMEOUT_MILLIS)));
}
- public int getRequestCompletionDelayMillis() {
- int defaultValue = SERVICE_NAME_AOSS.equals(getServiceName())
- ? DEFAULT_AOSS_REQUEST_COMPLETION_DELAY_MILLIS
- : DEFAULT_REQUEST_COMPLETION_DELAY_MILLIS;
- return Integer.parseInt(options.getOrDefault(REQUEST_COMPLETION_DELAY_MILLIS, String.valueOf(defaultValue)));
- }
-
public String getDataSourceName() {
return options.getOrDefault(DATA_SOURCE_NAME, "");
}
diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/http/FlintRetryOptions.java b/flint-core/src/main/scala/org/opensearch/flint/core/http/FlintRetryOptions.java
index 597f441ec..8f6e2c07e 100644
--- a/flint-core/src/main/scala/org/opensearch/flint/core/http/FlintRetryOptions.java
+++ b/flint-core/src/main/scala/org/opensearch/flint/core/http/FlintRetryOptions.java
@@ -6,12 +6,8 @@
package org.opensearch.flint.core.http;
import static java.time.temporal.ChronoUnit.SECONDS;
-import static org.opensearch.flint.core.FlintOptions.SERVICE_NAME;
-import static org.opensearch.flint.core.FlintOptions.SERVICE_NAME_AOSS;
-import static org.opensearch.flint.core.FlintOptions.SERVICE_NAME_ES;
import dev.failsafe.RetryPolicy;
-import dev.failsafe.RetryPolicyBuilder;
import dev.failsafe.event.ExecutionAttemptedEvent;
import dev.failsafe.function.CheckedPredicate;
import java.time.Duration;
@@ -20,7 +16,6 @@
import java.util.logging.Logger;
import org.opensearch.action.bulk.BulkResponse;
import org.opensearch.flint.core.http.handler.ExceptionClassNameFailurePredicate;
-import org.opensearch.flint.core.http.handler.HttpAOSSResultPredicate;
import org.opensearch.flint.core.http.handler.HttpStatusCodeResultPredicate;
import java.io.Serializable;
@@ -70,7 +65,7 @@ public boolean isRetryEnabled() {
* @return Failsafe retry policy
*/
public RetryPolicy getRetryPolicy() {
- RetryPolicyBuilder builder = RetryPolicy.builder()
+ return RetryPolicy.builder()
// Backoff strategy config (can be configurable as needed in future)
.withBackoff(1, 30, SECONDS)
.withJitter(Duration.ofMillis(100))
@@ -80,11 +75,8 @@ public RetryPolicy getRetryPolicy() {
.handleResultIf(new HttpStatusCodeResultPredicate<>(getRetryableHttpStatusCodes()))
// Logging listener
.onFailedAttempt(FlintRetryOptions::onFailure)
- .onRetry(FlintRetryOptions::onRetry);
- if (SERVICE_NAME_AOSS.equals(getServiceName())) {
- builder.handleResultIf(new HttpAOSSResultPredicate<>());
- }
- return builder.build();
+ .onRetry(FlintRetryOptions::onRetry)
+ .build();
}
public RetryPolicy getBulkRetryPolicy(CheckedPredicate resultPredicate) {
@@ -109,10 +101,6 @@ private static void onRetry(ExecutionAttemptedEvent event) {
LOG.warning("Retrying failed request at #" + event.getAttemptCount());
}
- private String getServiceName() {
- return options.getOrDefault(SERVICE_NAME, SERVICE_NAME_ES);
- }
-
/**
* @return maximum retry option value
*/
diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/http/handler/HttpAOSSResultPredicate.java b/flint-core/src/main/scala/org/opensearch/flint/core/http/handler/HttpAOSSResultPredicate.java
deleted file mode 100644
index 8bfb05fa3..000000000
--- a/flint-core/src/main/scala/org/opensearch/flint/core/http/handler/HttpAOSSResultPredicate.java
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * Copyright OpenSearch Contributors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-package org.opensearch.flint.core.http.handler;
-
-import dev.failsafe.function.CheckedPredicate;
-import org.apache.http.HttpEntity;
-import org.apache.http.HttpResponse;
-import org.apache.http.entity.BufferedHttpEntity;
-import org.apache.http.util.EntityUtils;
-
-import java.util.logging.Logger;
-
-/**
- * Failure handler based on HTTP response from AOSS.
- *
- * @param result type (supposed to be HttpResponse for OS client)
- */
-public class HttpAOSSResultPredicate implements CheckedPredicate {
-
- private static final Logger LOG = Logger.getLogger(HttpAOSSResultPredicate.class.getName());
-
- public static final int BAD_REQUEST_STATUS_CODE = 400;
- public static final String RESOURCE_ALREADY_EXISTS_EXCEPTION_MESSAGE = "resource_already_exists_exception";
-
- public HttpAOSSResultPredicate() { }
-
- @Override
- public boolean test(T result) throws Throwable {
- LOG.info("Checking if response is retryable");
-
- int statusCode = ((HttpResponse) result).getStatusLine().getStatusCode();
- if (statusCode != BAD_REQUEST_STATUS_CODE) {
- LOG.info("Status code " + statusCode + " is not " + BAD_REQUEST_STATUS_CODE + ". Check result: false");
- return false;
- }
-
- HttpResponse response = (HttpResponse) result;
- HttpEntity entity = response.getEntity();
- if (entity == null) {
- LOG.info("No response entity found. Check result: false");
- return false;
- }
-
- // Buffer the entity to make it repeatable, so that this retry test does not consume the content stream,
- // resulting in the request caller getting empty response
- BufferedHttpEntity bufferedEntity = new BufferedHttpEntity(entity);
- response.setEntity(bufferedEntity);
-
- try {
- String responseContent = EntityUtils.toString(bufferedEntity);
- // Effectively restores the content stream of the response
- bufferedEntity.getContent().reset();
-
- boolean isRetryable = responseContent.contains(RESOURCE_ALREADY_EXISTS_EXCEPTION_MESSAGE);
-
- LOG.info("Check retryable response result: " + isRetryable);
- return isRetryable;
- } catch (Exception e) {
- LOG.info("Unable to parse response body. Check result: false");
- return false;
- }
- }
-}
diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintOpenSearchClient.java b/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintOpenSearchClient.java
index 5861ccf22..2bc097bba 100644
--- a/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintOpenSearchClient.java
+++ b/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintOpenSearchClient.java
@@ -44,7 +44,6 @@ public void createIndex(String indexName, FlintMetadata metadata) {
LOG.info("Creating Flint index " + indexName + " with metadata " + metadata);
try {
createIndex(indexName, FlintOpenSearchIndexMetadataService.serialize(metadata, false), metadata.indexSettings());
- waitRequestComplete(); // Delay to ensure create is complete before making other requests for the index
emitIndexCreationSuccessMetric(metadata.kind());
} catch (IllegalStateException ex) {
emitIndexCreationFailureMetric(metadata.kind());
@@ -132,14 +131,6 @@ private String sanitizeIndexName(String indexName) {
return OpenSearchClientUtils.sanitizeIndexName(indexName);
}
- private void waitRequestComplete() {
- try {
- Thread.sleep(options.getRequestCompletionDelayMillis());
- } catch (InterruptedException e) {
- Thread.currentThread().interrupt();
- }
- }
-
private void emitIndexCreationSuccessMetric(String indexKind) {
emitIndexCreationMetric(indexKind, "success");
}
diff --git a/flint-core/src/test/scala/org/opensearch/flint/core/http/RetryableHttpAsyncClientSuite.scala b/flint-core/src/test/scala/org/opensearch/flint/core/http/RetryableHttpAsyncClientSuite.scala
index 8a8927920..7d3b79a9e 100644
--- a/flint-core/src/test/scala/org/opensearch/flint/core/http/RetryableHttpAsyncClientSuite.scala
+++ b/flint-core/src/test/scala/org/opensearch/flint/core/http/RetryableHttpAsyncClientSuite.scala
@@ -12,13 +12,11 @@ import java.util.concurrent.{ExecutionException, Future}
import scala.collection.JavaConverters.mapAsJavaMapConverter
-import org.apache.http.HttpEntity
import org.apache.http.HttpResponse
import org.apache.http.concurrent.FutureCallback
import org.apache.http.impl.nio.client.{CloseableHttpAsyncClient, HttpAsyncClientBuilder}
import org.apache.http.nio.protocol.{HttpAsyncRequestProducer, HttpAsyncResponseConsumer}
import org.apache.http.protocol.HttpContext
-import org.apache.http.util.EntityUtils
import org.mockito.ArgumentMatchers.any
import org.mockito.Mockito._
import org.mockito.verification.VerificationMode
@@ -155,23 +153,6 @@ class RetryableHttpAsyncClientSuite extends AnyFlatSpec with BeforeAndAfter with
expectFutureGetTimes = times(0))
}
- it should "retry if AOSS response is retryable" in {
- retryableClient
- .withOption("auth.servicename", "aoss")
- .whenResponse(
- 400,
- "OpenSearchStatusException[OpenSearch exception [type=resource_already_exists_exception,")
- .shouldExecute(times(DEFAULT_MAX_RETRIES + 1))
- }
-
- it should "not apply retry policy for AOSS response if service is not AOSS" in {
- retryableClient
- .whenResponse(
- 400,
- "OpenSearchStatusException[OpenSearch exception [type=resource_already_exists_exception,")
- .shouldExecute(times(1))
- }
-
private def retryableClient: AssertionHelper = new AssertionHelper
class AssertionHelper {
@@ -194,17 +175,6 @@ class RetryableHttpAsyncClientSuite extends AnyFlatSpec with BeforeAndAfter with
this
}
- def whenResponse(statusCode: Int, responseMessage: String): AssertionHelper = {
- val entity = mock[HttpEntity](RETURNS_DEEP_STUBS)
- mockStatic(classOf[EntityUtils])
- when(EntityUtils.toString(any[HttpEntity])).thenReturn(responseMessage)
- val response = mock[HttpResponse](RETURNS_DEEP_STUBS)
- when(response.getStatusLine.getStatusCode).thenReturn(statusCode)
- when(response.getEntity).thenReturn(entity)
- when(future.get()).thenReturn(response)
- this
- }
-
def shouldExecute(expectExecuteTimes: VerificationMode): Unit = {
shouldExecute(expectExecuteTimes, expectExecuteTimes)
}
diff --git a/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala b/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala
index 364a8a1de..bdcc120c0 100644
--- a/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala
+++ b/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala
@@ -201,11 +201,6 @@ object FlintSparkConf {
.datasourceOption()
.doc("socket duration in milliseconds")
.createWithDefault(String.valueOf(FlintOptions.DEFAULT_SOCKET_TIMEOUT_MILLIS))
- val REQUEST_COMPLETION_DELAY_MILLIS =
- FlintConfig(s"spark.datasource.flint.${FlintOptions.REQUEST_COMPLETION_DELAY_MILLIS}")
- .datasourceOption()
- .doc("delay in milliseconds after index creation is completed")
- .createOptional()
val DATA_SOURCE_NAME =
FlintConfig(s"spark.flint.datasource.name")
.doc("data source name")
@@ -361,8 +356,7 @@ case class FlintSparkConf(properties: JMap[String, String]) extends Serializable
REQUEST_INDEX,
METADATA_ACCESS_AWS_CREDENTIALS_PROVIDER,
EXCLUDE_JOB_IDS,
- SCROLL_SIZE,
- REQUEST_COMPLETION_DELAY_MILLIS)
+ SCROLL_SIZE)
.map(conf => (conf.optionKey, conf.readFrom(reader)))
.flatMap {
case (_, None) => None
diff --git a/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/datatype/FlintDataType.scala b/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/datatype/FlintDataType.scala
index 19fe28a2d..a4b23bd46 100644
--- a/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/datatype/FlintDataType.scala
+++ b/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/datatype/FlintDataType.scala
@@ -142,7 +142,6 @@ object FlintDataType {
case ByteType => JObject("type" -> JString("byte"))
case DoubleType => JObject("type" -> JString("double"))
case FloatType => JObject("type" -> JString("float"))
- case DecimalType() => JObject("type" -> JString("double"))
// Date
case TimestampType | _: TimestampNTZType =>
@@ -154,9 +153,6 @@ object FlintDataType {
// objects
case st: StructType => serializeJValue(st)
- // Serialize maps as empty objects and let the map entries automap
- case mt: MapType => serializeJValue(new StructType())
-
// array
case ArrayType(elementType, _) => serializeField(elementType, Metadata.empty)
diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
index fbc24e93a..68d2409ee 100644
--- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
+++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala
@@ -510,10 +510,6 @@ class FlintSpark(val spark: SparkSession) extends FlintSparkTransactionSupport w
private def isSchedulerModeChanged(
originalOptions: FlintSparkIndexOptions,
updatedOptions: FlintSparkIndexOptions): Boolean = {
- // Altering from manual to auto should not be interpreted as a scheduling mode change.
- if (!originalOptions.options.contains(SCHEDULER_MODE.toString)) {
- return false
- }
updatedOptions.isExternalSchedulerEnabled() != originalOptions.isExternalSchedulerEnabled()
}
diff --git a/flint-spark-integration/src/test/scala/org/apache/spark/FlintSuite.scala b/flint-spark-integration/src/test/scala/org/apache/spark/FlintSuite.scala
index 1d301087f..b675265b7 100644
--- a/flint-spark-integration/src/test/scala/org/apache/spark/FlintSuite.scala
+++ b/flint-spark-integration/src/test/scala/org/apache/spark/FlintSuite.scala
@@ -12,7 +12,6 @@ import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation
import org.apache.spark.sql.flint.config.{FlintConfigEntry, FlintSparkConf}
import org.apache.spark.sql.flint.config.FlintSparkConf.{EXTERNAL_SCHEDULER_ENABLED, HYBRID_SCAN_ENABLED, METADATA_CACHE_WRITE}
import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.internal.StaticSQLConf.WAREHOUSE_PATH
import org.apache.spark.sql.test.SharedSparkSession
trait FlintSuite extends SharedSparkSession {
@@ -31,7 +30,6 @@ trait FlintSuite extends SharedSparkSession {
.set(
FlintSparkConf.CUSTOM_FLINT_SCHEDULER_CLASS.key,
"org.opensearch.flint.core.scheduler.AsyncQuerySchedulerBuilderTest$AsyncQuerySchedulerForLocalTest")
- .set(WAREHOUSE_PATH.key, s"spark-warehouse/${suiteName}")
conf
}
diff --git a/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/config/FlintSparkConfSuite.scala b/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/config/FlintSparkConfSuite.scala
index 594322bae..0cde6ab0f 100644
--- a/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/config/FlintSparkConfSuite.scala
+++ b/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/config/FlintSparkConfSuite.scala
@@ -114,21 +114,6 @@ class FlintSparkConfSuite extends FlintSuite {
}
}
- test("test request completionDelayMillis default value") {
- FlintSparkConf().flintOptions().getRequestCompletionDelayMillis shouldBe 0
- }
-
- test("test request completionDelayMillis default value for aoss") {
- val options = FlintSparkConf(Map("auth.servicename" -> "aoss").asJava).flintOptions()
- options.getRequestCompletionDelayMillis shouldBe 2000
- }
-
- test("test specified request completionDelayMillis") {
- val options =
- FlintSparkConf(Map("request.completionDelayMillis" -> "1000").asJava).flintOptions()
- options.getRequestCompletionDelayMillis shouldBe 1000
- }
-
test("externalSchedulerIntervalThreshold should return default value when empty") {
val options = FlintSparkConf(Map("spark.flint.job.externalScheduler.interval" -> "").asJava)
assert(options
diff --git a/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/datatype/FlintDataTypeSuite.scala b/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/datatype/FlintDataTypeSuite.scala
index 312f3a5a1..94f4839d6 100644
--- a/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/datatype/FlintDataTypeSuite.scala
+++ b/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/datatype/FlintDataTypeSuite.scala
@@ -128,35 +128,6 @@ class FlintDataTypeSuite extends FlintSuite with Matchers {
|}""".stripMargin)
}
- test("spark map type serialize") {
- val sparkStructType = StructType(
- StructField("mapField", MapType(StringType, StringType), true) ::
- Nil)
-
- FlintDataType.serialize(sparkStructType) shouldBe compactJson("""{
- | "properties": {
- | "mapField": {
- | "properties": {
- | }
- | }
- | }
- |}""".stripMargin)
- }
-
- test("spark decimal type serialize") {
- val sparkStructType = StructType(
- StructField("decimalField", DecimalType(1, 1), true) ::
- Nil)
-
- FlintDataType.serialize(sparkStructType) shouldBe compactJson("""{
- | "properties": {
- | "decimalField": {
- | "type": "double"
- | }
- | }
- |}""".stripMargin)
- }
-
test("spark varchar and char type serialize") {
val flintDataType = """{
| "properties": {
diff --git a/integ-test/src/integration/scala/org/apache/spark/sql/FlintJobITSuite.scala b/integ-test/src/integration/scala/org/apache/spark/sql/FlintJobITSuite.scala
index 81bf60f5e..11bc7271c 100644
--- a/integ-test/src/integration/scala/org/apache/spark/sql/FlintJobITSuite.scala
+++ b/integ-test/src/integration/scala/org/apache/spark/sql/FlintJobITSuite.scala
@@ -81,42 +81,36 @@ class FlintJobITSuite extends FlintSparkSuite with JobTest {
}
}
- def createJobOperator(query: String, jobRunId: String): JobOperator = {
- val streamingRunningCount = new AtomicInteger(0)
-
- /*
- * Because we cannot test from FlintJob.main() for the reason below, we have to configure
- * all Spark conf required by Flint code underlying manually.
- */
- spark.conf.set(DATA_SOURCE_NAME.key, dataSourceName)
- spark.conf.set(JOB_TYPE.key, FlintJobType.STREAMING)
-
- val job = JobOperator(
- appId,
- jobRunId,
- spark,
- query,
- queryId,
- dataSourceName,
- resultIndex,
- FlintJobType.STREAMING,
- streamingRunningCount)
- job.terminateJVM = false
- job
- }
-
def startJob(query: String, jobRunId: String): Future[Unit] = {
val prefix = "flint-job-test"
val threadPool = ThreadUtils.newDaemonThreadPoolScheduledExecutor(prefix, 1)
implicit val executionContext = ExecutionContext.fromExecutor(threadPool)
+ val streamingRunningCount = new AtomicInteger(0)
val futureResult = Future {
+ /*
+ * Because we cannot test from FlintJob.main() for the reason below, we have to configure
+ * all Spark conf required by Flint code underlying manually.
+ */
+ spark.conf.set(DATA_SOURCE_NAME.key, dataSourceName)
+ spark.conf.set(JOB_TYPE.key, FlintJobType.STREAMING)
/**
* FlintJob.main() is not called because we need to manually set these variables within a
* JobOperator instance to accommodate specific runtime requirements.
*/
- val job = createJobOperator(query, jobRunId)
+ val job =
+ JobOperator(
+ appId,
+ jobRunId,
+ spark,
+ query,
+ queryId,
+ dataSourceName,
+ resultIndex,
+ FlintJobType.STREAMING,
+ streamingRunningCount)
+ job.terminateJVM = false
job.start()
}
futureResult.onComplete {
@@ -297,10 +291,6 @@ class FlintJobITSuite extends FlintSparkSuite with JobTest {
}
test("create skipping index with non-existent table") {
- val prefix = "flint-job-test"
- val threadPool = ThreadUtils.newDaemonThreadPoolScheduledExecutor(prefix, 1)
- implicit val executionContext = ExecutionContext.fromExecutor(threadPool)
-
val query =
s"""
| CREATE SKIPPING INDEX ON testTable
@@ -313,9 +303,7 @@ class FlintJobITSuite extends FlintSparkSuite with JobTest {
| """.stripMargin
val queryStartTime = System.currentTimeMillis()
val jobRunId = "00ff4o3b5091080r"
-
- val job = createJobOperator(query, jobRunId)
- threadLocalFuture.set(Future(job.start()))
+ threadLocalFuture.set(startJob(query, jobRunId))
val validation: REPLResult => Boolean = result => {
assert(
@@ -327,9 +315,6 @@ class FlintJobITSuite extends FlintSparkSuite with JobTest {
assert(result.status == "FAILED", s"expected status is FAILED, but got ${result.status}")
assert(!result.error.isEmpty, s"we expect error, but got ${result.error}")
- assert(
- job.throwableHandler.error.contains("Table spark_catalog.default.testTable is not found"),
- "Expected error message to mention 'spark_catalog.default.testTable is not found'")
commonAssert(result, jobRunId, query, queryStartTime)
true
}
diff --git a/integ-test/src/integration/scala/org/opensearch/flint/core/FlintOpenSearchClientSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/core/FlintOpenSearchClientSuite.scala
index fe3cefef8..a2c2d26f6 100644
--- a/integ-test/src/integration/scala/org/opensearch/flint/core/FlintOpenSearchClientSuite.scala
+++ b/integ-test/src/integration/scala/org/opensearch/flint/core/FlintOpenSearchClientSuite.scala
@@ -65,27 +65,6 @@ class FlintOpenSearchClientSuite extends AnyFlatSpec with OpenSearchSuite with M
(settings \ "index.number_of_replicas").extract[String] shouldBe "2"
}
- it should "create index with request completion delay config" in {
- val metadata = FlintOpenSearchIndexMetadataService.deserialize("{}")
- // Create a dummy index to avoid timing the initial overhead
- flintClient.createIndex("dummy", metadata)
-
- val indexName = "flint_test_without_request_completion_delay"
- val elapsedTimeWithoutDelay = timer {
- flintClient.createIndex(indexName, metadata)
- }
-
- val delayIndexName = "flint_test_with_request_completion_delay"
- val delayOptions =
- openSearchOptions + (FlintOptions.REQUEST_COMPLETION_DELAY_MILLIS -> "2000")
- val delayFlintOptions = new FlintOptions(delayOptions.asJava)
- val delayFlintClient = new FlintOpenSearchClient(delayFlintOptions)
- val elapsedTimeWithDelay = timer {
- delayFlintClient.createIndex(delayIndexName, metadata)
- }
- elapsedTimeWithDelay - elapsedTimeWithoutDelay should be >= 1800L // allowing 200ms of wiggle room
- }
-
it should "get all index names with the given index name pattern" in {
val metadata = FlintOpenSearchIndexMetadataService.deserialize(
"""{"properties": {"test": { "type": "integer" } } }""")
@@ -241,11 +220,4 @@ class FlintOpenSearchClientSuite extends AnyFlatSpec with OpenSearchSuite with M
def createTable(indexName: String, options: FlintOptions): Table = {
OpenSearchCluster.apply(indexName, options).asScala.head
}
-
- def timer(block: => Unit): Long = {
- val start = System.currentTimeMillis()
- block
- val end = System.currentTimeMillis()
- end - start
- }
}
diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkMaterializedViewSqlITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkMaterializedViewSqlITSuite.scala
index bf5e6309e..ae2e53090 100644
--- a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkMaterializedViewSqlITSuite.scala
+++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkMaterializedViewSqlITSuite.scala
@@ -523,45 +523,5 @@ class FlintSparkMaterializedViewSqlITSuite extends FlintSparkSuite {
}
}
- test("create materialized view with decimal and map types") {
- val decimalAndMapTable = s"$catalogName.default.mv_test_decimal_map"
- val decimalAndMapMv = s"$catalogName.default.mv_test_decimal_map_ser"
- withTable(decimalAndMapTable) {
- createMapAndDecimalTimeSeriesTable(decimalAndMapTable)
-
- withTempDir { checkpointDir =>
- sql(s"""
- | CREATE MATERIALIZED VIEW $decimalAndMapMv
- | AS
- | SELECT
- | base_score, mymap
- | FROM $decimalAndMapTable
- | WITH (
- | auto_refresh = true,
- | checkpoint_location = '${checkpointDir.getAbsolutePath}'
- | )
- |""".stripMargin)
-
- // Wait for streaming job complete current micro batch
- val flintIndex = getFlintIndexName(decimalAndMapMv)
- val job = spark.streams.active.find(_.name == flintIndex)
- job shouldBe defined
- failAfter(streamingTimeout) {
- job.get.processAllAvailable()
- }
-
- flint.describeIndex(flintIndex) shouldBe defined
- checkAnswer(
- flint.queryIndex(flintIndex).select("base_score", "mymap"),
- Seq(
- Row(3.1415926, Row(null, null, null, null, "mapvalue1")),
- Row(4.1415926, Row("mapvalue2", null, null, null, null)),
- Row(5.1415926, Row(null, null, "mapvalue3", null, null)),
- Row(6.1415926, Row(null, null, null, "mapvalue4", null)),
- Row(7.1415926, Row(null, "mapvalue5", null, null, null))))
- }
- }
- }
-
private def timestamp(ts: String): Timestamp = Timestamp.valueOf(ts)
}
diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala
index 7c19cab12..68d370791 100644
--- a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala
+++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala
@@ -445,34 +445,6 @@ trait FlintSparkSuite extends QueryTest with FlintSuite with OpenSearchSuite wit
sql(s"INSERT INTO $testTable VALUES (TIMESTAMP '2023-10-01 03:00:00', 'E', 15, 'Vancouver')")
}
- protected def createMapAndDecimalTimeSeriesTable(testTable: String): Unit = {
- // CSV tables do not support MAP types so we use JSON instead
- val finalTableType = if (tableType == "CSV") "JSON" else tableType
-
- sql(s"""
- | CREATE TABLE $testTable
- | (
- | time TIMESTAMP,
- | name STRING,
- | age INT,
- | base_score DECIMAL(8, 7),
- | mymap MAP
- | )
- | USING $finalTableType $tableOptions
- |""".stripMargin)
-
- sql(
- s"INSERT INTO $testTable VALUES (TIMESTAMP '2023-10-01 00:01:00', 'A', 30, 3.1415926, Map('mapkey1', 'mapvalue1'))")
- sql(
- s"INSERT INTO $testTable VALUES (TIMESTAMP '2023-10-01 00:10:00', 'B', 20, 4.1415926, Map('mapkey2', 'mapvalue2'))")
- sql(
- s"INSERT INTO $testTable VALUES (TIMESTAMP '2023-10-01 00:15:00', 'C', 35, 5.1415926, Map('mapkey3', 'mapvalue3'))")
- sql(
- s"INSERT INTO $testTable VALUES (TIMESTAMP '2023-10-01 01:00:00', 'D', 40, 6.1415926, Map('mapkey4', 'mapvalue4'))")
- sql(
- s"INSERT INTO $testTable VALUES (TIMESTAMP '2023-10-01 03:00:00', 'E', 15, 7.1415926, Map('mapkey5', 'mapvalue5'))")
- }
-
protected def createTimeSeriesTransactionTable(testTable: String): Unit = {
sql(s"""
| CREATE TABLE $testTable
diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkUpdateIndexITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkUpdateIndexITSuite.scala
index f27c0dae9..c9f6c47f7 100644
--- a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkUpdateIndexITSuite.scala
+++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkUpdateIndexITSuite.scala
@@ -618,44 +618,6 @@ class FlintSparkUpdateIndexITSuite extends FlintSparkSuite {
flint.queryIndex(testIndex).collect().toSet should have size 2
}
- test("update full refresh index to auto refresh should start job with external scheduler") {
- setFlintSparkConf(FlintSparkConf.EXTERNAL_SCHEDULER_ENABLED, "true")
-
- withTempDir { checkpointDir =>
- // Create full refresh Flint index
- flint
- .skippingIndex()
- .onTable(testTable)
- .addPartitions("year", "month")
- .options(FlintSparkIndexOptions(Map("auto_refresh" -> "false")), testIndex)
- .create()
-
- spark.streams.active.find(_.name == testIndex) shouldBe empty
- flint.queryIndex(testIndex).collect().toSet should have size 0
- val indexInitial = flint.describeIndex(testIndex).get
- indexInitial.options.isExternalSchedulerEnabled() shouldBe false
-
- val updatedIndex = flint
- .skippingIndex()
- .copyWithUpdate(
- indexInitial,
- FlintSparkIndexOptions(
- Map(
- "auto_refresh" -> "true",
- "checkpoint_location" -> checkpointDir.getAbsolutePath)))
-
- val jobId = flint.updateIndex(updatedIndex)
- jobId shouldBe empty
- val indexFinal = flint.describeIndex(testIndex).get
- indexFinal.options.isExternalSchedulerEnabled() shouldBe true
- indexFinal.options.autoRefresh() shouldBe true
- indexFinal.options.refreshInterval() shouldBe Some(
- FlintOptions.DEFAULT_EXTERNAL_SCHEDULER_INTERVAL)
-
- verifySchedulerIndex(testIndex, 5, "MINUTES")
- }
- }
-
test("update incremental refresh index to auto refresh should start job") {
withTempDir { checkpointDir =>
// Create incremental refresh Flint index and wait for complete
@@ -705,51 +667,6 @@ class FlintSparkUpdateIndexITSuite extends FlintSparkSuite {
}
}
- test(
- "update incremental refresh index to auto refresh should start job with external scheduler") {
- setFlintSparkConf(FlintSparkConf.EXTERNAL_SCHEDULER_ENABLED, "true")
-
- withTempDir { checkpointDir =>
- // Create incremental refresh Flint index
- flint
- .skippingIndex()
- .onTable(testTable)
- .addPartitions("year", "month")
- .options(
- FlintSparkIndexOptions(
- Map(
- "incremental_refresh" -> "true",
- "checkpoint_location" -> checkpointDir.getAbsolutePath)),
- testIndex)
- .create()
-
- spark.streams.active.find(_.name == testIndex) shouldBe empty
- flint.queryIndex(testIndex).collect().toSet should have size 0
- val indexInitial = flint.describeIndex(testIndex).get
- indexInitial.options.isExternalSchedulerEnabled() shouldBe false
-
- val updatedIndex = flint
- .skippingIndex()
- .copyWithUpdate(
- indexInitial,
- FlintSparkIndexOptions(
- Map(
- "auto_refresh" -> "true",
- "incremental_refresh" -> "false",
- "checkpoint_location" -> checkpointDir.getAbsolutePath)))
-
- val jobId = flint.updateIndex(updatedIndex)
- jobId shouldBe empty
- val indexFinal = flint.describeIndex(testIndex).get
- indexFinal.options.isExternalSchedulerEnabled() shouldBe true
- indexFinal.options.autoRefresh() shouldBe true
- indexFinal.options.refreshInterval() shouldBe Some(
- FlintOptions.DEFAULT_EXTERNAL_SCHEDULER_INTERVAL)
-
- verifySchedulerIndex(testIndex, 5, "MINUTES")
- }
- }
-
test("update auto refresh index to full refresh should stop job") {
// Create auto refresh Flint index and wait for complete
flint
diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCastITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCastITSuite.scala
deleted file mode 100644
index a9b01b9e3..000000000
--- a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCastITSuite.scala
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Copyright OpenSearch Contributors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-package org.opensearch.flint.spark.ppl
-
-import java.sql.Date
-import java.sql.Timestamp
-
-import org.apache.spark.sql.{QueryTest, Row}
-import org.apache.spark.sql.streaming.StreamTest
-
-class FlintSparkPPLCastITSuite
- extends QueryTest
- with LogicalPlanTestUtils
- with FlintPPLSuite
- with StreamTest {
-
- /** Test table and index name */
- private val testTable = "spark_catalog.default.flint_ppl_test"
-
- override def beforeAll(): Unit = {
- super.beforeAll()
- // Create test table
- createNullableJsonContentTable(testTable)
- }
-
- protected override def afterEach(): Unit = {
- super.afterEach()
- // Stop all streaming jobs if any
- spark.streams.active.foreach { job =>
- job.stop()
- job.awaitTermination()
- }
- }
-
- test("test cast number to compatible data types") {
- val frame = sql(s"""
- | source=$testTable | eval
- | id_string = cast(id as string),
- | id_double = cast(id as double),
- | id_long = cast(id as long),
- | id_boolean = cast(id as boolean)
- | | fields id, id_string, id_double, id_long, id_boolean | head 1
- | """.stripMargin)
-
- assert(
- frame.dtypes.sameElements(
- Array(
- ("id", "IntegerType"),
- ("id_string", "StringType"),
- ("id_double", "DoubleType"),
- ("id_long", "LongType"),
- ("id_boolean", "BooleanType"))))
- assertSameRows(Seq(Row(1, "1", 1.0, 1L, true)), frame)
- }
-
- test("test cast string to compatible data types") {
- val frame = sql(s"""
- | source=$testTable | eval
- | id_int = cast(cast(id as string) as integer),
- | cast_true = cast("True" as boolean),
- | cast_false = cast("false" as boolean),
- | cast_timestamp = cast("2024-11-26 23:39:06" as timestamp),
- | cast_date = cast("2024-11-26" as date)
- | | fields id_int, cast_true, cast_false, cast_timestamp, cast_date | head 1
- | """.stripMargin)
-
- assert(
- frame.dtypes.sameElements(
- Array(
- ("id_int", "IntegerType"),
- ("cast_true", "BooleanType"),
- ("cast_false", "BooleanType"),
- ("cast_timestamp", "TimestampType"),
- ("cast_date", "DateType"))))
- assertSameRows(
- Seq(
- Row(
- 1,
- true,
- false,
- Timestamp.valueOf("2024-11-26 23:39:06"),
- Date.valueOf("2024-11-26"))),
- frame)
- }
-
- test("test cast time related types to compatible data types") {
- val frame = sql(s"""
- | source=$testTable | eval
- | timestamp = cast("2024-11-26 23:39:06" as timestamp),
- | ts_str = cast(timestamp as string),
- | ts_date = cast(timestamp as date),
- | date_str = cast(ts_date as string),
- | date_ts = cast(ts_date as timestamp)
- | | fields timestamp, ts_str, ts_date, date_str, date_ts | head 1
- | """.stripMargin)
-
- assert(
- frame.dtypes.sameElements(
- Array(
- ("timestamp", "TimestampType"),
- ("ts_str", "StringType"),
- ("ts_date", "DateType"),
- ("date_str", "StringType"),
- ("date_ts", "TimestampType"))))
- assertSameRows(
- Seq(
- Row(
- Timestamp.valueOf("2024-11-26 23:39:06"),
- "2024-11-26 23:39:06",
- Date.valueOf("2024-11-26"),
- "2024-11-26",
- Timestamp.valueOf("2024-11-26 00:00:00"))),
- frame)
- }
-
-}
diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFillnullITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFillnullITSuite.scala
index ca96c126f..4788aa23f 100644
--- a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFillnullITSuite.scala
+++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFillnullITSuite.scala
@@ -277,26 +277,6 @@ class FlintSparkPPLFillnullITSuite
assert(ex.getMessage().contains("Syntax error "))
}
- test("test fillnull with null_replacement type mismatch") {
- val frame = sql(s"""
- | source = $testTable | fillnull with cast(0 as long) in status_code
- | """.stripMargin)
-
- assert(frame.columns.sameElements(Array("id", "request_path", "timestamp", "status_code")))
- val results: Array[Row] = frame.collect()
- val expectedResults: Array[Row] =
- Array(
- Row(1, "/home", null, 200),
- Row(2, "/about", "2023-10-01 10:05:00", 0),
- Row(3, "/contact", "2023-10-01 10:10:00", 0),
- Row(4, null, "2023-10-01 10:15:00", 301),
- Row(5, null, "2023-10-01 10:20:00", 200),
- Row(6, "/home", null, 403))
- // Compare the results
- implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, Int](_.getAs[Int](0))
- assert(results.sorted.sameElements(expectedResults.sorted))
- }
-
private def fillNullExpectedPlan(
nullReplacements: Seq[(String, Expression)],
addDefaultProject: Boolean = true): LogicalPlan = {
diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFlattenITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFlattenITSuite.scala
index 7d1b6e437..e714a5f7e 100644
--- a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFlattenITSuite.scala
+++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFlattenITSuite.scala
@@ -9,7 +9,7 @@ import java.nio.file.Files
import org.opensearch.flint.spark.FlattenGenerator
import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq
-import org.apache.spark.sql.{AnalysisException, QueryTest, Row}
+import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar}
import org.apache.spark.sql.catalyst.expressions.{Alias, EqualTo, GeneratorOuter, Literal, Or}
import org.apache.spark.sql.catalyst.plans.logical._
@@ -347,85 +347,4 @@ class FlintSparkPPLFlattenITSuite
val expectedPlan = Project(Seq(UnresolvedStar(None)), flattenMultiValue)
comparePlans(logicalPlan, expectedPlan, checkAnalysis = false)
}
-
- test("flatten struct nested table using alias") {
- val frame = sql(s"""
- | source = $structNestedTable
- | | flatten struct_col
- | | flatten field1 as subfield_1
- | | flatten struct_col2 as (field1, field2_2)
- | | flatten field1 as subfield_2
- | """.stripMargin)
-
- assert(
- frame.columns.sameElements(
- Array("int_col", "field2", "subfield_1", "field2_2", "subfield_2")))
- val results: Array[Row] = frame.collect()
- implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, Int](_.getAs[Int](0))
- val expectedResults: Array[Row] =
- Array(
- Row(30, 123, "value1", 23, "valueA"),
- Row(40, 123, "value5", 33, "valueB"),
- Row(30, 823, "value4", 83, "valueC"),
- Row(40, 456, "value2", 46, "valueD"),
- Row(50, 789, "value3", 89, "valueE")).sorted
- // Compare the results
- assert(results.sorted.sameElements(expectedResults))
-
- // duplicate alias names
- val frame2 = sql(s"""
- | source = $structNestedTable
- | | flatten struct_col as (field1, field2_2)
- | | flatten field1 as subfield_1
- | | flatten struct_col2 as (field1, field2_2)
- | | flatten field1 as subfield_2
- | """.stripMargin)
-
- // alias names duplicate with existing fields
- assert(
- frame2.columns.sameElements(
- Array("int_col", "field2_2", "subfield_1", "field2_2", "subfield_2")))
- assert(frame2.collect().sorted.sameElements(expectedResults))
-
- val frame3 = sql(s"""
- | source = $structNestedTable
- | | flatten struct_col as (field1, field2_2)
- | | flatten field1 as int_col
- | | flatten struct_col2 as (field1, field2_2)
- | | flatten field1 as int_col
- | """.stripMargin)
-
- assert(
- frame3.columns.sameElements(Array("int_col", "field2_2", "int_col", "field2_2", "int_col")))
- assert(frame3.collect().sorted.sameElements(expectedResults))
-
- // Throw AnalysisException if The number of aliases supplied in the AS clause does not match the
- // number of columns output
- val except = intercept[AnalysisException] {
- sql(s"""
- | source = $structNestedTable
- | | flatten struct_col as (field1)
- | | flatten field1 as int_col
- | | flatten struct_col2 as (field1, field2_2)
- | | flatten field1 as int_col
- | """.stripMargin)
- }
- assert(except.message.contains(
- "The number of aliases supplied in the AS clause does not match the number of columns output by the UDTF"))
-
- // Throw AnalysisException because of ambiguous
- val except2 = intercept[AnalysisException] {
- sql(s"""
- | source = $structNestedTable
- | | flatten struct_col as (field1, field2_2)
- | | flatten field1 as int_col
- | | flatten struct_col2 as (field1, field2_2)
- | | flatten field1 as int_col
- | | fields field2_2
- | """.stripMargin)
- }
- assert(except2.message.contains(
- "[AMBIGUOUS_REFERENCE] Reference `field2_2` is ambiguous, could be: [`field2_2`, `field2_2`]."))
- }
-
}
diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4
index d15f5c8e3..f3c6acda9 100644
--- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4
+++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4
@@ -416,9 +416,6 @@ ISPRESENT: 'ISPRESENT';
BETWEEN: 'BETWEEN';
CIDRMATCH: 'CIDRMATCH';
-// Geo Loction
-GEOIP: 'GEOIP';
-
// FLOWCONTROL FUNCTIONS
IFNULL: 'IFNULL';
NULLIF: 'NULLIF';
@@ -428,6 +425,19 @@ TYPEOF: 'TYPEOF';
//OTHER CONDITIONAL EXPRESSIONS
COALESCE: 'COALESCE';
+//GEOLOCATION FUNCTIONS
+GEOIP: 'GEOIP';
+
+//GEOLOCATION PROPERTIES
+COUNTRY_ISO_CODE: 'COUNTRY_ISO_CODE';
+COUNTRY_NAME: 'COUNTRY_NAME';
+CONTINENT_NAME: 'CONTINENT_NAME';
+REGION_ISO_CODE: 'REGION_ISO_CODE';
+REGION_NAME: 'REGION_NAME';
+CITY_NAME: 'CITY_NAME';
+LAT: 'LAT';
+LON: 'LON';
+
// RELEVANCE FUNCTIONS AND PARAMETERS
MATCH: 'MATCH';
MATCH_PHRASE: 'MATCH_PHRASE';
diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4
index 2466a3d23..b15f59b4b 100644
--- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4
+++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4
@@ -45,6 +45,7 @@ commands
| headCommand
| topCommand
| rareCommand
+ | geoipCommand
| evalCommand
| grokCommand
| parseCommand
@@ -177,6 +178,10 @@ evalCommand
: EVAL evalClause (COMMA evalClause)*
;
+geoipCommand
+ : EVAL fieldExpression EQUAL GEOIP LT_PRTHS (datasource = functionArg COMMA)? ipAddress = functionArg (COMMA properties = geoIpPropertyList)? RT_PRTHS
+ ;
+
headCommand
: HEAD (number = integerLiteral)? (FROM from = integerLiteral)?
;
@@ -237,16 +242,21 @@ fillnullCommand
| fillNullWithFieldVariousValues)
;
-fillNullWithTheSameValue
- : WITH nullReplacement = valueExpression IN nullableFieldList = fieldList
- ;
+ fillNullWithTheSameValue
+ : WITH nullReplacement IN nullableField (COMMA nullableField)*
+ ;
+
+ fillNullWithFieldVariousValues
+ : USING nullableField EQUAL nullReplacement (COMMA nullableField EQUAL nullReplacement)*
+ ;
-fillNullWithFieldVariousValues
- : USING nullableReplacementExpression (COMMA nullableReplacementExpression)*
+
+ nullableField
+ : fieldExpression
;
-nullableReplacementExpression
- : nullableField = fieldExpression EQUAL nullableReplacement = valueExpression
+ nullReplacement
+ : expression
;
expandCommand
@@ -254,7 +264,7 @@ expandCommand
;
flattenCommand
- : FLATTEN fieldExpression (AS alias = identifierSeq)?
+ : FLATTEN fieldExpression
;
trendlineCommand
@@ -446,7 +456,6 @@ valueExpression
| positionFunction # positionFunctionCall
| caseFunction # caseExpr
| timestampFunction # timestampFunctionCall
- | geoipFunction # geoFunctionCall
| LT_PRTHS valueExpression RT_PRTHS # parentheticValueExpr
| LT_SQR_PRTHS subSearch RT_SQR_PRTHS # scalarSubqueryExpr
| ident ARROW expression # lambda
@@ -457,7 +466,6 @@ primaryExpression
: evalFunctionCall
| fieldExpression
| literalValue
- | dataTypeFunctionCall
;
positionFunction
@@ -544,11 +552,6 @@ dataTypeFunctionCall
: CAST LT_PRTHS expression AS convertedDataType RT_PRTHS
;
-// geoip function
-geoipFunction
- : GEOIP LT_PRTHS (datasource = functionArg COMMA)? ipAddress = functionArg (COMMA properties = stringLiteral)? RT_PRTHS
- ;
-
// boolean functions
booleanFunctionCall
: conditionFunctionBase LT_PRTHS functionArgs RT_PRTHS
@@ -582,7 +585,6 @@ evalFunctionName
| cryptographicFunctionName
| jsonFunctionName
| collectionFunctionName
- | geoipFunctionName
| lambdaFunctionName
;
@@ -913,6 +915,22 @@ coalesceFunctionName
: COALESCE
;
+geoIpPropertyList
+ : geoIpProperty (COMMA geoIpProperty)*
+ ;
+
+geoIpProperty
+ : COUNTRY_ISO_CODE
+ | COUNTRY_NAME
+ | CONTINENT_NAME
+ | REGION_ISO_CODE
+ | REGION_NAME
+ | CITY_NAME
+ | TIME_ZONE
+ | LAT
+ | LON
+ ;
+
// operators
comparisonOperator
: EQUAL
@@ -1039,11 +1057,6 @@ qualifiedName
: ident (DOT ident)* # identsAsQualifiedName
;
-identifierSeq
- : qualifiedName (COMMA qualifiedName)* # identsAsQualifiedNameSeq
- | LT_PRTHS qualifiedName (COMMA qualifiedName)* RT_PRTHS # identsAsQualifiedNameSeq
- ;
-
tableQualifiedName
: tableIdent (DOT ident)* # identsAsTableQualifiedName
;
@@ -1178,6 +1191,7 @@ keywordsCanBeId
| FULL
| SEMI
| ANTI
+ | GEOIP
| BETWEEN
| CIDRMATCH
| trendlineType
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
index dadf6b968..87e9f1ecb 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
@@ -13,14 +13,12 @@
import org.opensearch.sql.ast.expression.AttributeList;
import org.opensearch.sql.ast.expression.Between;
import org.opensearch.sql.ast.expression.Case;
-import org.opensearch.sql.ast.expression.Cast;
import org.opensearch.sql.ast.expression.Cidr;
import org.opensearch.sql.ast.expression.Compare;
import org.opensearch.sql.ast.expression.EqualTo;
import org.opensearch.sql.ast.expression.Field;
import org.opensearch.sql.ast.expression.FieldList;
import org.opensearch.sql.ast.expression.LambdaFunction;
-import org.opensearch.sql.ast.tree.FieldSummary;
import org.opensearch.sql.ast.expression.FieldsMapping;
import org.opensearch.sql.ast.expression.Function;
import org.opensearch.sql.ast.expression.In;
@@ -41,6 +39,7 @@
import org.opensearch.sql.ast.expression.When;
import org.opensearch.sql.ast.expression.WindowFunction;
import org.opensearch.sql.ast.expression.Xor;
+import org.opensearch.sql.ast.tree.FieldSummary;
import org.opensearch.sql.ast.statement.Explain;
import org.opensearch.sql.ast.statement.Query;
import org.opensearch.sql.ast.statement.Statement;
@@ -48,7 +47,11 @@
import org.opensearch.sql.ast.tree.Correlation;
import org.opensearch.sql.ast.tree.Dedupe;
import org.opensearch.sql.ast.tree.Eval;
+import org.opensearch.sql.ast.tree.Expand;
+import org.opensearch.sql.ast.tree.FillNull;
import org.opensearch.sql.ast.tree.Filter;
+import org.opensearch.sql.ast.tree.Flatten;
+import org.opensearch.sql.ast.tree.GeoIp;
import org.opensearch.sql.ast.tree.Head;
import org.opensearch.sql.ast.tree.Join;
import org.opensearch.sql.ast.tree.Kmeans;
@@ -62,8 +65,9 @@
import org.opensearch.sql.ast.tree.Sort;
import org.opensearch.sql.ast.tree.SubqueryAlias;
import org.opensearch.sql.ast.tree.TableFunction;
+import org.opensearch.sql.ast.tree.Trendline;
import org.opensearch.sql.ast.tree.Values;
-import org.opensearch.sql.ast.tree.*;
+import org.opensearch.sql.ast.tree.Window;
/** AST nodes visitor Defines the traverse path. */
public abstract class AbstractNodeVisitor {
@@ -189,10 +193,6 @@ public T visitFunction(Function node, C context) {
return visitChildren(node, context);
}
- public T visitCast(Cast node, C context) {
- return visitChildren(node, context);
- }
-
public T visitLambdaFunction(LambdaFunction node, C context) {
return visitChildren(node, context);
}
@@ -338,9 +338,14 @@ public T visitExistsSubquery(ExistsSubquery node, C context) {
return visitChildren(node, context);
}
+ public T visitGeoIp(GeoIp node, C context) {
+ return visitChildren(node, context);
+ }
+
public T visitWindow(Window node, C context) {
return visitChildren(node, context);
}
+
public T visitCidr(Cidr node, C context) {
return visitChildren(node, context);
}
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Alias.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Alias.java
index 226ff7a8c..7b3078629 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Alias.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Alias.java
@@ -13,23 +13,27 @@
import org.opensearch.sql.ast.AbstractNodeVisitor;
/**
- * Alias abstraction that associate an unnamed expression with a name.
- * The name information preserved is useful for semantic analysis and response formatting
+ * Alias abstraction that associate an unnamed expression with a name and an optional alias. The
+ * name and alias information preserved is useful for semantic analysis and response formatting
* eventually. This can avoid restoring the info in toString() method which is inaccurate because
* original info is already lost.
*/
+@AllArgsConstructor
@EqualsAndHashCode(callSuper = false)
@Getter
@RequiredArgsConstructor
@ToString
public class Alias extends UnresolvedExpression {
- /** The name to be associated with the result of computing delegated expression. */
+ /** Original field name. */
private final String name;
/** Expression aliased. */
private final UnresolvedExpression delegated;
+ /** Optional field alias. */
+ private String alias;
+
@Override
public T accept(AbstractNodeVisitor nodeVisitor, C context) {
return nodeVisitor.visitAlias(this, context);
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Cast.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Cast.java
deleted file mode 100644
index 0668fbf7b..000000000
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Cast.java
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright OpenSearch Contributors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-package org.opensearch.sql.ast.expression;
-
-import java.util.Collections;
-import java.util.List;
-import lombok.EqualsAndHashCode;
-import lombok.Getter;
-import lombok.RequiredArgsConstructor;
-import org.opensearch.sql.ast.AbstractNodeVisitor;
-
-/**
- * Expression node of cast
- */
-@Getter
-@EqualsAndHashCode(callSuper = false)
-@RequiredArgsConstructor
-public class Cast extends UnresolvedExpression {
- private final UnresolvedExpression expression;
- private final DataType dataType;
-
- @Override
- public List getChild() {
- return Collections.singletonList(expression);
- }
-
- @Override
- public R accept(AbstractNodeVisitor nodeVisitor, C context) {
- return nodeVisitor.visitCast(this, context);
- }
-
- @Override
- public String toString() {
- return String.format("CAST(%s AS %s)", expression, dataType);
- }
-}
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/DataType.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/DataType.java
index 6f0de02f5..9843158b4 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/DataType.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/DataType.java
@@ -30,8 +30,4 @@ public enum DataType {
INTERVAL(ExprCoreType.INTERVAL);
@Getter private final ExprCoreType coreType;
-
- public static DataType fromString(String name) {
- return valueOf(name.toUpperCase());
- }
}
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Flatten.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Flatten.java
index 36c126591..9c57d2adf 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Flatten.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Flatten.java
@@ -7,7 +7,6 @@
import org.opensearch.sql.ast.expression.Field;
import java.util.List;
-import org.opensearch.sql.ast.expression.UnresolvedExpression;
@RequiredArgsConstructor
public class Flatten extends UnresolvedPlan {
@@ -16,8 +15,6 @@ public class Flatten extends UnresolvedPlan {
@Getter
private final Field field;
- @Getter
- private final List aliasSequence;
@Override
public UnresolvedPlan attach(UnresolvedPlan child) {
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java
new file mode 100644
index 000000000..8861694d9
--- /dev/null
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java
@@ -0,0 +1,40 @@
+package org.opensearch.sql.ast.tree;
+
+import com.google.common.collect.ImmutableList;
+import lombok.EqualsAndHashCode;
+import lombok.Getter;
+import lombok.RequiredArgsConstructor;
+import lombok.ToString;
+import org.opensearch.sql.ast.AbstractNodeVisitor;
+import org.opensearch.sql.ast.Node;
+import org.opensearch.sql.ast.expression.UnresolvedExpression;
+
+import java.util.Arrays;
+import java.util.List;
+
+@ToString
+@Getter
+@RequiredArgsConstructor
+@EqualsAndHashCode(callSuper = false)
+public class GeoIp extends UnresolvedPlan {
+ private UnresolvedPlan child;
+ private final UnresolvedExpression datasource;
+ private final UnresolvedExpression ipAddress;
+ private final UnresolvedExpression properties;
+
+ @Override
+ public List extends Node> getChild() {
+ return ImmutableList.of(child);
+ }
+
+ @Override
+ public T accept(AbstractNodeVisitor nodeVisitor, C context) {
+ return nodeVisitor.visitGeoIp(this, context);
+ }
+
+ @Override
+ public UnresolvedPlan attach(UnresolvedPlan child) {
+ this.child = child;
+ return this;
+ }
+}
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java
index 2541b3743..619f558c1 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java
@@ -8,11 +8,13 @@
import inet.ipaddr.AddressStringException;
import inet.ipaddr.IPAddressString;
import inet.ipaddr.IPAddressStringParameters;
+
+import scala.Function1;
import scala.Function2;
import scala.Serializable;
+import scala.runtime.AbstractFunction1;
import scala.runtime.AbstractFunction2;
-
public interface SerializableUdf {
Function2 cidrFunction = new SerializableAbstractFunction2<>() {
@@ -48,8 +50,57 @@ public Boolean apply(String ipAddress, String cidrBlock) {
}
return parsedCidrBlock.contains(parsedIpAddress);
- }
- };
+ }};
+
+ Function1 isIpv4 = new SerializableAbstractFunction1<>() {
+
+ IPAddressStringParameters valOptions = new IPAddressStringParameters.Builder()
+ .allowEmpty(false)
+ .setEmptyAsLoopback(false)
+ .allow_inet_aton(false)
+ .allowSingleSegment(false)
+ .toParams();
+
+ @Override
+ public Boolean apply(String ipAddress) {
+
+ IPAddressString parsedIpAddress = new IPAddressString(ipAddress, valOptions);
+
+ try {
+ parsedIpAddress.validate();
+ } catch (AddressStringException e) {
+ throw new RuntimeException("The given ipAddress '"+ipAddress+"' is invalid. It must be a valid IPv4 or IPv6 address. Error details: "+e.getMessage());
+ }
+
+ return parsedIpAddress.isIPv4();
+ }};
+
+ Function1 ipToInt = new SerializableAbstractFunction1<>() {
+
+ IPAddressStringParameters valOptions = new IPAddressStringParameters.Builder()
+ .allowEmpty(false)
+ .setEmptyAsLoopback(false)
+ .allow_inet_aton(false)
+ .allowSingleSegment(false)
+ .toParams();
+
+ @Override
+ public Boolean apply(String ipAddress) {
+
+ IPAddressString parsedIpAddress = new IPAddressString(ipAddress, valOptions);
+
+ try {
+ parsedIpAddress.validate();
+ } catch (AddressStringException e) {
+ throw new RuntimeException("The given ipAddress '"+ipAddress+"' is invalid. It must be a valid IPv4 or IPv6 address. Error details: "+e.getMessage());
+ }
+
+ return parsedIpAddress.isIPv4();
+ }};
+
+ abstract class SerializableAbstractFunction1 extends AbstractFunction1
+ implements Serializable {
+ }
abstract class SerializableAbstractFunction2 extends AbstractFunction2
implements Serializable {
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystExpressionVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystExpressionVisitor.java
index bc14ba9d4..a651f83e9 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystExpressionVisitor.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystExpressionVisitor.java
@@ -8,58 +8,31 @@
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute;
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute$;
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation;
+import org.apache.spark.sql.catalyst.analysis.UnresolvedStar;
import org.apache.spark.sql.catalyst.analysis.UnresolvedStar$;
import org.apache.spark.sql.catalyst.expressions.CaseWhen;
-import org.apache.spark.sql.catalyst.expressions.Cast$;
-import org.apache.spark.sql.catalyst.expressions.CurrentRow$;
import org.apache.spark.sql.catalyst.expressions.Exists$;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual;
import org.apache.spark.sql.catalyst.expressions.In$;
import org.apache.spark.sql.catalyst.expressions.InSubquery$;
import org.apache.spark.sql.catalyst.expressions.LambdaFunction$;
-import org.apache.spark.sql.catalyst.expressions.LessThan;
import org.apache.spark.sql.catalyst.expressions.LessThanOrEqual;
import org.apache.spark.sql.catalyst.expressions.ListQuery$;
import org.apache.spark.sql.catalyst.expressions.MakeInterval$;
import org.apache.spark.sql.catalyst.expressions.NamedExpression;
import org.apache.spark.sql.catalyst.expressions.Predicate;
-import org.apache.spark.sql.catalyst.expressions.RowFrame$;
import org.apache.spark.sql.catalyst.expressions.ScalaUDF;
import org.apache.spark.sql.catalyst.expressions.ScalarSubquery$;
import org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable;
import org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable$;
-import org.apache.spark.sql.catalyst.expressions.SpecifiedWindowFrame;
-import org.apache.spark.sql.catalyst.expressions.WindowExpression;
-import org.apache.spark.sql.catalyst.expressions.WindowSpecDefinition;
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;
import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.StructField;
+import org.apache.spark.unsafe.types.UTF8String;
+
import org.opensearch.sql.ast.AbstractNodeVisitor;
-import org.opensearch.sql.ast.expression.AggregateFunction;
-import org.opensearch.sql.ast.expression.Alias;
-import org.opensearch.sql.ast.expression.AllFields;
-import org.opensearch.sql.ast.expression.And;
-import org.opensearch.sql.ast.expression.Between;
-import org.opensearch.sql.ast.expression.BinaryExpression;
-import org.opensearch.sql.ast.expression.Case;
-import org.opensearch.sql.ast.expression.Cast;
-import org.opensearch.sql.ast.expression.Compare;
-import org.opensearch.sql.ast.expression.DataType;
-import org.opensearch.sql.ast.expression.FieldsMapping;
-import org.opensearch.sql.ast.expression.Function;
-import org.opensearch.sql.ast.expression.In;
-import org.opensearch.sql.ast.expression.Interval;
-import org.opensearch.sql.ast.expression.IsEmpty;
-import org.opensearch.sql.ast.expression.Literal;
-import org.opensearch.sql.ast.expression.Not;
-import org.opensearch.sql.ast.expression.Or;
-import org.opensearch.sql.ast.expression.LambdaFunction;
-import org.opensearch.sql.ast.expression.QualifiedName;
-import org.opensearch.sql.ast.expression.Span;
-import org.opensearch.sql.ast.expression.UnresolvedExpression;
-import org.opensearch.sql.ast.expression.When;
-import org.opensearch.sql.ast.expression.WindowFunction;
-import org.opensearch.sql.ast.expression.Xor;
+import org.opensearch.sql.ast.expression.*;
import org.opensearch.sql.ast.expression.subquery.ExistsSubquery;
import org.opensearch.sql.ast.expression.subquery.InSubquery;
import org.opensearch.sql.ast.expression.subquery.ScalarSubquery;
@@ -68,9 +41,7 @@
import org.opensearch.sql.ast.tree.FillNull;
import org.opensearch.sql.ast.tree.Kmeans;
import org.opensearch.sql.ast.tree.RareTopN;
-import org.opensearch.sql.ast.tree.Trendline;
import org.opensearch.sql.ast.tree.UnresolvedPlan;
-import org.opensearch.sql.expression.function.BuiltinFunctionName;
import org.opensearch.sql.expression.function.SerializableUdf;
import org.opensearch.sql.ppl.utils.AggregatorTransformer;
import org.opensearch.sql.ppl.utils.BuiltinFunctionTransformer;
@@ -83,6 +54,7 @@
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import java.util.Optional;
import java.util.Stack;
import java.util.function.BiFunction;
@@ -144,7 +116,6 @@ public Expression visitBinaryArithmetic(BinaryExpression node, BiFunction()),
Option.empty(),
@@ -468,16 +439,6 @@ public Expression visitLambdaFunction(LambdaFunction node, CatalystPlanContext c
return context.getNamedParseExpressions().push(LambdaFunction$.MODULE$.apply(functionResult, seq(argsResult), false));
}
- @Override
- public Expression visitCast(Cast node, CatalystPlanContext context) {
- analyze(node.getExpression(), context);
- Optional ret = context.popNamedParseExpressions();
- if (ret.isEmpty()) {
- throw new UnsupportedOperationException(
- String.format("Invalid use of expression %s", node.getExpression()));
- }
- return context.getNamedParseExpressions().push(Cast$.MODULE$.apply(ret.get(), translate(node.getDataType()), false));
- }
private List visitExpressionList(List expressionList, CatalystPlanContext context) {
return expressionList.isEmpty()
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java
index d7f59bae3..3349715cb 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java
@@ -5,16 +5,28 @@
package org.opensearch.sql.ppl;
+import org.apache.spark.sql.catalyst.AliasIdentifier;
import org.apache.spark.sql.catalyst.TableIdentifier;
+import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute$;
import org.apache.spark.sql.catalyst.analysis.UnresolvedFunction;
import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation;
+import org.apache.spark.sql.catalyst.analysis.UnresolvedStar;
import org.apache.spark.sql.catalyst.analysis.UnresolvedStar$;
+import org.apache.spark.sql.catalyst.expressions.And;
import org.apache.spark.sql.catalyst.expressions.Ascending$;
+import org.apache.spark.sql.catalyst.expressions.AttributeReference;
import org.apache.spark.sql.catalyst.expressions.Descending$;
+import org.apache.spark.sql.catalyst.expressions.EqualTo;
import org.apache.spark.sql.catalyst.expressions.Explode;
+import org.apache.spark.sql.catalyst.expressions.ExprId;
import org.apache.spark.sql.catalyst.expressions.Expression;
import org.apache.spark.sql.catalyst.expressions.GeneratorOuter;
+import org.apache.spark.sql.catalyst.expressions.GreaterThan;
+import org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual;
+import org.apache.spark.sql.catalyst.expressions.LessThan;
+import org.apache.spark.sql.catalyst.expressions.LessThanOrEqual;
import org.apache.spark.sql.catalyst.expressions.NamedExpression;
+import org.apache.spark.sql.catalyst.expressions.ScalaUDF;
import org.apache.spark.sql.catalyst.expressions.SortDirection;
import org.apache.spark.sql.catalyst.expressions.SortOrder;
import org.apache.spark.sql.catalyst.plans.logical.Aggregate;
@@ -23,12 +35,14 @@
import org.apache.spark.sql.catalyst.plans.logical.Generate;
import org.apache.spark.sql.catalyst.plans.logical.Limit;
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;
-import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$;
import org.apache.spark.sql.catalyst.plans.logical.Project$;
+import org.apache.spark.sql.catalyst.plans.logical.Union;
import org.apache.spark.sql.execution.ExplainMode;
import org.apache.spark.sql.execution.command.DescribeTableCommand;
import org.apache.spark.sql.execution.command.ExplainCommand;
import org.apache.spark.sql.types.DataTypes;
+import org.apache.spark.sql.types.Metadata;
+import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;
import org.opensearch.flint.spark.FlattenGenerator;
import org.opensearch.sql.ast.AbstractNodeVisitor;
@@ -37,6 +51,7 @@
import org.opensearch.sql.ast.expression.Argument;
import org.opensearch.sql.ast.expression.Field;
import org.opensearch.sql.ast.expression.Function;
+import org.opensearch.sql.ast.tree.GeoIp;
import org.opensearch.sql.ast.expression.In;
import org.opensearch.sql.ast.expression.Let;
import org.opensearch.sql.ast.expression.Literal;
@@ -71,6 +86,7 @@
import org.opensearch.sql.ast.tree.Trendline;
import org.opensearch.sql.ast.tree.Window;
import org.opensearch.sql.common.antlr.SyntaxCheckException;
+import org.opensearch.sql.expression.function.SerializableUdf;
import org.opensearch.sql.ppl.utils.FieldSummaryTransformer;
import org.opensearch.sql.ppl.utils.ParseTransformer;
import org.opensearch.sql.ppl.utils.SortUtils;
@@ -83,12 +99,14 @@
import java.util.ArrayList;
import java.util.List;
+import java.util.Locale;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import static java.util.Collections.emptyList;
import static java.util.List.of;
+import static org.opensearch.sql.ppl.CatalystPlanContext.findRelation;
import static org.opensearch.sql.ppl.utils.DataTypeTransformer.seq;
import static org.opensearch.sql.ppl.utils.DedupeTransformer.retainMultipleDuplicateEvents;
import static org.opensearch.sql.ppl.utils.DedupeTransformer.retainMultipleDuplicateEventsAndKeepEmpty;
@@ -292,7 +310,6 @@ public LogicalPlan visitSubqueryAlias(SubqueryAlias node, CatalystPlanContext co
context.withSubqueryAlias(alias);
return alias;
});
-
}
@Override
@@ -453,30 +470,10 @@ public LogicalPlan visitFillNull(FillNull fillNull, CatalystPlanContext context)
Seq projectExpressions = context.retainAllNamedParseExpressions(p -> (NamedExpression) p);
// build the plan with the projection step
context.apply(p -> new org.apache.spark.sql.catalyst.plans.logical.Project(projectExpressions, p));
- LogicalPlan resultWithoutDuplicatedColumns = context.apply(dropOriginalColumns(p -> p.children().head(), toDrop));
+ LogicalPlan resultWithoutDuplicatedColumns = context.apply(logicalPlan -> DataFrameDropColumns$.MODULE$.apply(seq(toDrop), logicalPlan));
return Objects.requireNonNull(resultWithoutDuplicatedColumns, "FillNull operation failed");
}
- /**
- * This method is used to generate DataFrameDropColumns operator for dropping duplicated columns
- * in the original plan. Then achieving similar effect like updating columns.
- *
- * PLAN_ID_TAG is a mechanism inner Spark that explicitly specify a plan to resolve the
- * UnresolvedAttributes. Set toDrop expressions' PLAN_ID_TAG to the same value as that of the
- * original plan, so Spark will resolve them correctly by that plan instead of the child.
- */
- private java.util.function.Function dropOriginalColumns(
- java.util.function.Function findOriginalPlan,
- List toDrop) {
- return logicalPlan -> {
- LogicalPlan originalPlan = findOriginalPlan.apply(logicalPlan);
- long planId = logicalPlan.hashCode();
- originalPlan.setTagValue(LogicalPlan$.MODULE$.PLAN_ID_TAG(), planId);
- toDrop.forEach(e -> e.setTagValue(LogicalPlan$.MODULE$.PLAN_ID_TAG(), planId));
- return DataFrameDropColumns$.MODULE$.apply(seq(toDrop), logicalPlan);
- };
- }
-
@Override
public LogicalPlan visitFlatten(Flatten flatten, CatalystPlanContext context) {
visitFirstChild(flatten, context);
@@ -485,13 +482,9 @@ public LogicalPlan visitFlatten(Flatten flatten, CatalystPlanContext context) {
context.getNamedParseExpressions().push(UnresolvedStar$.MODULE$.apply(Option.>empty()));
}
Expression field = visitExpression(flatten.getField(), context);
- List alias = flatten.getAliasSequence().stream()
- .map(aliasNode -> visitExpression(aliasNode, context))
- .collect(Collectors.toList());
context.retainAllNamedParseExpressions(p -> (NamedExpression) p);
FlattenGenerator flattenGenerator = new FlattenGenerator(field);
- scala.collection.mutable.Seq outputs = alias.isEmpty() ? seq() : seq(alias);
- context.apply(p -> new Generate(new GeneratorOuter(flattenGenerator), seq(), true, (Option) None$.MODULE$, outputs, p));
+ context.apply(p -> new Generate(new GeneratorOuter(flattenGenerator), seq(), true, (Option) None$.MODULE$, seq(), p));
return context.apply(logicalPlan -> DataFrameDropColumns$.MODULE$.apply(seq(field), logicalPlan));
}
@@ -577,6 +570,103 @@ public LogicalPlan visitEval(Eval node, CatalystPlanContext context) {
return context.apply(p -> new org.apache.spark.sql.catalyst.plans.logical.Project(projectExpressions, p));
}
+ @Override
+ public LogicalPlan visitGeoIp(GeoIp node, CatalystPlanContext context) {
+
+ visitFirstChild(node, context);
+// expressionAnalyzer.analyze(node.getDatasource(), context);
+// Expression datasourceExpression = context.getNamedParseExpressions().pop();
+ Expression ipAddressExpression = visitExpression(node.getIpAddress(), context);
+// expressionAnalyzer.analyze(node.getProperties(), context);
+
+// List attributeList = new ArrayList<>();
+// Expression nextExpression = context.getNamedParseExpressions().peek();
+// while (nextExpression != null && !(nextExpression instanceof UnresolvedStar)) {
+// String attributeName = nextExpression.toString();
+//
+// if (attributeList.contains(attributeName)) {
+// throw new IllegalStateException("Duplicate attribute in GEOIP attribute list");
+// }
+//
+// attributeList.add(0, attributeName);
+// context.getNamedParseExpressions().pop();
+// nextExpression = context.getNamedParseExpressions().peek();
+// }
+
+ ScalaUDF ipInt = new ScalaUDF(SerializableUdf.ipToInt,
+ DataTypes.BooleanType,
+ seq(ipAddressExpression),
+ seq(),
+ Option.empty(),
+ Option.apply("ip_to_int"),
+ false,
+ true);
+
+ ScalaUDF isIpv4 = new ScalaUDF(SerializableUdf.isIpv4,
+ DataTypes.BooleanType,
+ seq(ipAddressExpression),
+ seq(),
+ Option.empty(),
+ Option.apply("is_ipv4"),
+ false,
+ true);
+
+ LogicalPlan plan = context.apply(left -> {
+ LogicalPlan right = new UnresolvedRelation(seq("geoip"), CaseInsensitiveStringMap.empty(), false);
+ Optional joinCondition = Optional.of(new And(
+ new And(
+ new GreaterThanOrEqual(
+ ipInt,
+ UnresolvedAttribute$.MODULE$.apply(seq("ip_range_start"))
+ ),
+ new LessThan(
+ ipInt,
+ UnresolvedAttribute$.MODULE$.apply(seq("ip_range_end"))
+ )
+ ),
+ new EqualTo(
+ isIpv4,
+ UnresolvedAttribute$.MODULE$.apply(seq("ip_type"))
+ )
+ ));
+ context.retainAllNamedParseExpressions(p -> p);
+ context.retainAllPlans(p -> p);
+ return join(left,
+ right,
+ Join.JoinType.INNER,
+ joinCondition,
+ new Join.JoinHint());
+ });
+
+ System.out.println("Wow I like Pancakes");
+ System.out.println(plan);
+
+ return plan;
+ }
+
+ private StructField[] createGeoIpStructFields(List attributeList) {
+ List attributeListToUse;
+ if (attributeList == null || attributeList.isEmpty()) {
+ attributeListToUse = List.of(
+ "country_iso_code",
+ "country_name",
+ "continent_name",
+ "region_iso_code",
+ "region_name",
+ "city_name",
+ "time_zone",
+ "lat",
+ "lon"
+ );
+ } else {
+ attributeListToUse = attributeList;
+ }
+
+ return attributeListToUse.stream()
+ .map(a -> DataTypes.createStructField(a.toLowerCase(Locale.ROOT), DataTypes.StringType, true))
+ .toArray(StructField[]::new);
+ }
+
@Override
public LogicalPlan visitKmeans(Kmeans node, CatalystPlanContext context) {
throw new IllegalStateException("Not Supported operation : Kmeans");
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java
index d4f9ece87..d2242d9b3 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java
@@ -340,6 +340,18 @@ public UnresolvedPlan visitEvalCommand(OpenSearchPPLParser.EvalCommandContext ct
.collect(Collectors.toList()));
}
+ @Override
+ public UnresolvedPlan visitGeoipCommand(OpenSearchPPLParser.GeoipCommandContext ctx) {
+ UnresolvedExpression datasource =
+ (ctx.datasource != null) ?
+ internalVisitExpression(ctx.datasource) :
+ // TODO Make default value var
+ new Literal("https://geoip.maps.opensearch.org/v1/geolite2-city/manifest.json", DataType.STRING);
+ UnresolvedExpression ipAddress = internalVisitExpression(ctx.ipAddress);
+ UnresolvedExpression properties = ctx.properties == null ? new AttributeList(Collections.emptyList()) : internalVisitExpression(ctx.properties);
+ return new GeoIp(datasource, ipAddress, properties);
+ }
+
private List getGroupByList(OpenSearchPPLParser.ByClauseContext ctx) {
return ctx.fieldList().fieldExpression().stream()
.map(this::internalVisitExpression)
@@ -581,18 +593,19 @@ public UnresolvedPlan visitFillnullCommand(OpenSearchPPLParser.FillnullCommandCo
FillNullWithFieldVariousValuesContext variousValuesContext = ctx.fillNullWithFieldVariousValues();
if (sameValueContext != null) {
// todo consider using expression instead of Literal
- UnresolvedExpression replaceNullWithMe = internalVisitExpression(sameValueContext.nullReplacement);
- List fieldsToReplace = sameValueContext.nullableFieldList.fieldExpression()
+ UnresolvedExpression replaceNullWithMe = internalVisitExpression(sameValueContext.nullReplacement().expression());
+ List fieldsToReplace = sameValueContext.nullableField()
.stream()
.map(this::internalVisitExpression)
.map(Field.class::cast)
.collect(Collectors.toList());
return new FillNull(ofSameValue(replaceNullWithMe, fieldsToReplace));
} else if (variousValuesContext != null) {
- List nullableFieldFills = IntStream.range(0, variousValuesContext.nullableReplacementExpression().size())
+ List nullableFieldFills = IntStream.range(0, variousValuesContext.nullableField().size())
.mapToObj(index -> {
- UnresolvedExpression replaceNullWithMe = internalVisitExpression(variousValuesContext.nullableReplacementExpression(index).nullableReplacement);
- Field nullableFieldReference = (Field) internalVisitExpression(variousValuesContext.nullableReplacementExpression(index).nullableField);
+ variousValuesContext.nullableField(index);
+ UnresolvedExpression replaceNullWithMe = internalVisitExpression(variousValuesContext.nullReplacement(index).expression());
+ Field nullableFieldReference = (Field) internalVisitExpression(variousValuesContext.nullableField(index));
return new NullableFieldFill(nullableFieldReference, replaceNullWithMe);
})
.collect(Collectors.toList());
@@ -605,8 +618,7 @@ public UnresolvedPlan visitFillnullCommand(OpenSearchPPLParser.FillnullCommandCo
@Override
public UnresolvedPlan visitFlattenCommand(OpenSearchPPLParser.FlattenCommandContext ctx) {
Field unresolvedExpression = (Field) internalVisitExpression(ctx.fieldExpression());
- List alias = ctx.alias == null ? emptyList() : ((AttributeList) internalVisitExpression(ctx.alias)).getAttrList();
- return new Flatten(unresolvedExpression, alias);
+ return new Flatten(unresolvedExpression);
}
/** AD command. */
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java
index 1fe57d13e..e9e4c7cbe 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java
@@ -19,13 +19,13 @@
import org.opensearch.sql.ast.expression.AttributeList;
import org.opensearch.sql.ast.expression.Between;
import org.opensearch.sql.ast.expression.Case;
-import org.opensearch.sql.ast.expression.Cast;
import org.opensearch.sql.ast.expression.Cidr;
import org.opensearch.sql.ast.expression.Compare;
import org.opensearch.sql.ast.expression.DataType;
import org.opensearch.sql.ast.expression.EqualTo;
import org.opensearch.sql.ast.expression.Field;
import org.opensearch.sql.ast.expression.Function;
+import org.opensearch.sql.ast.tree.GeoIp;
import org.opensearch.sql.ast.expression.In;
import org.opensearch.sql.ast.expression.Interval;
import org.opensearch.sql.ast.expression.IntervalUnit;
@@ -45,8 +45,6 @@
import org.opensearch.sql.ast.expression.subquery.ExistsSubquery;
import org.opensearch.sql.ast.expression.subquery.InSubquery;
import org.opensearch.sql.ast.expression.subquery.ScalarSubquery;
-import org.opensearch.sql.ast.tree.Trendline;
-import org.opensearch.sql.common.antlr.SyntaxCheckException;
import org.opensearch.sql.common.utils.StringUtils;
import org.opensearch.sql.ppl.utils.ArgumentFactory;
@@ -280,9 +278,9 @@ public UnresolvedExpression visitEvalFunctionCall(OpenSearchPPLParser.EvalFuncti
return buildFunction(ctx.evalFunctionName().getText(), ctx.functionArgs().functionArg());
}
- @Override public UnresolvedExpression visitDataTypeFunctionCall(OpenSearchPPLParser.DataTypeFunctionCallContext ctx) {
- // TODO: for long term consideration, needs to implement DataTypeBuilder/Visitor to parse all data types
- return new Cast(this.visit(ctx.expression()), DataType.fromString(ctx.convertedDataType().getText()));
+ @Override
+ public UnresolvedExpression visitConvertedDataType(OpenSearchPPLParser.ConvertedDataTypeContext ctx) {
+ return new Literal(ctx.getText(), DataType.STRING);
}
@Override
@@ -330,11 +328,6 @@ public UnresolvedExpression visitIdentsAsQualifiedName(OpenSearchPPLParser.Ident
return visitIdentifiers(ctx.ident());
}
- @Override
- public UnresolvedExpression visitIdentsAsQualifiedNameSeq(OpenSearchPPLParser.IdentsAsQualifiedNameSeqContext ctx) {
- return new AttributeList(ctx.qualifiedName().stream().map(this::visit).collect(Collectors.toList()));
- }
-
@Override
public UnresolvedExpression visitIdentsAsTableQualifiedName(
OpenSearchPPLParser.IdentsAsTableQualifiedNameContext ctx) {
@@ -383,7 +376,8 @@ public UnresolvedExpression visitBooleanLiteral(OpenSearchPPLParser.BooleanLiter
public UnresolvedExpression visitBySpanClause(OpenSearchPPLParser.BySpanClauseContext ctx) {
String name = ctx.spanClause().getText();
return ctx.alias != null
- ? new Alias(StringUtils.unquoteIdentifier(ctx.alias.getText()), visit(ctx.spanClause()))
+ ? new Alias(
+ name, visit(ctx.spanClause()), StringUtils.unquoteIdentifier(ctx.alias.getText()))
: new Alias(name, visit(ctx.spanClause()));
}
@@ -450,6 +444,39 @@ public UnresolvedExpression visitLambda(OpenSearchPPLParser.LambdaContext ctx) {
return new LambdaFunction(function, arguments);
}
+ @Override
+ public UnresolvedExpression visitGeoIpPropertyList(OpenSearchPPLParser.GeoIpPropertyListContext ctx) {
+ ImmutableList.Builder properties = ImmutableList.builder();
+ if (ctx != null) {
+ for (OpenSearchPPLParser.GeoIpPropertyContext property : ctx.geoIpProperty()) {
+ String propertyName;
+ if (property.COUNTRY_ISO_CODE() != null) {
+ propertyName = "COUNTRY_ISO_CODE";
+ } else if (property.COUNTRY_NAME() != null) {
+ propertyName = "COUNTRY_NAME";
+ } else if (property.CONTINENT_NAME() != null) {
+ propertyName = "CONTINENT_NAME";
+ } else if (property.REGION_ISO_CODE() != null) {
+ propertyName = "REGION_ISO_CODE";
+ } else if (property.CITY_NAME() != null) {
+ propertyName = "CITY_NAME";
+ } else if (property.TIME_ZONE() != null) {
+ propertyName = "TIME_ZONE";
+ } else if (property.LAT() != null) {
+ propertyName = "LAT";
+ } else if (property.LON() != null) {
+ propertyName = "LON";
+ } else {
+ continue;
+ }
+
+ properties.add(new Literal(propertyName, DataType.STRING));
+ }
+ }
+
+ return new AttributeList(properties.build());
+ }
+
private List timestampFunctionArguments(
OpenSearchPPLParser.TimestampFunctionCallContext ctx) {
List args =
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java
index f583d7847..e4defad52 100644
--- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java
@@ -9,7 +9,6 @@
import org.apache.spark.sql.types.BooleanType$;
import org.apache.spark.sql.types.ByteType$;
import org.apache.spark.sql.types.DataType;
-import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.DateType$;
import org.apache.spark.sql.types.DoubleType$;
import org.apache.spark.sql.types.FloatType$;
@@ -50,12 +49,8 @@ static Seq seq(List list) {
static DataType translate(org.opensearch.sql.ast.expression.DataType source) {
switch (source.getCoreType()) {
- case DATE:
+ case TIME:
return DateType$.MODULE$;
- case TIMESTAMP:
- return DataTypes.TimestampType;
- case STRING:
- return DataTypes.StringType;
case INTEGER:
return IntegerType$.MODULE$;
case LONG:
@@ -73,7 +68,7 @@ static DataType translate(org.opensearch.sql.ast.expression.DataType source) {
case UNDEFINED:
return NullType$.MODULE$;
default:
- throw new IllegalArgumentException("Unsupported data type for Spark: " + source);
+ return StringType$.MODULE$;
}
}
@@ -125,4 +120,4 @@ static String translate(SpanUnit unit) {
}
return "";
}
-}
+}
\ No newline at end of file
diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoipCatalystUtils.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoipCatalystUtils.java
new file mode 100644
index 000000000..a35114140
--- /dev/null
+++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoipCatalystUtils.java
@@ -0,0 +1,4 @@
+package org.opensearch.sql.ppl.utils;
+
+public interface GeoipCatalystUtils {
+}
diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCastTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCastTestSuite.scala
deleted file mode 100644
index 829b7ff1f..000000000
--- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCastTestSuite.scala
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright OpenSearch Contributors
- * SPDX-License-Identifier: Apache-2.0
- */
-
-package org.opensearch.flint.spark.ppl
-
-import org.opensearch.flint.spark.ppl.PlaneUtils.plan
-import org.opensearch.sql.common.antlr.SyntaxCheckException
-import org.opensearch.sql.ppl.{CatalystPlanContext, CatalystQueryPlanVisitor}
-import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq
-import org.scalatest.matchers.should.Matchers
-
-import org.apache.spark.SparkFunSuite
-import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation, UnresolvedStar}
-import org.apache.spark.sql.catalyst.expressions.{Alias, Cast, Literal}
-import org.apache.spark.sql.catalyst.plans.PlanTest
-import org.apache.spark.sql.catalyst.plans.logical.Project
-import org.apache.spark.sql.types.{IntegerType, StringType}
-
-class PPLLogicalPlanCastTestSuite
- extends SparkFunSuite
- with PlanTest
- with LogicalPlanTestUtils
- with Matchers {
-
- private val planTransformer = new CatalystQueryPlanVisitor()
- private val pplParser = new PPLSyntaxParser()
-
- test("test cast with case sensitive") {
- val table = UnresolvedRelation(Seq("t"))
- val expectedPlan = Project(
- seq(UnresolvedStar(None)),
- Project(
- seq(UnresolvedStar(None), Alias(Cast(UnresolvedAttribute("a"), StringType), "a")()),
- table))
-
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(plan(pplParser, """source=t | eval a = cast(a as STRING)"""), context)
- comparePlans(expectedPlan, logPlan, false)
-
- // test case insensitive
- val context2 = new CatalystPlanContext
- val logPlan2 =
- planTransformer.visit(
- plan(pplParser, """source=t | eval a = cast(a as string)"""),
- context2)
- comparePlans(expectedPlan, logPlan2, false)
- }
-
- test("test cast literal") {
- val table = UnresolvedRelation(Seq("t"))
- val expectedPlan = Project(
- seq(UnresolvedStar(None)),
- Project(
- seq(
- UnresolvedStar(None),
- Alias(Cast(Cast(Literal("a"), IntegerType), StringType), "a")()),
- table))
-
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(
- plan(pplParser, """source=t | eval a = cast(cast("a" as INTEGER) as STRING)"""),
- context)
- comparePlans(expectedPlan, logPlan, false)
- }
-
- test("test chained cast") {
- val table = UnresolvedRelation(Seq("t"))
- val expectedPlan = Project(
- seq(UnresolvedStar(None)),
- Project(
- seq(
- UnresolvedStar(None),
- Alias(Cast(Cast(UnresolvedAttribute("a"), IntegerType), StringType), "a")()),
- table))
-
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(
- plan(pplParser, """source=t | eval a = cast(cast(a as INTEGER) as STRING)"""),
- context)
- comparePlans(expectedPlan, logPlan, false)
- }
-
- test("test cast with unsupported dataType") {
- // Unsupported data type for opensearch parser
- val context = new CatalystPlanContext
- val exception = intercept[SyntaxCheckException] {
- planTransformer.visit(
- plan(pplParser, """source=t | eval a = cast(a as UNSUPPORTED_DATATYPE)"""),
- context)
- }
- assert(
- exception.getMessage.contains(
- "Failed to parse query due to offending symbol [UNSUPPORTED_DATATYPE]"))
-
- // Unsupported data type for Spark
- val context2 = new CatalystPlanContext
- val exception2 = intercept[IllegalArgumentException] {
- planTransformer.visit(plan(pplParser, """source=t | eval a = cast(a as time)"""), context2)
- }
- assert(exception2.getMessage == "Unsupported data type for Spark: TIME")
- }
-
-}
diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanEvalTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanEvalTranslatorTestSuite.scala
index ba0d78670..1b61dc98f 100644
--- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanEvalTranslatorTestSuite.scala
+++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanEvalTranslatorTestSuite.scala
@@ -6,15 +6,17 @@
package org.opensearch.flint.spark.ppl
import org.opensearch.flint.spark.ppl.PlaneUtils.plan
+import org.opensearch.sql.expression.function.SerializableUdf
import org.opensearch.sql.ppl.{CatalystPlanContext, CatalystQueryPlanVisitor}
import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq
import org.scalatest.matchers.should.Matchers
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar}
-import org.apache.spark.sql.catalyst.expressions.{Alias, Descending, ExprId, In, Literal, NamedExpression, SortOrder}
-import org.apache.spark.sql.catalyst.plans.PlanTest
-import org.apache.spark.sql.catalyst.plans.logical.{Project, Sort}
+import org.apache.spark.sql.catalyst.expressions.{Alias, And, Descending, EqualTo, ExprId, GreaterThanOrEqual, In, LessThan, Literal, NamedExpression, ScalaUDF, SortOrder}
+import org.apache.spark.sql.catalyst.plans.{Inner, PlanTest}
+import org.apache.spark.sql.catalyst.plans.logical.{Join, JoinHint, Project, Sort, SubqueryAlias}
+import org.apache.spark.sql.types.DataTypes
class PPLLogicalPlanEvalTranslatorTestSuite
extends SparkFunSuite
@@ -25,192 +27,243 @@ class PPLLogicalPlanEvalTranslatorTestSuite
private val planTransformer = new CatalystQueryPlanVisitor()
private val pplParser = new PPLSyntaxParser()
- test("test eval expressions not included in fields expressions") {
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 1 | fields c"), context)
- val evalProjectList: Seq[NamedExpression] =
- Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "b")())
- val expectedPlan = Project(
- seq(UnresolvedAttribute("c")),
- Project(evalProjectList, UnresolvedRelation(Seq("t"))))
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
-
- test("test eval expressions included in fields expression") {
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(
- plan(pplParser, "source=t | eval a = 1, c = 1 | fields a, b, c"),
- context)
+// test("test eval expressions not included in fields expressions") {
+// val context = new CatalystPlanContext
+// val logPlan =
+// planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 1 | fields c"), context)
+// val evalProjectList: Seq[NamedExpression] =
+// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "b")())
+// val expectedPlan = Project(
+// seq(UnresolvedAttribute("c")),
+// Project(evalProjectList, UnresolvedRelation(Seq("t"))))
+// comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+// }
- val evalProjectList: Seq[NamedExpression] =
- Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "c")())
- val expectedPlan = Project(
- seq(UnresolvedAttribute("a"), UnresolvedAttribute("b"), UnresolvedAttribute("c")),
- Project(evalProjectList, UnresolvedRelation(Seq("t"))))
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
+// test("test eval expressions included in fields expression") {
+// val context = new CatalystPlanContext
+// val logPlan =
+// planTransformer.visit(
+// plan(pplParser, "source=t | eval a = 1, c = 1 | fields a, b, c"),
+// context)
+//
+// val evalProjectList: Seq[NamedExpression] =
+// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "c")())
+// val expectedPlan = Project(
+// seq(UnresolvedAttribute("a"), UnresolvedAttribute("b"), UnresolvedAttribute("c")),
+// Project(evalProjectList, UnresolvedRelation(Seq("t"))))
+// comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+// }
+//
+// test("test eval expressions without fields command") {
+// val context = new CatalystPlanContext
+// val logPlan =
+// planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 1"), context)
+//
+// val evalProjectList: Seq[NamedExpression] =
+// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "b")())
+// val expectedPlan =
+// Project(seq(UnresolvedStar(None)), Project(evalProjectList, UnresolvedRelation(Seq("t"))))
+// comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+// }
+//
+// test("test eval expressions with sort") {
+// val context = new CatalystPlanContext
+// val logPlan =
+// planTransformer.visit(
+// plan(pplParser, "source=t | eval a = 1, b = 1 | sort - a | fields b"),
+// context)
+//
+// val evalProjectList: Seq[NamedExpression] =
+// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "b")())
+// val evalProject = Project(evalProjectList, UnresolvedRelation(Seq("t")))
+// val sortOrder = SortOrder(UnresolvedAttribute("a"), Descending, Seq.empty)
+// val sort = Sort(seq(sortOrder), global = true, evalProject)
+// val expectedPlan = Project(seq(UnresolvedAttribute("b")), sort)
+// comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+// }
+//
+// test("test eval expressions with multiple recursive sort") {
+// val context = new CatalystPlanContext
+// val logPlan =
+// planTransformer.visit(
+// plan(pplParser, "source=t | eval a = 1, a = a | sort - a | fields b"),
+// context)
+//
+// val evalProjectList: Seq[NamedExpression] =
+// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(UnresolvedAttribute("a"), "a")())
+// val evalProject = Project(evalProjectList, UnresolvedRelation(Seq("t")))
+// val sortOrder = SortOrder(UnresolvedAttribute("a"), Descending, Seq.empty)
+// val sort = Sort(seq(sortOrder), global = true, evalProject)
+// val expectedPlan = Project(seq(UnresolvedAttribute("b")), sort)
+// comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+// }
+//
+// test("test multiple eval expressions") {
+// val context = new CatalystPlanContext
+// val logPlan =
+// planTransformer.visit(
+// plan(pplParser, "source=t | eval a = 1, b = 'hello' | eval b = a | sort - b | fields b"),
+// context)
+//
+// val evalProjectList1: Seq[NamedExpression] =
+// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal("hello"), "b")())
+// val evalProjectList2: Seq[NamedExpression] = Seq(
+// UnresolvedStar(None),
+// Alias(UnresolvedAttribute("a"), "b")(exprId = ExprId(2), qualifier = Seq.empty))
+// val evalProject1 = Project(evalProjectList1, UnresolvedRelation(Seq("t")))
+// val evalProject2 = Project(evalProjectList2, evalProject1)
+// val sortOrder = SortOrder(UnresolvedAttribute("b"), Descending, Seq.empty)
+// val sort = Sort(seq(sortOrder), global = true, evalProject2)
+// val expectedPlan = Project(seq(UnresolvedAttribute("b")), sort)
+// comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+// }
+//
+// test("test complex eval expressions - date function") {
+// val context = new CatalystPlanContext
+// val logPlan =
+// planTransformer.visit(
+// plan(pplParser, "source=t | eval a = TIMESTAMP('2020-09-16 17:30:00') | fields a"),
+// context)
+//
+// val evalProjectList: Seq[NamedExpression] = Seq(
+// UnresolvedStar(None),
+// Alias(
+// UnresolvedFunction("timestamp", seq(Literal("2020-09-16 17:30:00")), isDistinct = false),
+// "a")())
+// val expectedPlan = Project(
+// seq(UnresolvedAttribute("a")),
+// Project(evalProjectList, UnresolvedRelation(Seq("t"))))
+// comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+// }
+//
+// test("test complex eval expressions - math function") {
+// val context = new CatalystPlanContext
+// val logPlan =
+// planTransformer.visit(plan(pplParser, "source=t | eval a = RAND() | fields a"), context)
+//
+// val evalProjectList: Seq[NamedExpression] = Seq(
+// UnresolvedStar(None),
+// Alias(UnresolvedFunction("rand", Seq.empty, isDistinct = false), "a")(
+// exprId = ExprId(0),
+// qualifier = Seq.empty))
+// val expectedPlan = Project(
+// seq(UnresolvedAttribute("a")),
+// Project(evalProjectList, UnresolvedRelation(Seq("t"))))
+// comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+// }
+//
+// test("test complex eval expressions - compound function") {
+// val context = new CatalystPlanContext
+// val logPlan =
+// planTransformer.visit(
+// plan(pplParser, "source=t | eval a = if(like(b, '%Hello%'), 'World', 'Hi') | fields a"),
+// context)
+//
+// val evalProjectList: Seq[NamedExpression] = Seq(
+// UnresolvedStar(None),
+// Alias(
+// UnresolvedFunction(
+// "if",
+// seq(
+// UnresolvedFunction(
+// "like",
+// seq(UnresolvedAttribute("b"), Literal("%Hello%")),
+// isDistinct = false),
+// Literal("World"),
+// Literal("Hi")),
+// isDistinct = false),
+// "a")())
+// val expectedPlan = Project(
+// seq(UnresolvedAttribute("a")),
+// Project(evalProjectList, UnresolvedRelation(Seq("t"))))
+// comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+// }
- test("test eval expressions without fields command") {
+ test("test eval expression - geoip function") {
val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 1"), context)
- val evalProjectList: Seq[NamedExpression] =
- Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "b")())
- val expectedPlan =
- Project(seq(UnresolvedStar(None)), Project(evalProjectList, UnresolvedRelation(Seq("t"))))
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
+ //scalastyle:off
+ println("Wow I like Pancakes");
+ //scalastyle:on
- test("test eval expressions with sort") {
- val context = new CatalystPlanContext
val logPlan =
planTransformer.visit(
- plan(pplParser, "source=t | eval a = 1, b = 1 | sort - a | fields b"),
+ plan(pplParser, "source=t | eval a = geoip(lol,ip_address,TIME_ZONE)"),
context)
- val evalProjectList: Seq[NamedExpression] =
- Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "b")())
- val evalProject = Project(evalProjectList, UnresolvedRelation(Seq("t")))
- val sortOrder = SortOrder(UnresolvedAttribute("a"), Descending, Seq.empty)
- val sort = Sort(seq(sortOrder), global = true, evalProject)
- val expectedPlan = Project(seq(UnresolvedAttribute("b")), sort)
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
+ //scalastyle:off
+ println("Wow I like Pancakes");
+ //scalastyle:on
- test("test eval expressions with multiple recursive sort") {
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(
- plan(pplParser, "source=t | eval a = 1, a = a | sort - a | fields b"),
- context)
+ val ipAddress = UnresolvedAttribute("ip_address")
- val evalProjectList: Seq[NamedExpression] =
- Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(UnresolvedAttribute("a"), "a")())
- val evalProject = Project(evalProjectList, UnresolvedRelation(Seq("t")))
- val sortOrder = SortOrder(UnresolvedAttribute("a"), Descending, Seq.empty)
- val sort = Sort(seq(sortOrder), global = true, evalProject)
- val expectedPlan = Project(seq(UnresolvedAttribute("b")), sort)
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
+ val is_ipv4 = ScalaUDF(
+ SerializableUdf.isIpv4,
+ DataTypes.BooleanType,
+ seq(ipAddress),
+ seq(),
+ Option.empty,
+ Option.apply("is_ipv4")
+ )
- test("test multiple eval expressions") {
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(
- plan(pplParser, "source=t | eval a = 1, b = 'hello' | eval b = a | sort - b | fields b"),
- context)
+ val ip_int = ScalaUDF(
+ SerializableUdf.ipToInt,
+ DataTypes.IntegerType,
+ seq(ipAddress),
+ seq(),
+ Option.empty,
+ Option.apply("ip_to_int")
+ )
- val evalProjectList1: Seq[NamedExpression] =
- Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal("hello"), "b")())
- val evalProjectList2: Seq[NamedExpression] = Seq(
- UnresolvedStar(None),
- Alias(UnresolvedAttribute("a"), "b")(exprId = ExprId(2), qualifier = Seq.empty))
- val evalProject1 = Project(evalProjectList1, UnresolvedRelation(Seq("t")))
- val evalProject2 = Project(evalProjectList2, evalProject1)
- val sortOrder = SortOrder(UnresolvedAttribute("b"), Descending, Seq.empty)
- val sort = Sort(seq(sortOrder), global = true, evalProject2)
- val expectedPlan = Project(seq(UnresolvedAttribute("b")), sort)
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
+ val sourceTable = SubqueryAlias("l", UnresolvedRelation(seq("users")))
+ val geoTable = SubqueryAlias("r", UnresolvedRelation(seq("geoip")))
- test("test complex eval expressions - date function") {
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(
- plan(pplParser, "source=t | eval a = TIMESTAMP('2020-09-16 17:30:00') | fields a"),
- context)
-
- val evalProjectList: Seq[NamedExpression] = Seq(
- UnresolvedStar(None),
- Alias(
- UnresolvedFunction("timestamp", seq(Literal("2020-09-16 17:30:00")), isDistinct = false),
- "a")())
- val expectedPlan = Project(
- seq(UnresolvedAttribute("a")),
- Project(evalProjectList, UnresolvedRelation(Seq("t"))))
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
+ val ipRangeStartCondition = GreaterThanOrEqual(ip_int, UnresolvedAttribute("r.ip_t"))
+ val ipRangeEndCondition = LessThan(ip_int, UnresolvedAttribute("r.ip"))
+ val isIpv4Condition = EqualTo(is_ipv4, UnresolvedAttribute("r.ip_type"))
- test("test complex eval expressions - math function") {
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(plan(pplParser, "source=t | eval a = RAND() | fields a"), context)
-
- val evalProjectList: Seq[NamedExpression] = Seq(
- UnresolvedStar(None),
- Alias(UnresolvedFunction("rand", Seq.empty, isDistinct = false), "a")(
- exprId = ExprId(0),
- qualifier = Seq.empty))
- val expectedPlan = Project(
- seq(UnresolvedAttribute("a")),
- Project(evalProjectList, UnresolvedRelation(Seq("t"))))
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
+ val joinCondition = And(And(ipRangeStartCondition, ipRangeEndCondition), isIpv4Condition)
- test("test complex eval expressions - compound function") {
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(
- plan(pplParser, "source=t | eval a = if(like(b, '%Hello%'), 'World', 'Hi') | fields a"),
- context)
+ val joinPlan = Join(sourceTable, geoTable, Inner, Some(joinCondition), JoinHint.NONE)
+ val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
- val evalProjectList: Seq[NamedExpression] = Seq(
- UnresolvedStar(None),
- Alias(
- UnresolvedFunction(
- "if",
- seq(
- UnresolvedFunction(
- "like",
- seq(UnresolvedAttribute("b"), Literal("%Hello%")),
- isDistinct = false),
- Literal("World"),
- Literal("Hi")),
- isDistinct = false),
- "a")())
- val expectedPlan = Project(
- seq(UnresolvedAttribute("a")),
- Project(evalProjectList, UnresolvedRelation(Seq("t"))))
comparePlans(expectedPlan, logPlan, checkAnalysis = false)
}
- // Todo fields-excluded command not supported
- ignore("test eval expressions with fields-excluded command") {
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 2 | fields - b"), context)
-
- val projectList: Seq[NamedExpression] =
- Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(2), "b")())
- val expectedPlan = Project(projectList, UnresolvedRelation(Seq("t")))
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
-
- // Todo fields-included command not supported
- ignore("test eval expressions with fields-included command") {
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 2 | fields + b"), context)
-
- val projectList: Seq[NamedExpression] =
- Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(2), "b")())
- val expectedPlan = Project(projectList, UnresolvedRelation(Seq("t")))
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
-
- test("test IN expr in eval") {
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(
- plan(pplParser, "source=t | eval in = a in ('Hello', 'World') | fields in"),
- context)
-
- val in = Alias(In(UnresolvedAttribute("a"), Seq(Literal("Hello"), Literal("World"))), "in")()
- val eval = Project(Seq(UnresolvedStar(None), in), UnresolvedRelation(Seq("t")))
- val expectedPlan = Project(Seq(UnresolvedAttribute("in")), eval)
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
+// // Todo fields-excluded command not supported
+// ignore("test eval expressions with fields-excluded command") {
+// val context = new CatalystPlanContext
+// val logPlan =
+// planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 2 | fields - b"), context)
+//
+// val projectList: Seq[NamedExpression] =
+// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(2), "b")())
+// val expectedPlan = Project(projectList, UnresolvedRelation(Seq("t")))
+// comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+// }
+//
+// // Todo fields-included command not supported
+// ignore("test eval expressions with fields-included command") {
+// val context = new CatalystPlanContext
+// val logPlan =
+// planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 2 | fields + b"), context)
+//
+// val projectList: Seq[NamedExpression] =
+// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(2), "b")())
+// val expectedPlan = Project(projectList, UnresolvedRelation(Seq("t")))
+// comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+// }
+////
+// test("test IN expr in eval") {
+// val context = new CatalystPlanContext
+// val logPlan =
+// planTransformer.visit(
+// plan(pplParser, "source=t | eval in = a in ('Hello', 'World') | fields in"),
+// context)
+//
+// val in = Alias(In(UnresolvedAttribute("a"), Seq(Literal("Hello"), Literal("World"))), "in")()
+// val eval = Project(Seq(UnresolvedStar(None), in), UnresolvedRelation(Seq("t")))
+// val expectedPlan = Project(Seq(UnresolvedAttribute("in")), eval)
+// comparePlans(expectedPlan, logPlan, checkAnalysis = false)
+// }
}
diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFlattenCommandTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFlattenCommandTranslatorTestSuite.scala
index 543e5c05d..58a6c04b3 100644
--- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFlattenCommandTranslatorTestSuite.scala
+++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFlattenCommandTranslatorTestSuite.scala
@@ -13,9 +13,9 @@ import org.scalatest.matchers.should.Matchers
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar}
-import org.apache.spark.sql.catalyst.expressions.{Alias, GeneratorOuter, Literal, RegExpExtract}
+import org.apache.spark.sql.catalyst.expressions.{Alias, Descending, GeneratorOuter, Literal, NullsLast, RegExpExtract, SortOrder}
import org.apache.spark.sql.catalyst.plans.PlanTest
-import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, DataFrameDropColumns, Generate, Project}
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, DataFrameDropColumns, Generate, GlobalLimit, LocalLimit, Project, Sort}
import org.apache.spark.sql.types.IntegerType
class PPLLogicalPlanFlattenCommandTranslatorTestSuite
@@ -153,45 +153,4 @@ class PPLLogicalPlanFlattenCommandTranslatorTestSuite
comparePlans(expectedPlan, logPlan, checkAnalysis = false)
}
- test("test flatten with one alias") {
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(
- plan(pplParser, "source=relation | flatten field_with_array as col1"),
- context)
-
- val relation = UnresolvedRelation(Seq("relation"))
- val flattenGenerator = new FlattenGenerator(UnresolvedAttribute("field_with_array"))
- val outerGenerator = GeneratorOuter(flattenGenerator)
- val generate =
- Generate(outerGenerator, seq(), true, None, Seq(UnresolvedAttribute("col1")), relation)
- val dropSourceColumn =
- DataFrameDropColumns(Seq(UnresolvedAttribute("field_with_array")), generate)
- val expectedPlan = Project(seq(UnresolvedStar(None)), dropSourceColumn)
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
-
- test("test flatten with alias list") {
- val context = new CatalystPlanContext
- val logPlan =
- planTransformer.visit(
- plan(pplParser, "source=relation | flatten field_with_array as (col1, col2)"),
- context)
-
- val relation = UnresolvedRelation(Seq("relation"))
- val flattenGenerator = new FlattenGenerator(UnresolvedAttribute("field_with_array"))
- val outerGenerator = GeneratorOuter(flattenGenerator)
- val generate = Generate(
- outerGenerator,
- seq(),
- true,
- None,
- Seq(UnresolvedAttribute("col1"), UnresolvedAttribute("col2")),
- relation)
- val dropSourceColumn =
- DataFrameDropColumns(Seq(UnresolvedAttribute("field_with_array")), generate)
- val expectedPlan = Project(seq(UnresolvedStar(None)), dropSourceColumn)
- comparePlans(expectedPlan, logPlan, checkAnalysis = false)
- }
-
}
diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanJoinTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanJoinTranslatorTestSuite.scala
index f4ed397e3..d75de8d9f 100644
--- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanJoinTranslatorTestSuite.scala
+++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanJoinTranslatorTestSuite.scala
@@ -30,12 +30,30 @@ class PPLLogicalPlanJoinTranslatorTestSuite
private val testTable3 = "spark_catalog.default.flint_ppl_test3"
private val testTable4 = "spark_catalog.default.flint_ppl_test4"
+// test("test two-tables inner join: join condition with aliases") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| JOIN left = l right = r ON l.id = r.id $testTable2
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightPlan = SubqueryAlias("r", table2)
+// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
+// val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+
test("test two-tables inner join: join condition with aliases") {
val context = new CatalystPlanContext
val logPlan = plan(
pplParser,
s"""
- | source = $testTable1| JOIN left = l right = r ON l.id = r.id $testTable2
+ | source=users | join left = t1 right = t2 on t1.ip_int>=t2.ip_range_start and t1.ip_int 10 AND lower(r.name) = 'hello' $testTable2
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val leftPlan = SubqueryAlias("l", table1)
- val rightPlan = SubqueryAlias("r", table2)
- val joinCondition = And(
- And(
- EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")),
- EqualTo(
- Literal("hello"),
- UnresolvedFunction.apply(
- "lower",
- Seq(UnresolvedAttribute("r.name")),
- isDistinct = false))),
- LessThan(Literal(10), UnresolvedAttribute("l.count")))
- val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test inner join: join condition with table names and predicates") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1| INNER JOIN left = l right = r ON $testTable1.id = $testTable2.id AND $testTable1.count > 10 AND lower($testTable2.name) = 'hello' $testTable2
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val leftPlan = SubqueryAlias("l", table1)
- val rightPlan = SubqueryAlias("r", table2)
- val joinCondition = And(
- And(
- EqualTo(UnresolvedAttribute(s"$testTable1.id"), UnresolvedAttribute(s"$testTable2.id")),
- EqualTo(
- Literal("hello"),
- UnresolvedFunction.apply(
- "lower",
- Seq(UnresolvedAttribute(s"$testTable2.name")),
- isDistinct = false))),
- LessThan(Literal(10), UnresolvedAttribute(s"$testTable1.count")))
- val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test left outer join") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1| LEFT OUTER JOIN left = l right = r ON l.id = r.id $testTable2
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val leftPlan = SubqueryAlias("l", table1)
- val rightPlan = SubqueryAlias("r", table2)
- val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
- val joinPlan = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition), JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test right outer join") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1| RIGHT JOIN left = l right = r ON l.id = r.id $testTable2
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val leftPlan = SubqueryAlias("l", table1)
- val rightPlan = SubqueryAlias("r", table2)
- val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
- val joinPlan = Join(leftPlan, rightPlan, RightOuter, Some(joinCondition), JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test left semi join") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1| LEFT SEMI JOIN left = l right = r ON l.id = r.id $testTable2
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val leftPlan = SubqueryAlias("l", table1)
- val rightPlan = SubqueryAlias("r", table2)
- val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
- val joinPlan = Join(leftPlan, rightPlan, LeftSemi, Some(joinCondition), JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test left anti join") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1| LEFT ANTI JOIN left = l right = r ON l.id = r.id $testTable2
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val leftPlan = SubqueryAlias("l", table1)
- val rightPlan = SubqueryAlias("r", table2)
- val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
- val joinPlan = Join(leftPlan, rightPlan, LeftAnti, Some(joinCondition), JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test full outer join") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1| FULL JOIN left = l right = r ON l.id = r.id $testTable2
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val leftPlan = SubqueryAlias("l", table1)
- val rightPlan = SubqueryAlias("r", table2)
- val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
- val joinPlan = Join(leftPlan, rightPlan, FullOuter, Some(joinCondition), JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test cross join") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1| CROSS JOIN left = l right = r $testTable2
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val leftPlan = SubqueryAlias("l", table1)
- val rightPlan = SubqueryAlias("r", table2)
- val joinPlan = Join(leftPlan, rightPlan, Cross, None, JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test cross join with join condition") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1| CROSS JOIN left = l right = r ON l.id = r.id $testTable2
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val leftPlan = SubqueryAlias("l", table1)
- val rightPlan = SubqueryAlias("r", table2)
- val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
- val joinPlan = Join(leftPlan, rightPlan, Cross, Some(joinCondition), JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test multiple joins") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1
- | | inner JOIN left = l right = r ON l.id = r.id $testTable2
- | | left JOIN left = l right = r ON l.name = r.name $testTable3
- | | cross JOIN left = l right = r $testTable4
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3"))
- val table4 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test4"))
- var leftPlan = SubqueryAlias("l", table1)
- var rightPlan = SubqueryAlias("r", table2)
- val joinCondition1 = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
- val joinPlan1 = Join(leftPlan, rightPlan, Inner, Some(joinCondition1), JoinHint.NONE)
- leftPlan = SubqueryAlias("l", joinPlan1)
- rightPlan = SubqueryAlias("r", table3)
- val joinCondition2 = EqualTo(UnresolvedAttribute("l.name"), UnresolvedAttribute("r.name"))
- val joinPlan2 = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition2), JoinHint.NONE)
- leftPlan = SubqueryAlias("l", joinPlan2)
- rightPlan = SubqueryAlias("r", table4)
- val joinPlan3 = Join(leftPlan, rightPlan, Cross, None, JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test complex join: TPC-H Q13") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | SEARCH source = $testTable1
- | | FIELDS id, name
- | | LEFT OUTER JOIN left = c right = o ON c.custkey = o.custkey $testTable2
- | | STATS count(o.orderkey) AS o_count BY c.custkey
- | | STATS count(1) AS custdist BY o_count
- | | SORT - custdist, - o_count
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val tableC = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val tableO = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val left = SubqueryAlias(
- "c",
- Project(Seq(UnresolvedAttribute("id"), UnresolvedAttribute("name")), tableC))
- val right = SubqueryAlias("o", tableO)
- val joinCondition =
- EqualTo(UnresolvedAttribute("o.custkey"), UnresolvedAttribute("c.custkey"))
- val join = Join(left, right, LeftOuter, Some(joinCondition), JoinHint.NONE)
- val groupingExpression1 = Alias(UnresolvedAttribute("c.custkey"), "c.custkey")()
- val aggregateExpressions1 =
- Alias(
- UnresolvedFunction(
- Seq("COUNT"),
- Seq(UnresolvedAttribute("o.orderkey")),
- isDistinct = false),
- "o_count")()
- val agg1 =
- Aggregate(Seq(groupingExpression1), Seq(aggregateExpressions1, groupingExpression1), join)
- val groupingExpression2 = Alias(UnresolvedAttribute("o_count"), "o_count")()
- val aggregateExpressions2 =
- Alias(UnresolvedFunction(Seq("COUNT"), Seq(Literal(1)), isDistinct = false), "custdist")()
- val agg2 =
- Aggregate(Seq(groupingExpression2), Seq(aggregateExpressions2, groupingExpression2), agg1)
- val sort = Sort(
- Seq(
- SortOrder(UnresolvedAttribute("custdist"), Descending),
- SortOrder(UnresolvedAttribute("o_count"), Descending)),
- global = true,
- agg2)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), sort)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test inner join with relation subquery") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1| JOIN left = l right = r ON l.id = r.id
- | [
- | source = $testTable2
- | | where id > 10 and name = 'abc'
- | | fields id, name
- | | sort id
- | | head 10
- | ]
- | | stats count(id) as cnt by type
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val leftPlan = SubqueryAlias("l", table1)
- val rightSubquery =
- GlobalLimit(
- Literal(10),
- LocalLimit(
- Literal(10),
- Sort(
- Seq(SortOrder(UnresolvedAttribute("id"), Ascending)),
- global = true,
- Project(
- Seq(UnresolvedAttribute("id"), UnresolvedAttribute("name")),
- Filter(
- And(
- GreaterThan(UnresolvedAttribute("id"), Literal(10)),
- EqualTo(UnresolvedAttribute("name"), Literal("abc"))),
- table2)))))
- val rightPlan = SubqueryAlias("r", rightSubquery)
- val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
- val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE)
- val groupingExpression = Alias(UnresolvedAttribute("type"), "type")()
- val aggregateExpression = Alias(
- UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedAttribute("id")), isDistinct = false),
- "cnt")()
- val aggPlan =
- Aggregate(Seq(groupingExpression), Seq(aggregateExpression, groupingExpression), joinPlan)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), aggPlan)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test left outer join with relation subquery") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1| LEFT JOIN left = l right = r ON l.id = r.id
- | [
- | source = $testTable2
- | | where id > 10 and name = 'abc'
- | | fields id, name
- | | sort id
- | | head 10
- | ]
- | | stats count(id) as cnt by type
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val leftPlan = SubqueryAlias("l", table1)
- val rightSubquery =
- GlobalLimit(
- Literal(10),
- LocalLimit(
- Literal(10),
- Sort(
- Seq(SortOrder(UnresolvedAttribute("id"), Ascending)),
- global = true,
- Project(
- Seq(UnresolvedAttribute("id"), UnresolvedAttribute("name")),
- Filter(
- And(
- GreaterThan(UnresolvedAttribute("id"), Literal(10)),
- EqualTo(UnresolvedAttribute("name"), Literal("abc"))),
- table2)))))
- val rightPlan = SubqueryAlias("r", rightSubquery)
- val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
- val joinPlan = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition), JoinHint.NONE)
- val groupingExpression = Alias(UnresolvedAttribute("type"), "type")()
- val aggregateExpression = Alias(
- UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedAttribute("id")), isDistinct = false),
- "cnt")()
- val aggPlan =
- Aggregate(Seq(groupingExpression), Seq(aggregateExpression, groupingExpression), joinPlan)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), aggPlan)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test multiple joins with relation subquery") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1
- | | head 10
- | | inner JOIN left = l right = r ON l.id = r.id
- | [
- | source = $testTable2
- | | where id > 10
- | ]
- | | left JOIN left = l right = r ON l.name = r.name
- | [
- | source = $testTable3
- | | fields id
- | ]
- | | cross JOIN left = l right = r
- | [
- | source = $testTable4
- | | sort id
- | ]
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3"))
- val table4 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test4"))
- var leftPlan = SubqueryAlias("l", GlobalLimit(Literal(10), LocalLimit(Literal(10), table1)))
- var rightPlan =
- SubqueryAlias("r", Filter(GreaterThan(UnresolvedAttribute("id"), Literal(10)), table2))
- val joinCondition1 = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
- val joinPlan1 = Join(leftPlan, rightPlan, Inner, Some(joinCondition1), JoinHint.NONE)
- leftPlan = SubqueryAlias("l", joinPlan1)
- rightPlan = SubqueryAlias("r", Project(Seq(UnresolvedAttribute("id")), table3))
- val joinCondition2 = EqualTo(UnresolvedAttribute("l.name"), UnresolvedAttribute("r.name"))
- val joinPlan2 = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition2), JoinHint.NONE)
- leftPlan = SubqueryAlias("l", joinPlan2)
- rightPlan = SubqueryAlias(
- "r",
- Sort(Seq(SortOrder(UnresolvedAttribute("id"), Ascending)), global = true, table4))
- val joinPlan3 = Join(leftPlan, rightPlan, Cross, None, JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test complex join: TPC-H Q13 with relation subquery") {
- // select
- // c_count,
- // count(*) as custdist
- // from
- // (
- // select
- // c_custkey,
- // count(o_orderkey) as c_count
- // from
- // customer left outer join orders on
- // c_custkey = o_custkey
- // and o_comment not like '%special%requests%'
- // group by
- // c_custkey
- // ) as c_orders
- // group by
- // c_count
- // order by
- // custdist desc,
- // c_count desc
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | SEARCH source = [
- | SEARCH source = customer
- | | LEFT OUTER JOIN left = c right = o ON c_custkey = o_custkey
- | [
- | SEARCH source = orders
- | | WHERE not like(o_comment, '%special%requests%')
- | ]
- | | STATS COUNT(o_orderkey) AS c_count BY c_custkey
- | ] AS c_orders
- | | STATS COUNT(o_orderkey) AS c_count BY c_custkey
- | | STATS COUNT(1) AS custdist BY c_count
- | | SORT - custdist, - c_count
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val tableC = UnresolvedRelation(Seq("customer"))
- val tableO = UnresolvedRelation(Seq("orders"))
- val left = SubqueryAlias("c", tableC)
- val filterNot = Filter(
- Not(
- UnresolvedFunction(
- Seq("like"),
- Seq(UnresolvedAttribute("o_comment"), Literal("%special%requests%")),
- isDistinct = false)),
- tableO)
- val right = SubqueryAlias("o", filterNot)
- val joinCondition =
- EqualTo(UnresolvedAttribute("o_custkey"), UnresolvedAttribute("c_custkey"))
- val join = Join(left, right, LeftOuter, Some(joinCondition), JoinHint.NONE)
- val groupingExpression1 = Alias(UnresolvedAttribute("c_custkey"), "c_custkey")()
- val aggregateExpressions1 =
- Alias(
- UnresolvedFunction(
- Seq("COUNT"),
- Seq(UnresolvedAttribute("o_orderkey")),
- isDistinct = false),
- "c_count")()
- val agg3 =
- Aggregate(Seq(groupingExpression1), Seq(aggregateExpressions1, groupingExpression1), join)
- val subqueryAlias = SubqueryAlias("c_orders", agg3)
- val agg2 =
- Aggregate(
- Seq(groupingExpression1),
- Seq(aggregateExpressions1, groupingExpression1),
- subqueryAlias)
- val groupingExpression2 = Alias(UnresolvedAttribute("c_count"), "c_count")()
- val aggregateExpressions2 =
- Alias(UnresolvedFunction(Seq("COUNT"), Seq(Literal(1)), isDistinct = false), "custdist")()
- val agg1 =
- Aggregate(Seq(groupingExpression2), Seq(aggregateExpressions2, groupingExpression2), agg2)
- val sort = Sort(
- Seq(
- SortOrder(UnresolvedAttribute("custdist"), Descending),
- SortOrder(UnresolvedAttribute("c_count"), Descending)),
- global = true,
- agg1)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), sort)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test multiple joins with table alias") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = table1 as t1
- | | JOIN ON t1.id = t2.id
- | [
- | source = table2 as t2
- | ]
- | | JOIN ON t2.id = t3.id
- | [
- | source = table3 as t3
- | ]
- | | JOIN ON t3.id = t4.id
- | [
- | source = table4 as t4
- | ]
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("table1"))
- val table2 = UnresolvedRelation(Seq("table2"))
- val table3 = UnresolvedRelation(Seq("table3"))
- val table4 = UnresolvedRelation(Seq("table4"))
- val joinPlan1 = Join(
- SubqueryAlias("t1", table1),
- SubqueryAlias("t2", table2),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t1.id"), UnresolvedAttribute("t2.id"))),
- JoinHint.NONE)
- val joinPlan2 = Join(
- joinPlan1,
- SubqueryAlias("t3", table3),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t2.id"), UnresolvedAttribute("t3.id"))),
- JoinHint.NONE)
- val joinPlan3 = Join(
- joinPlan2,
- SubqueryAlias("t4", table4),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t3.id"), UnresolvedAttribute("t4.id"))),
- JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test multiple joins with table and subquery alias") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = table1 as t1
- | | JOIN left = l right = r ON t1.id = t2.id
- | [
- | source = table2 as t2
- | ]
- | | JOIN left = l right = r ON t2.id = t3.id
- | [
- | source = table3 as t3
- | ]
- | | JOIN left = l right = r ON t3.id = t4.id
- | [
- | source = table4 as t4
- | ]
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("table1"))
- val table2 = UnresolvedRelation(Seq("table2"))
- val table3 = UnresolvedRelation(Seq("table3"))
- val table4 = UnresolvedRelation(Seq("table4"))
- val joinPlan1 = Join(
- SubqueryAlias("l", SubqueryAlias("t1", table1)),
- SubqueryAlias("r", SubqueryAlias("t2", table2)),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t1.id"), UnresolvedAttribute("t2.id"))),
- JoinHint.NONE)
- val joinPlan2 = Join(
- SubqueryAlias("l", joinPlan1),
- SubqueryAlias("r", SubqueryAlias("t3", table3)),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t2.id"), UnresolvedAttribute("t3.id"))),
- JoinHint.NONE)
- val joinPlan3 = Join(
- SubqueryAlias("l", joinPlan2),
- SubqueryAlias("r", SubqueryAlias("t4", table4)),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t3.id"), UnresolvedAttribute("t4.id"))),
- JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test multiple joins without table aliases") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = table1
- | | JOIN ON table1.id = table2.id table2
- | | JOIN ON table1.id = table3.id table3
- | | JOIN ON table2.id = table4.id table4
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("table1"))
- val table2 = UnresolvedRelation(Seq("table2"))
- val table3 = UnresolvedRelation(Seq("table3"))
- val table4 = UnresolvedRelation(Seq("table4"))
- val joinPlan1 = Join(
- table1,
- table2,
- Inner,
- Some(EqualTo(UnresolvedAttribute("table1.id"), UnresolvedAttribute("table2.id"))),
- JoinHint.NONE)
- val joinPlan2 = Join(
- joinPlan1,
- table3,
- Inner,
- Some(EqualTo(UnresolvedAttribute("table1.id"), UnresolvedAttribute("table3.id"))),
- JoinHint.NONE)
- val joinPlan3 = Join(
- joinPlan2,
- table4,
- Inner,
- Some(EqualTo(UnresolvedAttribute("table2.id"), UnresolvedAttribute("table4.id"))),
- JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test multiple joins with part subquery aliases") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = table1
- | | JOIN left = t1 right = t2 ON t1.name = t2.name table2
- | | JOIN right = t3 ON t1.name = t3.name table3
- | | JOIN right = t4 ON t2.name = t4.name table4
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("table1"))
- val table2 = UnresolvedRelation(Seq("table2"))
- val table3 = UnresolvedRelation(Seq("table3"))
- val table4 = UnresolvedRelation(Seq("table4"))
- val joinPlan1 = Join(
- SubqueryAlias("t1", table1),
- SubqueryAlias("t2", table2),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))),
- JoinHint.NONE)
- val joinPlan2 = Join(
- joinPlan1,
- SubqueryAlias("t3", table3),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t3.name"))),
- JoinHint.NONE)
- val joinPlan3 = Join(
- joinPlan2,
- SubqueryAlias("t4", table4),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t2.name"), UnresolvedAttribute("t4.name"))),
- JoinHint.NONE)
- val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test multiple joins with self join 1") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1
- | | JOIN left = t1 right = t2 ON t1.name = t2.name $testTable2
- | | JOIN right = t3 ON t1.name = t3.name $testTable3
- | | JOIN right = t4 ON t1.name = t4.name $testTable1
- | | fields t1.name, t2.name, t3.name, t4.name
- | """.stripMargin)
-
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3"))
- val joinPlan1 = Join(
- SubqueryAlias("t1", table1),
- SubqueryAlias("t2", table2),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))),
- JoinHint.NONE)
- val joinPlan2 = Join(
- joinPlan1,
- SubqueryAlias("t3", table3),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t3.name"))),
- JoinHint.NONE)
- val joinPlan3 = Join(
- joinPlan2,
- SubqueryAlias("t4", table1),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t4.name"))),
- JoinHint.NONE)
- val expectedPlan = Project(
- Seq(
- UnresolvedAttribute("t1.name"),
- UnresolvedAttribute("t2.name"),
- UnresolvedAttribute("t3.name"),
- UnresolvedAttribute("t4.name")),
- joinPlan3)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test multiple joins with self join 2") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1
- | | JOIN left = t1 right = t2 ON t1.name = t2.name $testTable2
- | | JOIN right = t3 ON t1.name = t3.name $testTable3
- | | JOIN ON t1.name = t4.name
- | [
- | source = $testTable1
- | ] as t4
- | | fields t1.name, t2.name, t3.name, t4.name
- | """.stripMargin)
-
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3"))
- val joinPlan1 = Join(
- SubqueryAlias("t1", table1),
- SubqueryAlias("t2", table2),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))),
- JoinHint.NONE)
- val joinPlan2 = Join(
- joinPlan1,
- SubqueryAlias("t3", table3),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t3.name"))),
- JoinHint.NONE)
- val joinPlan3 = Join(
- joinPlan2,
- SubqueryAlias("t4", table1),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t4.name"))),
- JoinHint.NONE)
- val expectedPlan = Project(
- Seq(
- UnresolvedAttribute("t1.name"),
- UnresolvedAttribute("t2.name"),
- UnresolvedAttribute("t3.name"),
- UnresolvedAttribute("t4.name")),
- joinPlan3)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
-
- test("test side alias will override the subquery alias") {
- val context = new CatalystPlanContext
- val logPlan = plan(
- pplParser,
- s"""
- | source = $testTable1
- | | JOIN left = t1 right = t2 ON t1.name = t2.name [ source = $testTable2 as ttt ] as tt
- | | fields t1.name, t2.name
- | """.stripMargin)
- val logicalPlan = planTransformer.visit(logPlan, context)
- val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
- val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
- val joinPlan1 = Join(
- SubqueryAlias("t1", table1),
- SubqueryAlias("t2", SubqueryAlias("tt", SubqueryAlias("ttt", table2))),
- Inner,
- Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))),
- JoinHint.NONE)
- val expectedPlan =
- Project(Seq(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name")), joinPlan1)
- comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
- }
+//
+// test("test two-tables inner join: join condition with table names") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| JOIN left = l right = r ON $testTable1.id = $testTable2.id $testTable2
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightPlan = SubqueryAlias("r", table2)
+// val joinCondition =
+// EqualTo(UnresolvedAttribute(s"$testTable1.id"), UnresolvedAttribute(s"$testTable2.id"))
+// val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test inner join: join condition without prefix") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| JOIN left = l right = r ON id = name $testTable2
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightPlan = SubqueryAlias("r", table2)
+// val joinCondition =
+// EqualTo(UnresolvedAttribute("id"), UnresolvedAttribute("name"))
+// val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test inner join: join condition with aliases and predicates") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| JOIN left = l right = r ON l.id = r.id AND l.count > 10 AND lower(r.name) = 'hello' $testTable2
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightPlan = SubqueryAlias("r", table2)
+// val joinCondition = And(
+// And(
+// EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")),
+// EqualTo(
+// Literal("hello"),
+// UnresolvedFunction.apply(
+// "lower",
+// Seq(UnresolvedAttribute("r.name")),
+// isDistinct = false))),
+// LessThan(Literal(10), UnresolvedAttribute("l.count")))
+// val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test inner join: join condition with table names and predicates") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| INNER JOIN left = l right = r ON $testTable1.id = $testTable2.id AND $testTable1.count > 10 AND lower($testTable2.name) = 'hello' $testTable2
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightPlan = SubqueryAlias("r", table2)
+// val joinCondition = And(
+// And(
+// EqualTo(UnresolvedAttribute(s"$testTable1.id"), UnresolvedAttribute(s"$testTable2.id")),
+// EqualTo(
+// Literal("hello"),
+// UnresolvedFunction.apply(
+// "lower",
+// Seq(UnresolvedAttribute(s"$testTable2.name")),
+// isDistinct = false))),
+// LessThan(Literal(10), UnresolvedAttribute(s"$testTable1.count")))
+// val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test left outer join") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| LEFT OUTER JOIN left = l right = r ON l.id = r.id $testTable2
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightPlan = SubqueryAlias("r", table2)
+// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
+// val joinPlan = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition), JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test right outer join") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| RIGHT JOIN left = l right = r ON l.id = r.id $testTable2
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightPlan = SubqueryAlias("r", table2)
+// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
+// val joinPlan = Join(leftPlan, rightPlan, RightOuter, Some(joinCondition), JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test left semi join") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| LEFT SEMI JOIN left = l right = r ON l.id = r.id $testTable2
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightPlan = SubqueryAlias("r", table2)
+// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
+// val joinPlan = Join(leftPlan, rightPlan, LeftSemi, Some(joinCondition), JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test left anti join") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| LEFT ANTI JOIN left = l right = r ON l.id = r.id $testTable2
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightPlan = SubqueryAlias("r", table2)
+// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
+// val joinPlan = Join(leftPlan, rightPlan, LeftAnti, Some(joinCondition), JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test full outer join") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| FULL JOIN left = l right = r ON l.id = r.id $testTable2
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightPlan = SubqueryAlias("r", table2)
+// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
+// val joinPlan = Join(leftPlan, rightPlan, FullOuter, Some(joinCondition), JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test cross join") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| CROSS JOIN left = l right = r $testTable2
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightPlan = SubqueryAlias("r", table2)
+// val joinPlan = Join(leftPlan, rightPlan, Cross, None, JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test cross join with join condition") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| CROSS JOIN left = l right = r ON l.id = r.id $testTable2
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightPlan = SubqueryAlias("r", table2)
+// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
+// val joinPlan = Join(leftPlan, rightPlan, Cross, Some(joinCondition), JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test multiple joins") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1
+// | | inner JOIN left = l right = r ON l.id = r.id $testTable2
+// | | left JOIN left = l right = r ON l.name = r.name $testTable3
+// | | cross JOIN left = l right = r $testTable4
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3"))
+// val table4 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test4"))
+// var leftPlan = SubqueryAlias("l", table1)
+// var rightPlan = SubqueryAlias("r", table2)
+// val joinCondition1 = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
+// val joinPlan1 = Join(leftPlan, rightPlan, Inner, Some(joinCondition1), JoinHint.NONE)
+// leftPlan = SubqueryAlias("l", joinPlan1)
+// rightPlan = SubqueryAlias("r", table3)
+// val joinCondition2 = EqualTo(UnresolvedAttribute("l.name"), UnresolvedAttribute("r.name"))
+// val joinPlan2 = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition2), JoinHint.NONE)
+// leftPlan = SubqueryAlias("l", joinPlan2)
+// rightPlan = SubqueryAlias("r", table4)
+// val joinPlan3 = Join(leftPlan, rightPlan, Cross, None, JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test complex join: TPC-H Q13") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | SEARCH source = $testTable1
+// | | FIELDS id, name
+// | | LEFT OUTER JOIN left = c right = o ON c.custkey = o.custkey $testTable2
+// | | STATS count(o.orderkey) AS o_count BY c.custkey
+// | | STATS count(1) AS custdist BY o_count
+// | | SORT - custdist, - o_count
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val tableC = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val tableO = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val left = SubqueryAlias(
+// "c",
+// Project(Seq(UnresolvedAttribute("id"), UnresolvedAttribute("name")), tableC))
+// val right = SubqueryAlias("o", tableO)
+// val joinCondition =
+// EqualTo(UnresolvedAttribute("o.custkey"), UnresolvedAttribute("c.custkey"))
+// val join = Join(left, right, LeftOuter, Some(joinCondition), JoinHint.NONE)
+// val groupingExpression1 = Alias(UnresolvedAttribute("c.custkey"), "c.custkey")()
+// val aggregateExpressions1 =
+// Alias(
+// UnresolvedFunction(
+// Seq("COUNT"),
+// Seq(UnresolvedAttribute("o.orderkey")),
+// isDistinct = false),
+// "o_count")()
+// val agg1 =
+// Aggregate(Seq(groupingExpression1), Seq(aggregateExpressions1, groupingExpression1), join)
+// val groupingExpression2 = Alias(UnresolvedAttribute("o_count"), "o_count")()
+// val aggregateExpressions2 =
+// Alias(UnresolvedFunction(Seq("COUNT"), Seq(Literal(1)), isDistinct = false), "custdist")()
+// val agg2 =
+// Aggregate(Seq(groupingExpression2), Seq(aggregateExpressions2, groupingExpression2), agg1)
+// val sort = Sort(
+// Seq(
+// SortOrder(UnresolvedAttribute("custdist"), Descending),
+// SortOrder(UnresolvedAttribute("o_count"), Descending)),
+// global = true,
+// agg2)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), sort)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test inner join with relation subquery") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| JOIN left = l right = r ON l.id = r.id
+// | [
+// | source = $testTable2
+// | | where id > 10 and name = 'abc'
+// | | fields id, name
+// | | sort id
+// | | head 10
+// | ]
+// | | stats count(id) as cnt by type
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightSubquery =
+// GlobalLimit(
+// Literal(10),
+// LocalLimit(
+// Literal(10),
+// Sort(
+// Seq(SortOrder(UnresolvedAttribute("id"), Ascending)),
+// global = true,
+// Project(
+// Seq(UnresolvedAttribute("id"), UnresolvedAttribute("name")),
+// Filter(
+// And(
+// GreaterThan(UnresolvedAttribute("id"), Literal(10)),
+// EqualTo(UnresolvedAttribute("name"), Literal("abc"))),
+// table2)))))
+// val rightPlan = SubqueryAlias("r", rightSubquery)
+// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
+// val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE)
+// val groupingExpression = Alias(UnresolvedAttribute("type"), "type")()
+// val aggregateExpression = Alias(
+// UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedAttribute("id")), isDistinct = false),
+// "cnt")()
+// val aggPlan =
+// Aggregate(Seq(groupingExpression), Seq(aggregateExpression, groupingExpression), joinPlan)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), aggPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test left outer join with relation subquery") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1| LEFT JOIN left = l right = r ON l.id = r.id
+// | [
+// | source = $testTable2
+// | | where id > 10 and name = 'abc'
+// | | fields id, name
+// | | sort id
+// | | head 10
+// | ]
+// | | stats count(id) as cnt by type
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val leftPlan = SubqueryAlias("l", table1)
+// val rightSubquery =
+// GlobalLimit(
+// Literal(10),
+// LocalLimit(
+// Literal(10),
+// Sort(
+// Seq(SortOrder(UnresolvedAttribute("id"), Ascending)),
+// global = true,
+// Project(
+// Seq(UnresolvedAttribute("id"), UnresolvedAttribute("name")),
+// Filter(
+// And(
+// GreaterThan(UnresolvedAttribute("id"), Literal(10)),
+// EqualTo(UnresolvedAttribute("name"), Literal("abc"))),
+// table2)))))
+// val rightPlan = SubqueryAlias("r", rightSubquery)
+// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
+// val joinPlan = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition), JoinHint.NONE)
+// val groupingExpression = Alias(UnresolvedAttribute("type"), "type")()
+// val aggregateExpression = Alias(
+// UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedAttribute("id")), isDistinct = false),
+// "cnt")()
+// val aggPlan =
+// Aggregate(Seq(groupingExpression), Seq(aggregateExpression, groupingExpression), joinPlan)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), aggPlan)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test multiple joins with relation subquery") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1
+// | | head 10
+// | | inner JOIN left = l right = r ON l.id = r.id
+// | [
+// | source = $testTable2
+// | | where id > 10
+// | ]
+// | | left JOIN left = l right = r ON l.name = r.name
+// | [
+// | source = $testTable3
+// | | fields id
+// | ]
+// | | cross JOIN left = l right = r
+// | [
+// | source = $testTable4
+// | | sort id
+// | ]
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3"))
+// val table4 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test4"))
+// var leftPlan = SubqueryAlias("l", GlobalLimit(Literal(10), LocalLimit(Literal(10), table1)))
+// var rightPlan =
+// SubqueryAlias("r", Filter(GreaterThan(UnresolvedAttribute("id"), Literal(10)), table2))
+// val joinCondition1 = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id"))
+// val joinPlan1 = Join(leftPlan, rightPlan, Inner, Some(joinCondition1), JoinHint.NONE)
+// leftPlan = SubqueryAlias("l", joinPlan1)
+// rightPlan = SubqueryAlias("r", Project(Seq(UnresolvedAttribute("id")), table3))
+// val joinCondition2 = EqualTo(UnresolvedAttribute("l.name"), UnresolvedAttribute("r.name"))
+// val joinPlan2 = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition2), JoinHint.NONE)
+// leftPlan = SubqueryAlias("l", joinPlan2)
+// rightPlan = SubqueryAlias(
+// "r",
+// Sort(Seq(SortOrder(UnresolvedAttribute("id"), Ascending)), global = true, table4))
+// val joinPlan3 = Join(leftPlan, rightPlan, Cross, None, JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test complex join: TPC-H Q13 with relation subquery") {
+// // select
+// // c_count,
+// // count(*) as custdist
+// // from
+// // (
+// // select
+// // c_custkey,
+// // count(o_orderkey) as c_count
+// // from
+// // customer left outer join orders on
+// // c_custkey = o_custkey
+// // and o_comment not like '%special%requests%'
+// // group by
+// // c_custkey
+// // ) as c_orders
+// // group by
+// // c_count
+// // order by
+// // custdist desc,
+// // c_count desc
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | SEARCH source = [
+// | SEARCH source = customer
+// | | LEFT OUTER JOIN left = c right = o ON c_custkey = o_custkey
+// | [
+// | SEARCH source = orders
+// | | WHERE not like(o_comment, '%special%requests%')
+// | ]
+// | | STATS COUNT(o_orderkey) AS c_count BY c_custkey
+// | ] AS c_orders
+// | | STATS COUNT(o_orderkey) AS c_count BY c_custkey
+// | | STATS COUNT(1) AS custdist BY c_count
+// | | SORT - custdist, - c_count
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val tableC = UnresolvedRelation(Seq("customer"))
+// val tableO = UnresolvedRelation(Seq("orders"))
+// val left = SubqueryAlias("c", tableC)
+// val filterNot = Filter(
+// Not(
+// UnresolvedFunction(
+// Seq("like"),
+// Seq(UnresolvedAttribute("o_comment"), Literal("%special%requests%")),
+// isDistinct = false)),
+// tableO)
+// val right = SubqueryAlias("o", filterNot)
+// val joinCondition =
+// EqualTo(UnresolvedAttribute("o_custkey"), UnresolvedAttribute("c_custkey"))
+// val join = Join(left, right, LeftOuter, Some(joinCondition), JoinHint.NONE)
+// val groupingExpression1 = Alias(UnresolvedAttribute("c_custkey"), "c_custkey")()
+// val aggregateExpressions1 =
+// Alias(
+// UnresolvedFunction(
+// Seq("COUNT"),
+// Seq(UnresolvedAttribute("o_orderkey")),
+// isDistinct = false),
+// "c_count")()
+// val agg3 =
+// Aggregate(Seq(groupingExpression1), Seq(aggregateExpressions1, groupingExpression1), join)
+// val subqueryAlias = SubqueryAlias("c_orders", agg3)
+// val agg2 =
+// Aggregate(
+// Seq(groupingExpression1),
+// Seq(aggregateExpressions1, groupingExpression1),
+// subqueryAlias)
+// val groupingExpression2 = Alias(UnresolvedAttribute("c_count"), "c_count")()
+// val aggregateExpressions2 =
+// Alias(UnresolvedFunction(Seq("COUNT"), Seq(Literal(1)), isDistinct = false), "custdist")()
+// val agg1 =
+// Aggregate(Seq(groupingExpression2), Seq(aggregateExpressions2, groupingExpression2), agg2)
+// val sort = Sort(
+// Seq(
+// SortOrder(UnresolvedAttribute("custdist"), Descending),
+// SortOrder(UnresolvedAttribute("c_count"), Descending)),
+// global = true,
+// agg1)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), sort)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test multiple joins with table alias") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = table1 as t1
+// | | JOIN ON t1.id = t2.id
+// | [
+// | source = table2 as t2
+// | ]
+// | | JOIN ON t2.id = t3.id
+// | [
+// | source = table3 as t3
+// | ]
+// | | JOIN ON t3.id = t4.id
+// | [
+// | source = table4 as t4
+// | ]
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("table1"))
+// val table2 = UnresolvedRelation(Seq("table2"))
+// val table3 = UnresolvedRelation(Seq("table3"))
+// val table4 = UnresolvedRelation(Seq("table4"))
+// val joinPlan1 = Join(
+// SubqueryAlias("t1", table1),
+// SubqueryAlias("t2", table2),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t1.id"), UnresolvedAttribute("t2.id"))),
+// JoinHint.NONE)
+// val joinPlan2 = Join(
+// joinPlan1,
+// SubqueryAlias("t3", table3),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t2.id"), UnresolvedAttribute("t3.id"))),
+// JoinHint.NONE)
+// val joinPlan3 = Join(
+// joinPlan2,
+// SubqueryAlias("t4", table4),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t3.id"), UnresolvedAttribute("t4.id"))),
+// JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test multiple joins with table and subquery alias") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = table1 as t1
+// | | JOIN left = l right = r ON t1.id = t2.id
+// | [
+// | source = table2 as t2
+// | ]
+// | | JOIN left = l right = r ON t2.id = t3.id
+// | [
+// | source = table3 as t3
+// | ]
+// | | JOIN left = l right = r ON t3.id = t4.id
+// | [
+// | source = table4 as t4
+// | ]
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("table1"))
+// val table2 = UnresolvedRelation(Seq("table2"))
+// val table3 = UnresolvedRelation(Seq("table3"))
+// val table4 = UnresolvedRelation(Seq("table4"))
+// val joinPlan1 = Join(
+// SubqueryAlias("l", SubqueryAlias("t1", table1)),
+// SubqueryAlias("r", SubqueryAlias("t2", table2)),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t1.id"), UnresolvedAttribute("t2.id"))),
+// JoinHint.NONE)
+// val joinPlan2 = Join(
+// SubqueryAlias("l", joinPlan1),
+// SubqueryAlias("r", SubqueryAlias("t3", table3)),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t2.id"), UnresolvedAttribute("t3.id"))),
+// JoinHint.NONE)
+// val joinPlan3 = Join(
+// SubqueryAlias("l", joinPlan2),
+// SubqueryAlias("r", SubqueryAlias("t4", table4)),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t3.id"), UnresolvedAttribute("t4.id"))),
+// JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test multiple joins without table aliases") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = table1
+// | | JOIN ON table1.id = table2.id table2
+// | | JOIN ON table1.id = table3.id table3
+// | | JOIN ON table2.id = table4.id table4
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("table1"))
+// val table2 = UnresolvedRelation(Seq("table2"))
+// val table3 = UnresolvedRelation(Seq("table3"))
+// val table4 = UnresolvedRelation(Seq("table4"))
+// val joinPlan1 = Join(
+// table1,
+// table2,
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("table1.id"), UnresolvedAttribute("table2.id"))),
+// JoinHint.NONE)
+// val joinPlan2 = Join(
+// joinPlan1,
+// table3,
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("table1.id"), UnresolvedAttribute("table3.id"))),
+// JoinHint.NONE)
+// val joinPlan3 = Join(
+// joinPlan2,
+// table4,
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("table2.id"), UnresolvedAttribute("table4.id"))),
+// JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test multiple joins with part subquery aliases") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = table1
+// | | JOIN left = t1 right = t2 ON t1.name = t2.name table2
+// | | JOIN right = t3 ON t1.name = t3.name table3
+// | | JOIN right = t4 ON t2.name = t4.name table4
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("table1"))
+// val table2 = UnresolvedRelation(Seq("table2"))
+// val table3 = UnresolvedRelation(Seq("table3"))
+// val table4 = UnresolvedRelation(Seq("table4"))
+// val joinPlan1 = Join(
+// SubqueryAlias("t1", table1),
+// SubqueryAlias("t2", table2),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))),
+// JoinHint.NONE)
+// val joinPlan2 = Join(
+// joinPlan1,
+// SubqueryAlias("t3", table3),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t3.name"))),
+// JoinHint.NONE)
+// val joinPlan3 = Join(
+// joinPlan2,
+// SubqueryAlias("t4", table4),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t2.name"), UnresolvedAttribute("t4.name"))),
+// JoinHint.NONE)
+// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test multiple joins with self join 1") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1
+// | | JOIN left = t1 right = t2 ON t1.name = t2.name $testTable2
+// | | JOIN right = t3 ON t1.name = t3.name $testTable3
+// | | JOIN right = t4 ON t1.name = t4.name $testTable1
+// | | fields t1.name, t2.name, t3.name, t4.name
+// | """.stripMargin)
+//
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3"))
+// val joinPlan1 = Join(
+// SubqueryAlias("t1", table1),
+// SubqueryAlias("t2", table2),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))),
+// JoinHint.NONE)
+// val joinPlan2 = Join(
+// joinPlan1,
+// SubqueryAlias("t3", table3),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t3.name"))),
+// JoinHint.NONE)
+// val joinPlan3 = Join(
+// joinPlan2,
+// SubqueryAlias("t4", table1),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t4.name"))),
+// JoinHint.NONE)
+// val expectedPlan = Project(
+// Seq(
+// UnresolvedAttribute("t1.name"),
+// UnresolvedAttribute("t2.name"),
+// UnresolvedAttribute("t3.name"),
+// UnresolvedAttribute("t4.name")),
+// joinPlan3)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test multiple joins with self join 2") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1
+// | | JOIN left = t1 right = t2 ON t1.name = t2.name $testTable2
+// | | JOIN right = t3 ON t1.name = t3.name $testTable3
+// | | JOIN ON t1.name = t4.name
+// | [
+// | source = $testTable1
+// | ] as t4
+// | | fields t1.name, t2.name, t3.name, t4.name
+// | """.stripMargin)
+//
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3"))
+// val joinPlan1 = Join(
+// SubqueryAlias("t1", table1),
+// SubqueryAlias("t2", table2),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))),
+// JoinHint.NONE)
+// val joinPlan2 = Join(
+// joinPlan1,
+// SubqueryAlias("t3", table3),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t3.name"))),
+// JoinHint.NONE)
+// val joinPlan3 = Join(
+// joinPlan2,
+// SubqueryAlias("t4", table1),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t4.name"))),
+// JoinHint.NONE)
+// val expectedPlan = Project(
+// Seq(
+// UnresolvedAttribute("t1.name"),
+// UnresolvedAttribute("t2.name"),
+// UnresolvedAttribute("t3.name"),
+// UnresolvedAttribute("t4.name")),
+// joinPlan3)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
+//
+// test("test side alias will override the subquery alias") {
+// val context = new CatalystPlanContext
+// val logPlan = plan(
+// pplParser,
+// s"""
+// | source = $testTable1
+// | | JOIN left = t1 right = t2 ON t1.name = t2.name [ source = $testTable2 as ttt ] as tt
+// | | fields t1.name, t2.name
+// | """.stripMargin)
+// val logicalPlan = planTransformer.visit(logPlan, context)
+// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1"))
+// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2"))
+// val joinPlan1 = Join(
+// SubqueryAlias("t1", table1),
+// SubqueryAlias("t2", SubqueryAlias("tt", SubqueryAlias("ttt", table2))),
+// Inner,
+// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))),
+// JoinHint.NONE)
+// val expectedPlan =
+// Project(Seq(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name")), joinPlan1)
+// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false)
+// }
}
diff --git a/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala b/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala
index ad26cf21a..63c120a2c 100644
--- a/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala
+++ b/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala
@@ -168,12 +168,10 @@ trait FlintJobExecutor {
IRestHighLevelClient.recordOperationSuccess(
MetricConstants.RESULT_METADATA_WRITE_METRIC_PREFIX)
} catch {
- case t: Throwable =>
+ case e: Exception =>
IRestHighLevelClient.recordOperationFailure(
MetricConstants.RESULT_METADATA_WRITE_METRIC_PREFIX,
- t)
- // Re-throw the exception
- throw t
+ e)
}
}
@@ -452,8 +450,7 @@ trait FlintJobExecutor {
statusCode.foreach(code => errorDetails.put("StatusCode", code.toString))
val errorJson = mapper.writeValueAsString(errorDetails)
- // Record the processed error message
- throwableHandler.setError(errorJson)
+
// CustomLogging will call log4j logger.error() underneath
statusCode match {
case Some(code) =>