diff --git a/.github/workflows/snapshot-publish.yml b/.github/workflows/snapshot-publish.yml index c9a2efa7b..0ac112b62 100644 --- a/.github/workflows/snapshot-publish.yml +++ b/.github/workflows/snapshot-publish.yml @@ -27,9 +27,6 @@ jobs: distribution: 'temurin' java-version: 11 - - name: Set up SBT - uses: sbt/setup-sbt@v1 - - name: Publish to Local Maven run: | sbt standaloneCosmetic/publishM2 diff --git a/.github/workflows/test-and-build-workflow.yml b/.github/workflows/test-and-build-workflow.yml index 216f8292d..e3b2b20f4 100644 --- a/.github/workflows/test-and-build-workflow.yml +++ b/.github/workflows/test-and-build-workflow.yml @@ -22,9 +22,6 @@ jobs: distribution: 'temurin' java-version: 11 - - name: Set up SBT - uses: sbt/setup-sbt@v1 - - name: Style check run: sbt scalafmtCheckAll diff --git a/README.md b/README.md index db3790e64..2a3754e6c 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,8 @@ bin/spark-shell --packages "org.opensearch:opensearch-spark-standalone_2.12:0.7. To build and run this PPL in Spark, you can run (requires Java 11): ``` -sbt clean sparkPPLCosmetic/publishM2 + + ``` Then add org.opensearch:opensearch-spark-ppl_2.12 when run spark application, for example, diff --git a/build.sbt b/build.sbt index 365b88aa3..131fb2347 100644 --- a/build.sbt +++ b/build.sbt @@ -2,7 +2,8 @@ * Copyright OpenSearch Contributors * SPDX-License-Identifier: Apache-2.0 */ -import Dependencies.* +import Dependencies._ +import sbtassembly.AssemblyPlugin.autoImport.ShadeRule lazy val scala212 = "2.12.14" lazy val sparkVersion = "3.5.1" @@ -37,11 +38,6 @@ ThisBuild / scalastyleConfig := baseDirectory.value / "scalastyle-config.xml" */ ThisBuild / Test / parallelExecution := false -/** - * Set the parallelism of forked tests to 4 to accelerate integration test - */ -concurrentRestrictions in Global := Seq(Tags.limit(Tags.ForkedTestGroup, 4)) - // Run as part of compile task. lazy val compileScalastyle = taskKey[Unit]("compileScalastyle") @@ -194,6 +190,9 @@ lazy val pplSparkIntegration = (project in file("ppl-spark-integration")) "com.github.sbt" % "junit-interface" % "0.13.3" % "test", "org.projectlombok" % "lombok" % "1.18.30", "com.github.seancfoley" % "ipaddress" % "5.5.1", + "org.apache.commons" % "commons-lang3" % "3.17.0", + "org.apache.commons" % "commons-csv" % "1.12.0", + "com.fasterxml.jackson.core" % "jackson-annotations" % "2.14.2", ), libraryDependencies ++= deps(sparkVersion), // ANTLR settings @@ -278,29 +277,13 @@ lazy val integtest = (project in file("integ-test")) IntegrationTest / javaSource := baseDirectory.value / "src/integration/java", IntegrationTest / scalaSource := baseDirectory.value / "src/integration/scala", IntegrationTest / resourceDirectory := baseDirectory.value / "src/integration/resources", - IntegrationTest / parallelExecution := true, // enable parallel execution - IntegrationTest / testForkedParallel := false, // disable forked parallel execution to avoid duplicate spark context in the same JVM + IntegrationTest / parallelExecution := false, IntegrationTest / fork := true, - IntegrationTest / testGrouping := { - val tests = (IntegrationTest / definedTests).value - val forkOptions = ForkOptions() - val groups = tests.grouped(tests.size / 4 + 1).zipWithIndex.map { case (group, index) => - val groupName = s"group-${index + 1}" - new Tests.Group( - name = groupName, - tests = group, - runPolicy = Tests.SubProcess( - forkOptions.withRunJVMOptions(forkOptions.runJVMOptions ++ - Seq(s"-Djava.io.tmpdir=${baseDirectory.value}/integ-test/target/tmp/$groupName"))) - ) - } - groups.toSeq - } )), inConfig(AwsIntegrationTest)(Defaults.testSettings ++ Seq( AwsIntegrationTest / javaSource := baseDirectory.value / "src/aws-integration/java", AwsIntegrationTest / scalaSource := baseDirectory.value / "src/aws-integration/scala", - AwsIntegrationTest / parallelExecution := true, + AwsIntegrationTest / parallelExecution := false, AwsIntegrationTest / fork := true, )), libraryDependencies ++= Seq( diff --git a/docker/apache-spark-sample/.env b/docker/apache-spark-sample/.env deleted file mode 100644 index a047df5ba..000000000 --- a/docker/apache-spark-sample/.env +++ /dev/null @@ -1,4 +0,0 @@ -MASTER_UI_PORT=8080 -MASTER_PORT=7077 -UI_PORT=4040 -PPL_JAR=../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar diff --git a/docker/apache-spark-sample/docker-compose.yml b/docker/apache-spark-sample/docker-compose.yml deleted file mode 100644 index df2da6d52..000000000 --- a/docker/apache-spark-sample/docker-compose.yml +++ /dev/null @@ -1,41 +0,0 @@ -services: - spark: - image: bitnami/spark:3.5.3 - ports: - - "${MASTER_UI_PORT:-8080}:8080" - - "${MASTER_PORT:-7077}:7077" - - "${UI_PORT:-4040}:4040" - environment: - - SPARK_MODE=master - - SPARK_RPC_AUTHENTICATION_ENABLED=no - - SPARK_RPC_ENCRYPTION_ENABLED=no - - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no - - SPARK_SSL_ENABLED=no - - SPARK_PUBLIC_DNS=localhost - volumes: - - type: bind - source: ./spark-defaults.conf - target: /opt/bitnami/spark/conf/spark-defaults.conf - - type: bind - source: $PPL_JAR - target: /opt/bitnami/spark/jars/ppl-spark-integration.jar - - spark-worker: - image: bitnami/spark:3.5.3 - environment: - - SPARK_MODE=worker - - SPARK_MASTER_URL=spark://spark:7077 - - SPARK_WORKER_MEMORY=${WORKER_MEMORY:-1G} - - SPARK_WORKER_CORES=${WORKER_CORES:-1} - - SPARK_RPC_AUTHENTICATION_ENABLED=no - - SPARK_RPC_ENCRYPTION_ENABLED=no - - SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no - - SPARK_SSL_ENABLED=no - - SPARK_PUBLIC_DNS=localhost - volumes: - - type: bind - source: ./spark-defaults.conf - target: /opt/bitnami/spark/conf/spark-defaults.conf - - type: bind - source: $PPL_JAR - target: /opt/bitnami/spark/jars/ppl-spark-integration.jar diff --git a/docker/apache-spark-sample/spark-defaults.conf b/docker/apache-spark-sample/spark-defaults.conf deleted file mode 100644 index 47fdaae03..000000000 --- a/docker/apache-spark-sample/spark-defaults.conf +++ /dev/null @@ -1,29 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# Default system properties included when running spark-submit. -# This is useful for setting default environmental settings. - -# Example: -# spark.master spark://master:7077 -# spark.eventLog.enabled true -# spark.eventLog.dir hdfs://namenode:8021/directory -# spark.serializer org.apache.spark.serializer.KryoSerializer -# spark.driver.memory 5g -# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" -spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions -spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog diff --git a/docker/spark-emr-sample/.env b/docker/spark-emr-sample/.env deleted file mode 100644 index a717532a4..000000000 --- a/docker/spark-emr-sample/.env +++ /dev/null @@ -1 +0,0 @@ -PPL_JAR=../../ppl-spark-integration/target/scala-2.12/ppl-spark-integration-assembly-0.7.0-SNAPSHOT.jar diff --git a/docker/spark-emr-sample/docker-compose.yml b/docker/spark-emr-sample/docker-compose.yml deleted file mode 100644 index d0da9f166..000000000 --- a/docker/spark-emr-sample/docker-compose.yml +++ /dev/null @@ -1,17 +0,0 @@ -services: - spark-emr: - image: public.ecr.aws/emr-serverless/spark/emr-7.5.0:20241125 - volumes: - - type: bind - source: ./logging-conf - target: /var/loggingConfiguration/spark - - type: bind - source: ../spark-sample-app/target/scala-2.12 - target: /app - - type: bind - source: ./spark-conf - target: /etc/spark/conf - - type: bind - source: ${PPL_JAR} - target: /usr/lib/spark/jars/ppl-spark-integration.jar - command: driver --class MyApp /app/myapp_2.12-1.0.jar diff --git a/docker/spark-emr-sample/logging-conf/run-adot-collector.sh b/docker/spark-emr-sample/logging-conf/run-adot-collector.sh deleted file mode 100644 index 0873413aa..000000000 --- a/docker/spark-emr-sample/logging-conf/run-adot-collector.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -# Do nothing as default logging is sufficient diff --git a/docker/spark-emr-sample/logging-conf/run-fluentd-spark.sh b/docker/spark-emr-sample/logging-conf/run-fluentd-spark.sh deleted file mode 100644 index 0873413aa..000000000 --- a/docker/spark-emr-sample/logging-conf/run-fluentd-spark.sh +++ /dev/null @@ -1,3 +0,0 @@ -#!/bin/bash - -# Do nothing as default logging is sufficient diff --git a/docker/spark-emr-sample/spark-conf/hive-site.xml b/docker/spark-emr-sample/spark-conf/hive-site.xml deleted file mode 100644 index f0dc50e1e..000000000 --- a/docker/spark-emr-sample/spark-conf/hive-site.xml +++ /dev/null @@ -1,25 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - hive.metastore.connect.retries - 15 - - \ No newline at end of file diff --git a/docker/spark-emr-sample/spark-conf/log4j2.properties b/docker/spark-emr-sample/spark-conf/log4j2.properties deleted file mode 100644 index 27ff7047f..000000000 --- a/docker/spark-emr-sample/spark-conf/log4j2.properties +++ /dev/null @@ -1,74 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -# This property will be overridden for JVMs running inside YARN containers. -# Other log4j configurations may reference the property, for example, in order to -# cause a log file to appear in the usual log directory for the YARN container, -# so that LogPusher will upload it to S3. The following provides a default value -# to be used for this property such that logs are still written to a valid location -# even for Spark processes run *outside* of a YARN container (e.g., a Spark -# driver run in client deploy-mode). -spark.yarn.app.container.log.dir=/var/log/spark/user/${user.name} - -# Set everything to be logged to the console -rootLogger.level = info -rootLogger.appenderRef.stdout.ref = console - -appender.console.type = Console -appender.console.name = console -appender.console.target = SYSTEM_ERR -appender.console.layout.type = PatternLayout -appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n - -# Set the default spark-shell/spark-sql log level to WARN. When running the -# spark-shell/spark-sql, the log level for these classes is used to overwrite -# the root logger's log level, so that the user can have different defaults -# for the shell and regular Spark apps. -logger.repl.name = org.apache.spark.repl.Main -logger.repl.level = warn - -logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver -logger.thriftserver.level = warn - -# Settings to quiet third party logs that are too verbose -logger.jetty1.name = org.sparkproject.jetty -logger.jetty1.level = warn -logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle -logger.jetty2.level = error -logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper -logger.replexprTyper.level = info -logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter -logger.replSparkILoopInterpreter.level = info -logger.parquet1.name = org.apache.parquet -logger.parquet1.level = error -logger.parquet2.name = parquet -logger.parquet2.level = error -logger.hudi.name = org.apache.hudi -logger.hudi.level = warn - -# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support -logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler -logger.RetryingHMSHandler.level = fatal -logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry -logger.FunctionRegistry.level = error - -# For deploying Spark ThriftServer -# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 -appender.console.filter.1.type = RegexFilter -appender.console.filter.1.regex = .*Thrift error occurred during processing of message.* -appender.console.filter.1.onMatch = deny -appender.console.filter.1.onMismatch = neutral \ No newline at end of file diff --git a/docker/spark-emr-sample/spark-conf/metrics.properties b/docker/spark-emr-sample/spark-conf/metrics.properties deleted file mode 100644 index e69de29bb..000000000 diff --git a/docker/spark-emr-sample/spark-conf/spark-defaults.conf b/docker/spark-emr-sample/spark-conf/spark-defaults.conf deleted file mode 100644 index 0a5dabe7d..000000000 --- a/docker/spark-emr-sample/spark-conf/spark-defaults.conf +++ /dev/null @@ -1,65 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -spark.driver.extraClassPath /usr/lib/livy/rsc-jars/*:/usr/lib/livy/repl_2.12-jars/*:/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/goodies/lib/emr-serverless-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/usr/share/aws/redshift/jdbc/RedshiftJDBC.jar:/usr/share/aws/redshift/spark-redshift/lib/*:/usr/share/aws/iceberg/lib/iceberg-emr-common.jar:/usr/share/aws/iceberg/lib/iceberg-spark3-runtime.jar -spark.driver.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native -spark.executor.extraClassPath /usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/goodies/lib/emr-serverless-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/docker/usr/lib/hadoop-lzo/lib/*:/docker/usr/lib/hadoop/hadoop-aws.jar:/docker/usr/share/aws/aws-java-sdk/*:/docker/usr/share/aws/emr/emrfs/conf:/docker/usr/share/aws/emr/emrfs/lib/*:/docker/usr/share/aws/emr/emrfs/auxlib/*:/docker/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/docker/usr/share/aws/emr/security/conf:/docker/usr/share/aws/emr/security/lib/*:/docker/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/docker/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/docker/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/docker/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/usr/share/aws/redshift/jdbc/RedshiftJDBC.jar:/usr/share/aws/redshift/spark-redshift/lib/*:/usr/share/aws/iceberg/lib/iceberg-emr-common.jar:/usr/share/aws/iceberg/lib/iceberg-spark3-runtime.jar -spark.executor.extraLibraryPath /usr/lib/hadoop/lib/native:/usr/lib/hadoop-lzo/lib/native:/docker/usr/lib/hadoop/lib/native:/docker/usr/lib/hadoop-lzo/lib/native -spark.eventLog.enabled true -spark.eventLog.dir file:///var/log/spark/apps -spark.history.fs.logDirectory file:///var/log/spark/apps -spark.history.ui.port 18080 -spark.blacklist.decommissioning.enabled true -spark.blacklist.decommissioning.timeout 1h -spark.resourceManager.cleanupExpiredHost true -spark.stage.attempt.ignoreOnDecommissionFetchFailure true -spark.decommissioning.timeout.threshold 20 -spark.files.fetchFailure.unRegisterOutputOnHost true -spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version.emr_internal_use_only.EmrFileSystem 2 -spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored.emr_internal_use_only.EmrFileSystem true -spark.hadoop.fs.s3.getObject.initialSocketTimeoutMilliseconds 2000 -spark.sql.parquet.output.committer.class com.amazon.emr.committer.EmrOptimizedSparkSqlParquetOutputCommitter -spark.sql.parquet.fs.optimized.committer.optimization-enabled true -spark.sql.emr.internal.extensions com.amazonaws.emr.spark.EmrSparkSessionExtensions -spark.executor.memory 14G -spark.executor.cores 4 -spark.driver.memory 14G -spark.driver.cores 4 -spark.executor.defaultJavaOptions -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:+UseParallelGC -XX:InitiatingHeapOccupancyPercent=70 -XX:OnOutOfMemoryError='kill -9 %p' -spark.driver.defaultJavaOptions -XX:OnOutOfMemoryError='kill -9 %p' -spark.hadoop.mapreduce.output.fs.optimized.committer.enabled true - -spark.master custom:emr-serverless -spark.submit.deployMode client -spark.submit.customResourceManager.submit.class org.apache.spark.deploy.emrserverless.submit.EmrServerlessClientApplication -spark.hadoop.fs.defaultFS file:/// -spark.dynamicAllocation.enabled true -spark.dynamicAllocation.shuffleTracking.enabled true -spark.hadoop.fs.s3.customAWSCredentialsProvider com.amazonaws.auth.DefaultAWSCredentialsProviderChain -spark.authenticate true -spark.ui.enabled false -spark.ui.custom.executor.log.url /logs/{{CONTAINER_ID}}/{{FILE_NAME}}.gz - -spark.emr-serverless.client.create.batch.size 100 -spark.emr-serverless.client.describe.batch.size 100 -spark.emr-serverless.client.release.batch.size 100 -spark.dynamicAllocation.initialExecutors 3 -spark.dynamicAllocation.minExecutors 0 -spark.executor.instances 3 -spark.hadoop.fs.s3a.aws.credentials.provider software.amazon.awssdk.auth.credentials.DefaultCredentialsProvider -spark.sql.hive.metastore.sharedPrefixes software.amazon.awssdk.services.dynamodb -spark.sql.legacy.createHiveTableByDefault false -spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions -spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog diff --git a/docker/spark-emr-sample/spark-conf/spark-env.sh b/docker/spark-emr-sample/spark-conf/spark-env.sh deleted file mode 100644 index a40f294b6..000000000 --- a/docker/spark-emr-sample/spark-conf/spark-env.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env bash -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -export SPARK_HOME=${SPARK_HOME:-/usr/lib/spark} -export SPARK_LOG_DIR=${SPARK_LOG_DIR:-/var/log/spark} -export HADOOP_HOME=${HADOOP_HOME:-/usr/lib/hadoop} -export HADOOP_CONF_DIR=${HADOOP_CONF_DIR:-/etc/hadoop/conf} -export HIVE_CONF_DIR=${HIVE_CONF_DIR:-/etc/hive/conf} - -export SPARK_MASTER_PORT=7077 -export SPARK_MASTER_IP=$STANDALONE_SPARK_MASTER_HOST -export SPARK_MASTER_WEBUI_PORT=8080 - -export SPARK_WORKER_DIR=${SPARK_WORKER_DIR:-/var/run/spark/work} -export SPARK_WORKER_PORT=7078 -export SPARK_WORKER_WEBUI_PORT=8081 - -export HIVE_SERVER2_THRIFT_BIND_HOST=0.0.0.0 -export HIVE_SERVER2_THRIFT_PORT=10001 - - -export SPARK_DAEMON_JAVA_OPTS="$SPARK_DAEMON_JAVA_OPTS -XX:OnOutOfMemoryError='kill -9 %p'" -export PYSPARK_PYTHON=${PYSPARK_PYTHON:-/usr/bin/python3} -export PYSPARK_DRIVER_PYTHON=${PYSPARK_DRIVER_PYTHON:-/usr/bin/python3} - -export AWS_STS_REGIONAL_ENDPOINTS=regional diff --git a/docker/spark-sample-app/build.sbt b/docker/spark-sample-app/build.sbt deleted file mode 100644 index ea49bfd20..000000000 --- a/docker/spark-sample-app/build.sbt +++ /dev/null @@ -1,8 +0,0 @@ -name := "MyApp" - -version := "1.0" - -scalaVersion := "2.12.20" - -libraryDependencies += "org.apache.spark" %% "spark-sql" % "3.5.3" - diff --git a/docker/spark-sample-app/src/main/scala/MyApp.scala b/docker/spark-sample-app/src/main/scala/MyApp.scala deleted file mode 100644 index 6e2171c41..000000000 --- a/docker/spark-sample-app/src/main/scala/MyApp.scala +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -import org.apache.spark.sql.SparkSession - -object MyApp { - def main(args: Array[String]): Unit = { - var spark = SparkSession.builder() - .master("local[1]") - .appName("MyApp") - .getOrCreate(); - - println("APP Name :" + spark.sparkContext.appName); - println("Deploy Mode :" + spark.sparkContext.deployMode); - println("Master :" + spark.sparkContext.master); - - spark.sql("CREATE table foo (id int, name varchar(100))").show() - println(">>> Table created") - spark.sql("SELECT * FROM foo").show() - println(">>> SQL query of table completed") - - spark.sql("source=foo | fields id").show() - println(">>> PPL query of table completed") - } -} diff --git a/docs/index.md b/docs/index.md index abc801bde..82c147de2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -546,7 +546,6 @@ In the index mapping, the `_meta` and `properties`field stores meta and schema i - `spark.flint.index.checkpointLocation.rootDir`: default is None. Flint will create a default checkpoint location in format of '//' to isolate checkpoint data. - `spark.flint.index.checkpoint.mandatory`: default is true. - `spark.datasource.flint.socket_timeout_millis`: default value is 60000. -- `spark.datasource.flint.request.completionDelayMillis`: Time to wait in milliseconds after request is complete. Applied after index creation. Default value is 2000 if using aoss service, otherwise 0. - `spark.flint.monitor.initialDelaySeconds`: Initial delay in seconds before starting the monitoring task. Default value is 15. - `spark.flint.monitor.intervalSeconds`: Interval in seconds for scheduling the monitoring task. Default value is 60. - `spark.flint.monitor.maxErrorCount`: Maximum number of consecutive errors allowed before stopping the monitoring task. Default value is 5. @@ -578,10 +577,6 @@ The following table define the data type mapping between Flint data type and Spa * Spark data types VarcharType(length) and CharType(length) are both currently mapped to Flint data type *keyword*, dropping their length property. On the other hand, Flint data type *keyword* only maps to StringType. -* Spark data type MapType is mapped to an empty OpenSearch object. The inner fields then rely on - dynamic mapping. On the other hand, Flint data type *object* only maps to StructType. -* Spark data type DecimalType is mapped to an OpenSearch double. On the other hand, Flint data type - *double* only maps to DoubleType. Unsupported Spark data types: * DecimalType diff --git a/docs/load_geoip_data.scala b/docs/load_geoip_data.scala deleted file mode 100644 index 1540dbfb1..000000000 --- a/docs/load_geoip_data.scala +++ /dev/null @@ -1,440 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -import java.io.BufferedReader -import java.io.FileReader -import java.io.PrintStream -import java.math.BigInteger -import scala.collection.mutable.ListBuffer - -var ipv4NodeCount = 0 -var ipv6NodeCount = 0 -var ipv4NodeOutputCount = 0 -var ipv6NodeOutputCount = 0 - -/* Create a binary tree based on the bits of the start IP address of the subnets. Only use the - first bits needed for the netmask. For example with a subnet of "192.168.2.0/24", only use the - first 24 bits. - - If a node for a subnet has children, then there is an overlap that must be corrected. To correct - an overlap, make sure that both children of the node exist and remove the subnet for the current - node. Finally check the child nodes for overlapping subnets and continue. - */ -class TreeNode(var ipAddressBytes: Array[Byte], var netmask: Int, var isIPv4: Boolean, var lineRemainder: String) { - var falseChild: TreeNode = null - var trueChild: TreeNode = null - - def maxNetmask: Integer = if (isIPv4) 32 else 128 - - // Add a new node to the tree in the correct position - def addNode(nodeToAdd: TreeNode): Unit = { - if (netmask >= nodeToAdd.netmask || netmask == maxNetmask) { - return - } - - var byteIndex = netmask / 8 - var bitValue = (nodeToAdd.ipAddressBytes(byteIndex) & (1 << (7 - (netmask % 8)))) > 0 - - if (netmask + 1 == nodeToAdd.netmask) { - if (bitValue) { - trueChild = nodeToAdd - } else { - falseChild = nodeToAdd - } - } else { - var nextChild: TreeNode = null - if (bitValue) { - nextChild = trueChild - if (trueChild == null) { - nextChild = new TreeNode(null, netmask + 1, isIPv4, null) - trueChild = nextChild - } - } else { - nextChild = falseChild - if (falseChild == null) { - nextChild = new TreeNode(null, netmask + 1, isIPv4, null) - falseChild = nextChild - } - } - - nextChild.addNode(nodeToAdd) - } - - return - } - - def haveOverlap(): Boolean = falseChild != null || trueChild != null - - // Convert the IP address to a string. For IPv6, this is more complicated, since it may - // need to be reduced. - def ipAddressString(): String = { - if (isIPv4) { - return ipAddressBytes.map(v => 255 & v).mkString(".") - } else { - var allZeroes = true - for (b <- ipAddressBytes) { - if (b != 0) { - allZeroes = false - } - } - - if (allZeroes) { - return "::" - } - - var zeroes: ListBuffer[(Int, Int)] = ListBuffer() - var zeroesStart = -1 - var zeroesStartIndex = -1 - for (i <- 0 to 7) { - if (ipAddressBytes(i * 2) == 0 && ipAddressBytes(i * 2 + 1) == 0) { - if (zeroesStart == -1) { - zeroesStart = i - zeroesStartIndex = zeroes.length - zeroes = zeroes :+ (i, 1) - } else { - var existingTuple = zeroes(zeroesStartIndex) - zeroes.update(zeroesStartIndex, (existingTuple._1, 1 + existingTuple._2)) - } - } else { - zeroesStart = -1 - zeroesStartIndex = -1 - } - } - - var longestZeroesIndex = -1 - var longestZeroesLength = 0 - for (v <- zeroes) { - if (v._2 >= longestZeroesLength) { - longestZeroesLength = v._2 - longestZeroesIndex = v._1 - } - } - - var fullIpAddress: Array[String] = Array.fill(8){null} - for (i <- 0 to 7) { - var strValue = (((255 & ipAddressBytes(i * 2)) << 8) + (255 & ipAddressBytes(i * 2 + 1))).toHexString - fullIpAddress(i) = strValue - } - - if (longestZeroesIndex == -1) { - return fullIpAddress.mkString(":") - } else { - var ipPartsStart = fullIpAddress.slice(0, longestZeroesIndex) - var ipPartsEnd = fullIpAddress.slice(longestZeroesIndex + longestZeroesLength, 8) - return ipPartsStart.mkString(":") + "::" + ipPartsEnd.mkString(":") - } - } - } - - def getStart(): BigInteger = new BigInteger(ipAddressBytes) - - def getEnd(): BigInteger = { - var valueToAdd = new BigInteger(Array.fill(maxNetmask / 8){0.toByte}) - if (netmask < maxNetmask) { - valueToAdd = valueToAdd.flipBit(maxNetmask - netmask) - valueToAdd = valueToAdd.subtract(new BigInteger("1")) - } - return getStart().add(valueToAdd) - } - - def valueToByteArray(value: BigInteger): Array[Byte] = { - var fullArray = Array.fill(maxNetmask / 8){0.toByte} - var valueArray = value.toByteArray() - valueArray.copyToArray(fullArray, (maxNetmask / 8) - valueArray.length, valueArray.length) - return fullArray - } - - def incrementNodeCount(): Unit = { - if (isIPv4) { - ipv4NodeCount += ipv4NodeCount - } else { - ipv6NodeCount += ipv6NodeCount - } - } - - // Split a node. Make sure that both children exist and remove the subnet for the current node. - def split(): Unit = { - if (ipAddressBytes == null) { - return - } - - var ipAddressStr = ipAddressString() - println(s">>> Splitting IP: $ipAddressStr") - - if (falseChild == null) { - falseChild = new TreeNode(ipAddressBytes, netmask + 1, isIPv4, lineRemainder) - } else if (falseChild.ipAddressBytes == null) { - falseChild.ipAddressBytes = ipAddressBytes - falseChild.lineRemainder = lineRemainder - } - - if (trueChild == null) { - var valueStart = falseChild.getEnd().add(new BigInteger("1")) - var startArray = valueToByteArray(valueStart) - trueChild = new TreeNode(startArray, netmask + 1, isIPv4, lineRemainder) - } else if (trueChild.ipAddressBytes == null) { - var valueStart = falseChild.getEnd().add(new BigInteger("1")) - var startArray = valueToByteArray(valueStart) - trueChild.ipAddressBytes = startArray - trueChild.lineRemainder = lineRemainder - } - - ipAddressBytes = null - lineRemainder = null - - return - } - - def fixTree(): Unit = { - if (haveOverlap()) { - split() - } - - if (falseChild != null) { - falseChild.fixTree() - } - - if (trueChild != null) { - trueChild.fixTree() - } - } - - def printTree(outStream: PrintStream, tenPercentCount: Int): Unit = { - if (ipAddressBytes != null) { - outStream.print(ipAddressString()) - outStream.print("/") - outStream.print(netmask.toString) - outStream.print(",") - outStream.print(lineRemainder) - outStream.print(",") - outStream.print(getStart().toString()) - outStream.print(",") - outStream.print(getEnd().toString()) - outStream.print(",") - outStream.println(isIPv4.toString) - - var currentNodeCount = if (isIPv4) ipv4NodeOutputCount else ipv6NodeOutputCount - if (currentNodeCount % tenPercentCount == 0) { - print((currentNodeCount * 10 / tenPercentCount).toString + "%..") - } - - if (isIPv4) { - ipv4NodeOutputCount += 1 - } else { - ipv6NodeOutputCount += 1 - } - } - - if (falseChild != null) { - falseChild.printTree(outStream, tenPercentCount) - } - if (trueChild != null) { - trueChild.printTree(outStream, tenPercentCount) - } - } -} - -// Create a node for an IPv4 entry -def createIPv4TreeNode(fullLine: String): TreeNode = { - var charIndex = fullLine.indexOf(",") - var subnet = fullLine.substring(0, charIndex) - var lineRemainder = fullLine.substring(charIndex + 1) - - charIndex = subnet.indexOf("/") - var ipAddressStr = subnet.substring(0, charIndex) - var netmask = subnet.substring(charIndex + 1).toInt - - var addrParts = ipAddressStr.split("\\.") - var bytes = Array[Byte]( - addrParts(0).toInt.toByte, - addrParts(1).toInt.toByte, - addrParts(2).toInt.toByte, - addrParts(3).toInt.toByte - ) - - return new TreeNode(bytes, netmask, true, lineRemainder) -} - -// Create a node for an IPv6 entry -def createIPv6TreeNode(fullLine: String): TreeNode = { - var charIndex = fullLine.indexOf(",") - var subnet = fullLine.substring(0, charIndex) - var lineRemainder = fullLine.substring(charIndex + 1) - - charIndex = subnet.indexOf("/") - var ipAddressStr = subnet.substring(0, charIndex) - var netmask = subnet.substring(charIndex + 1).toInt - - var bytes: Array[Byte] = null - charIndex = ipAddressStr.indexOf("::") - - if (charIndex == -1) { - var values = ipAddressStr.split(":").map(x => Integer.parseInt(x, 16)) - bytes = Array.fill(16){0.toByte} - for (i <- 0 to 7) { - bytes(i * 2) = (values(i) >> 8).toByte - bytes(i * 2 + 1) = (values(i) & 255).toByte - } - } else if ("::" == ipAddressStr) { - bytes = Array.fill(16){0.toByte} - } else { - if (charIndex == 0) { - var values = ipAddressStr.substring(2).split(":").map(x => Integer.parseInt(x, 16)) - bytes = Array.fill(16){0.toByte} - for (i <- 8 - values.length to 7) { - var valuesIndex = i - 8 + values.length - bytes(i * 2) = (values(valuesIndex) >> 8).toByte - bytes(i * 2 + 1) = (values(valuesIndex) & 255).toByte - } - } else if (charIndex == ipAddressStr.length - 2) { - var values = ipAddressStr.substring(0, ipAddressStr.length - 2).split(":").map(x => Integer.parseInt(x, 16)) - bytes = Array.fill(16){0.toByte} - for (i <- 0 to values.length - 1) { - bytes(i * 2) = (values(i) >> 8).toByte - bytes(i * 2 + 1) = (values(i) & 255).toByte - } - } else { - var startValues = ipAddressStr.substring(0, charIndex).split(":").map(x => Integer.parseInt(x, 16)) - var endValues = ipAddressStr.substring(charIndex + 2).split(":").map(x => Integer.parseInt(x, 16)) - bytes = Array.fill(16){0.toByte} - for (i <- 0 to startValues.length - 1) { - bytes(i * 2) = (startValues(i) >> 8).toByte - bytes(i * 2 + 1) = (startValues(i) & 255).toByte - } - for (i <- 8 - endValues.length to 7) { - var valuesIndex = i - 8 + endValues.length - bytes(i * 2) = (endValues(valuesIndex) >> 8).toByte - bytes(i * 2 + 1) = (endValues(valuesIndex) & 255).toByte - } - } - } - - return new TreeNode(bytes, netmask, false, lineRemainder) -} - -def createTreeNode(fullLine: String): TreeNode = { - var charIndex = fullLine.indexOf(",") - var subnet = fullLine.substring(0, charIndex) - if (subnet.indexOf(':') > -1) { - return createIPv6TreeNode(fullLine) - } else { - return createIPv4TreeNode(fullLine) - } -} - -var header: String = null -def readSubnets(fileName: String, ipv4Root: TreeNode, ipv6Root: TreeNode): Unit = { - var reader = new BufferedReader(new FileReader(fileName)) - header = reader.readLine() - - var line = reader.readLine() - while (line != null) { - var newNode = createTreeNode(line) - if (newNode.isIPv4) { - ipv4Root.addNode(newNode) - ipv4NodeCount += 1 - } else { - ipv6Root.addNode(newNode) - ipv6NodeCount += 1 - } - - line = reader.readLine() - } - - reader.close() -} - -def writeSubnets(fileName: String, ipv4Root: TreeNode, ipv6Root: TreeNode): Unit = { - var outStream = new PrintStream(fileName) - outStream.print(header) - outStream.print(",ip_range_start,ip_range_end,ipv4") - outStream.print("\r\n") - - println("Writing IPv4 data") - ipv4NodeOutputCount = 0 - ipv4Root.printTree(outStream, (ipv4NodeCount / 10).floor.toInt) - println() - - println("Writing IPv6 data") - ipv6NodeOutputCount = 0 - ipv6Root.printTree(outStream, (ipv6NodeCount / 10).floor.toInt) - println() - - outStream.close() -} - -// Create the table in Spark -def createTable(fileName: String, tableName: String): Unit = { - try { - var sparkSessionClass = Class.forName("org.apache.spark.sql.SparkSession") - var activeSessionMethod = sparkSessionClass.getMethod("active") - var sparkSession = activeSessionMethod.invoke(sparkSessionClass) - - var readMethod = sparkSessionClass.getMethod("read") - var dataFrameReader = readMethod.invoke(sparkSession) - - var dataFrameReaderClass = Class.forName("org.apache.spark.sql.DataFrameReader") - var formatMethod = dataFrameReaderClass.getMethod("format", classOf[java.lang.String]) - dataFrameReader = formatMethod.invoke(dataFrameReader, "csv") - - var optionMethod = dataFrameReaderClass.getMethod("option", classOf[java.lang.String], classOf[java.lang.String]) - dataFrameReader = optionMethod.invoke(dataFrameReader, "inferSchema", "true") - dataFrameReader = optionMethod.invoke(dataFrameReader, "header", "true") - - var loadMethod = dataFrameReaderClass.getMethod("load", classOf[java.lang.String]) - var dataset = loadMethod.invoke(dataFrameReader, fileName) - - var datasetClass = Class.forName("org.apache.spark.sql.Dataset") - var writeMethod = datasetClass.getMethod("write") - var dataFrameWriter = writeMethod.invoke(dataset) - - var dataFrameWriterClass = Class.forName("org.apache.spark.sql.DataFrameWriter") - var saveAsTableMethod = dataFrameWriterClass.getMethod("saveAsTable", classOf[java.lang.String]) - saveAsTableMethod.invoke(dataFrameWriter, tableName) - } catch { - case e: Exception => { - println("Unable to load data into table") - e.printStackTrace() - } - } -} - -// Sanitize the data and import it into a Spark table -def cleanAndImport(inputFile: String, outputFile: String, tableName: String): Unit = { - if (tableName != null) { - try { - Class.forName("org.apache.spark.sql.SparkSession") - } catch { - case e: ClassNotFoundException => { - println("Must run in Spark CLI to create the Spark table") - return - } - } - } - - println("Loading data") - var ipv4Root = new TreeNode(null, 0, true, null) - var ipv6Root = new TreeNode(null, 0, false, null) - readSubnets(inputFile, ipv4Root, ipv6Root) - - println("Fixing overlapping subnets") - ipv4Root.fixTree() - ipv6Root.fixTree() - - println("Writing data to file") - writeSubnets(outputFile, ipv4Root, ipv6Root) - - if (tableName != null) { - println("Creating and populating Spark table") - createTable(outputFile, tableName) - } - - println("Done") -} - -var FILE_PATH_TO_INPUT_CSV: String = "/replace/this/value" -var FILE_PATH_TO_OUTPUT_CSV: String = "/replace/this/value" -var TABLE_NAME: String = null -var result = cleanAndImport(FILE_PATH_TO_INPUT_CSV, FILE_PATH_TO_OUTPUT_CSV, TABLE_NAME) diff --git a/docs/opensearch-geoip.md b/docs/opensearch-geoip.md deleted file mode 100644 index cd262e187..000000000 --- a/docs/opensearch-geoip.md +++ /dev/null @@ -1,90 +0,0 @@ -# OpenSearch Geographic IP Location Data - -## Overview - -OpenSearch has PPL functions for looking up the geographic location of IP addresses. In order -to use these functions, a table needs to be created containing the geographic location -information. - -## How to Create Geographic Location Index - -A script has been created that can cleanup and augment a CSV file that contains geographic -location information for IP addresses ranges. The CSV file is expected to have the following -columns: - -| Column Name | Description | -|------------------|---------------------------------------------------------------------------------------------------------| -| cidr | IP address subnet in format `IP_ADDRESS/NETMASK` (ex. `192.168.0.0/24`). IP address can be IPv4 or IPv6 | -| country_iso_code | ISO code of the country where the IP address subnet is located | -| country_name | Name of the country where the IP address subnet is located | -| continent_name | Name of the continent where the IP address subent is located | -| region_iso_code | ISO code of the region where the IP address subnet is located | -| region_name | Name of the region where the IP address subnet is located | -| city_name | Name of the city where the IP address subnet is located | -| time_zone | Time zone where the IP address subnet is located | -| location | Latitude and longitude where the IP address subnet is located | - -The script will cleanup the data by splitting IP address subnets so that an IP address can only be in at most one subnet. - -The data is augmented by adding 3 fields. - -| Column Name | Description | -|----------------|--------------------------------------------------------------------| -| ip_range_start | An integer value used to determine if an IP address is in a subnet | -| ip_range_end | An integer value used to determine if an IP address is in a subnet | -| ipv4 | A boolean value, `true` if the IP address subnet is in IPv4 format | - -## Run the Script - -1. Create a copy of the scala file `load_geoip_data.scala` -2. Edit the copy of the file `load_geoip_data.scala` - There are three variables that need to be updated. - 1. `FILE_PATH_TO_INPUT_CSV` - the full path to the CSV file to load - 2. `FILE_PATH_TO_OUTPUT_CSV` - the full path of the CSV file to write the sanitized data to - 3. `TABLE_NAME` - name of the index to create in OpenSearch. No table is created if this is null -4. Save the file -5. Run the Apache Spark CLI and connect to the database -6. Load the Scala script - ```scala - :load FILENAME - ``` - Replace `FILENAME` with the full path to the Scala script. - -## Notes for EMR - -With EMR it is necessary to load the data from an S3 object. Follow the instructions for -**Run the Script**, but make sure that `TABLE_NAME` is set to `null`. Upload the -`FILE_PATH_TO_OUTPUT_CSV` to S3. - -## End-to-End - -How to download a sample data GeoIP location data set, clean it up and import it into a -Spark table. - -1. Use a web browser to download the [data set Zip file](https://geoip.maps.opensearch.org/v1/geolite2-city/data/geolite2-city_1732905911000.zip) -2. Extract the Zip file -3. Copy the file `geolite2-City.csv` to the computer where you run `spark-shell` -4. Copy the file file `load_geoip_data.scala` to the computer where you run `spark-shell` -5. Connect to the computer where you run `spark-shell` -6. Change to the directory containing `geolite2-City.csv` and `load_geoip_data.scala` -7. Update the `load_geoip_data.scala` file to specify the CSV files to read and write. Also update - it to specify the Spark table to create (`geo_ip_data` in this case). - ``` - sed -i \ - -e "s#^var FILE_PATH_TO_INPUT_CSV: String =.*#var FILE_PATH_TO_INPUT_CSV: String = \"${PWD}/geolite2-City.csv\"#" \ - load_geoip_data.scala - sed -i \ - -e "s#^var FILE_PATH_TO_OUTPUT_CSV: String = .*#var FILE_PATH_TO_OUTPUT_CSV: String = \"${PWD}/geolite2-City-fixed.csv\"#" \ - load_geoip_data.scala - sed -i \ - -e 's#^var TABLE_NAME: String = .*#var TABLE_NAME: String = "geo_ip_data"#' \ - load_geoip_data.scala - ``` -8. Run `spark-shell` - ``` - spark-shell - ``` -9. Load and run the `load_geoip_data.scala` script - ``` - :load load_geoip_data.scala - ``` diff --git a/docs/ppl-lang/PPL-Example-Commands.md b/docs/ppl-lang/PPL-Example-Commands.md index 5a61992de..7766c3b50 100644 --- a/docs/ppl-lang/PPL-Example-Commands.md +++ b/docs/ppl-lang/PPL-Example-Commands.md @@ -118,7 +118,6 @@ Assumptions: `a`, `b`, `c` are existing fields in `table` - `source = table | eval r = coalesce(a, b, c) | fields r` - `source = table | eval e = isempty(a) | fields e` - `source = table | eval e = isblank(a) | fields e` -- `source = table | eval e = cast(a as timestamp) | fields e` - `source = table | eval f = case(a = 0, 'zero', a = 1, 'one', a = 2, 'two', a = 3, 'three', a = 4, 'four', a = 5, 'five', a = 6, 'six', a = 7, 'se7en', a = 8, 'eight', a = 9, 'nine')` - `source = table | eval f = case(a = 0, 'zero', a = 1, 'one' else 'unknown')` - `source = table | eval f = case(a = 0, 'zero', a = 1, 'one' else concat(a, ' is an incorrect binary digit'))` @@ -141,7 +140,6 @@ Assumptions: `a`, `b`, `c`, `d`, `e` are existing fields in `table` Assumptions: `bridges`, `coor` are existing fields in `table`, and the field's types are `struct` or `array>` - `source = table | flatten bridges` - `source = table | flatten coor` -- `source = table | flatten coor as (altitude, latitude, longitude)` - `source = table | flatten bridges | flatten coor` - `source = table | fields bridges | flatten bridges` - `source = table | fields country, bridges | flatten bridges | fields country, length | stats avg(length) as avg by country` @@ -487,11 +485,4 @@ _- **Limitation: another command usage of (relation) subquery is in `appendcols` > ppl-correlation-command is an experimental command - it may be removed in future versions -#### **Cast** -[See additional command details](functions/ppl-conversion.md) -- `source = table | eval int_to_string = cast(1 as string) | fields int_to_string` -- `source = table | eval int_to_string = cast(int_col as string), string_to_int = cast(string_col as integer) | fields int_to_string, string_to_int` -- `source = table | eval cdate = CAST('2012-08-07' as date), ctime = cast('2012-08-07T08:07:06' as timestamp) | fields cdate, ctime` -- `source = table | eval chained_cast = cast(cast("true" as boolean) as integer) | fields chained_cast` - --- diff --git a/docs/ppl-lang/functions/ppl-conversion.md b/docs/ppl-lang/functions/ppl-conversion.md index 7d3535936..48e4106ca 100644 --- a/docs/ppl-lang/functions/ppl-conversion.md +++ b/docs/ppl-lang/functions/ppl-conversion.md @@ -7,21 +7,22 @@ `cast(expr as dateType)` cast the expr to dataType. return the value of dataType. The following conversion rules are used: ``` -+------------+--------+--------+---------+-------------+--------+ -| Src/Target | STRING | NUMBER | BOOLEAN | TIMESTAMP | DATE | -+------------+--------+--------+---------+-------------+--------+ -| STRING | | Note1 | Note1 | TIMESTAMP() | DATE() | -+------------+--------+--------+---------+-------------+--------+ -| NUMBER | Note1 | | v!=0 | N/A | N/A | -+------------+--------+--------+---------+-------------+--------+ -| BOOLEAN | Note1 | v?1:0 | | N/A | N/A | -+------------+--------+--------+---------+-------------+--------+ -| TIMESTAMP | Note1 | N/A | N/A | | DATE() | -+------------+--------+--------+---------+-------------+--------+ -| DATE | Note1 | N/A | N/A | N/A | | -+------------+--------+--------+---------+-------------+--------+ ++------------+--------+--------+---------+-------------+--------+--------+ +| Src/Target | STRING | NUMBER | BOOLEAN | TIMESTAMP | DATE | TIME | ++------------+--------+--------+---------+-------------+--------+--------+ +| STRING | | Note1 | Note1 | TIMESTAMP() | DATE() | TIME() | ++------------+--------+--------+---------+-------------+--------+--------+ +| NUMBER | Note1 | | v!=0 | N/A | N/A | N/A | ++------------+--------+--------+---------+-------------+--------+--------+ +| BOOLEAN | Note1 | v?1:0 | | N/A | N/A | N/A | ++------------+--------+--------+---------+-------------+--------+--------+ +| TIMESTAMP | Note1 | N/A | N/A | | DATE() | TIME() | ++------------+--------+--------+---------+-------------+--------+--------+ +| DATE | Note1 | N/A | N/A | N/A | | N/A | ++------------+--------+--------+---------+-------------+--------+--------+ +| TIME | Note1 | N/A | N/A | N/A | N/A | | ++------------+--------+--------+---------+-------------+--------+--------+ ``` -- `NUMBER` includes `INTEGER`, `LONG`, `FLOAT`, `DOUBLE`. Cast to **string** example: @@ -35,7 +36,7 @@ Cast to **string** example: Cast to **number** example: - os> source=people | eval `cbool` = CAST(true as integer), `cstring` = CAST('1' as integer) | fields `cbool`, `cstring` + os> source=people | eval `cbool` = CAST(true as int), `cstring` = CAST('1' as int) | fields `cbool`, `cstring` fetched rows / total rows = 1/1 +---------+-----------+ | cbool | cstring | @@ -45,13 +46,13 @@ Cast to **number** example: Cast to **date** example: - os> source=people | eval `cdate` = CAST('2012-08-07' as date), `ctimestamp` = CAST('2012-08-07 01:01:01' as timestamp) | fields `cdate`, `ctimestamp` + os> source=people | eval `cdate` = CAST('2012-08-07' as date), `ctime` = CAST('01:01:01' as time), `ctimestamp` = CAST('2012-08-07 01:01:01' as timestamp) | fields `cdate`, `ctime`, `ctimestamp` fetched rows / total rows = 1/1 - +------------+---------------------+ - | cdate | ctimestamp | - |------------+---------------------| - | 2012-08-07 | 2012-08-07 01:01:01 | - +------------+---------------------+ + +------------+----------+---------------------+ + | cdate | ctime | ctimestamp | + |------------+----------+---------------------| + | 2012-08-07 | 01:01:01 | 2012-08-07 01:01:01 | + +------------+----------+---------------------+ Cast function can be **chained**: diff --git a/docs/ppl-lang/planning/ppl-geoip.md b/docs/ppl-lang/planning/ppl-geoip.md new file mode 100644 index 000000000..f6bef8f34 --- /dev/null +++ b/docs/ppl-lang/planning/ppl-geoip.md @@ -0,0 +1,39 @@ +## geoip syntax proposal + +geoip function to add information about the geographical location of an IPv4 or IPv6 address + +1. **Proposed syntax** + - `... | eval geoinfo = geoip([datasource,] ipAddress [,properties])` + - generic syntax + - `... | eval geoinfo = geoip(ipAddress)` + - use the default geoip datasource + - `... | eval geoinfo = geoip("abc", ipAddress)` + - use the "abc" geoip datasource + - `... | eval geoinfo = geoip(ipAddress, "city,lat,lon")` + - use the default geoip datasource, retrieve only city, lat and lon + - `... | eval geoinfo = geoip("abc", ipAddress, "city,lat,lon")` + - use the "abc" geoip datasource, retrieve only city, lat and lon + + +2. **Proposed wiring with the geoip database** + - Leverage the functionality of the ip2geo processor + - ip2geo processor configuration, functionality and code will be used + - Prerequisite for the geoip is that ip2geo processor is configured properly + - See https://opensearch.org/docs/latest/ingest-pipelines/processors/ip2geo/ + + +### New syntax definition in ANTLR + +```ANTLR + +// functions +evalFunctionCall + : evalFunctionName LT_PRTHS functionArgs RT_PRTHS + | geoipFunction + ; + +geoipFunction + : GEOIP LT_PRTHS (datasource = functionArg COMMA)? ipAddress = functionArg (COMMA properties = stringLiteral)? RT_PRTHS + ; +``` + diff --git a/docs/ppl-lang/ppl-fillnull-command.md b/docs/ppl-lang/ppl-fillnull-command.md index f204a5969..00064849c 100644 --- a/docs/ppl-lang/ppl-fillnull-command.md +++ b/docs/ppl-lang/ppl-fillnull-command.md @@ -17,7 +17,7 @@ The example show fillnull one field. PPL query: - os> source=logs | fields status_code | eval input=status_code | fillnull with 0 in status_code; + os> source=logs | fields status_code | eval input=status_code | fillnull value = 0 status_code; | input | status_code | |-------|-------------| | 403 | 403 | @@ -43,7 +43,7 @@ The example show fillnull applied to multiple fields. PPL query: - os> source=logs | fields request_path, timestamp | eval input_request_path=request_path, input_timestamp = timestamp | fillnull with '???' in request_path, timestamp; + os> source=logs | fields request_path, timestamp | eval input_request_path=request_path, input_timestamp = timestamp | fillnull value = '???' request_path, timestamp; | input_request_path | input_timestamp | request_path | timestamp | |--------------------|-----------------------|--------------|------------------------| | /contact | NULL | /contact | ??? | @@ -89,4 +89,4 @@ PPL query: | /services | NULL | /services | 1970-01-01 00:00:00 | | /home | 2023-10-01 10:45:00 | /home | 2023-10-01 10:45:00 | | /services | 2023-10-01 11:00:00 | /services | 2023-10-01 11:00:00 | -| NULL | 2023-10-01 10:35:00 | /error | 2023-10-01 10:35:00 | +| NULL | 2023-10-01 10:35:00 | /error | 2023-10-01 10:35:00 | \ No newline at end of file diff --git a/docs/ppl-lang/ppl-flatten-command.md b/docs/ppl-lang/ppl-flatten-command.md index 68b03e82e..4c1ae5d0d 100644 --- a/docs/ppl-lang/ppl-flatten-command.md +++ b/docs/ppl-lang/ppl-flatten-command.md @@ -7,10 +7,9 @@ Using `flatten` command to flatten a field of type: ### Syntax -`flatten [As aliasSequence]` +`flatten ` * field: to be flattened. The field must be of supported type. -* aliasSequence: to be used as aliasSequence for the flattened-output fields. Better to put the aliasSequence in brace if there is more than one field. ### Test table #### Schema @@ -88,18 +87,4 @@ PPL query: | 2024-09-13T12:00:00 | Prague | Czech Republic| 343 | Legion Bridge | 200 | 50.0755| 14.4378| | 2024-09-13T12:00:00 | Budapest| Hungary | 375 | Chain Bridge | 96 | 47.4979| 19.0402| | 2024-09-13T12:00:00 | Budapest| Hungary | 333 | Liberty Bridge | 96 | 47.4979| 19.0402| -| 1990-09-13T12:00:00 | Warsaw | Poland | NULL | NULL | NULL | NULL | NULL | - -### Example 4: flatten with aliasSequence -This example shows how to flatten with aliasSequence. -PPL query: - - `source=table | flatten coor as (altitude, latitude, longitude)` - -| \_time | bridges | city | country | altitude | latitude | longtitude | -|---------------------|----------------------------------------------|---------|---------------|----------|----------|------------| -| 2024-09-13T12:00:00 | [{801, Tower Bridge}, {928, London Bridge}] | London | England | 35 | 51.5074 | -0.1278 | -| 2024-09-13T12:00:00 | [{232, Pont Neuf}, {160, Pont Alexandre III}]| Paris | France | 35 | 48.8566 | 2.3522 | -| 2024-09-13T12:00:00 | [{48, Rialto Bridge}, {11, Bridge of Sighs}] | Venice | Italy | 2 | 45.4408 | 12.3155 | -| 2024-09-13T12:00:00 | [{516, Charles Bridge}, {343, Legion Bridge}]| Prague | Czech Republic| 200 | 50.0755 | 14.4378 | -| 2024-09-13T12:00:00 | [{375, Chain Bridge}, {333, Liberty Bridge}] | Budapest| Hungary | 96 | 47.4979 | 19.0402 | -| 1990-09-13T12:00:00 | NULL | Warsaw | Poland | NULL | NULL | NULL | +| 1990-09-13T12:00:00 | Warsaw | Poland | NULL | NULL | NULL | NULL | NULL | \ No newline at end of file diff --git a/docs/spark-docker.md b/docs/spark-docker.md deleted file mode 100644 index d1200e2b3..000000000 --- a/docs/spark-docker.md +++ /dev/null @@ -1,164 +0,0 @@ -# Running Queries with Apache Spark in Docker - -There are [Bitnami Apache Spark docker images](https://hub.docker.com/r/bitnami/spark). These -can be modified to be able to include the OpenSearch Spark PPL extension. With the OpenSearch -Spark PPL extension, the docker image can be used to test PPL commands. - -The Bitnami Apache Spark image can be used to run a Spark cluster and also to run -`spark-shell` for running queries. - -## Prepare OpenSearch Spark PPL Extension - -Create a local build or copy of the OpenSearch Spark PPL extension. Make a note of the -location of the Jar file as well as the name of the Jar file. - -From the root of this repository, build the OpenSearch Spark PPL extension with: - -``` -sbt clean -sbt assembly -``` - -Refer to the [Developer Guide](../DEVELOPER_GUIDE.md) for more information. - -## Using Docker Compose - -There are sample files in this repository at `docker/apache-spark-sample` They can be used to -start up both nodes with the command: - -``` -docker compose up -d -``` - -The cluster can be stopped with: - -``` -docker compose down -``` - -### Configuration - -There is a file `docker/apache-spark-sample/.env` that can be edited to change some settings. - -| Variable Name | Description | -|----------------|---------------------------------------------------| -| MASTER_UI_PORT | Host port to bind to port 8080 of the master node | -| MASTER_PORT | Host port to bind to port 7077 of the master node | -| UI_PORT | Host port to bind to port 4040 of the master node | -| PPL_JAR | Path to the PPL Jar file | - -## Running Spark Shell - -Can run `spark-shell` on the master node. - -``` -docker exec -it apache-spark-sample-spark-1 /opt/bitnami/spark/bin/spark-shell -``` - -Within the Spark Shell, you can submit queries, including PPL queries. For example a sample -table can be created, populated and finally queried using PPL. - -``` -spark.sql("CREATE TABLE test_table(id int, name varchar(100))") -spark.sql("INSERT INTO test_table (id, name) VALUES(1, 'Foo')") -spark.sql("INSERT INTO test_table (id, name) VALUES(2, 'Bar')") -spark.sql("source=test_table | eval x = id + 5 | fields x, name").show() -``` - -For further information, see the [Spark PPL Test Instructions](ppl-lang/local-spark-ppl-test-instruction.md) - -## Manual Setup - -### spark-conf - -Contains the Apache Spark configuration. Need to add three lines to the `spark-defaults.conf` -file: -``` -spark.sql.legacy.createHiveTableByDefault false -spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions -spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog -``` - -An example file available in this repository at `docker/apache-spark-sample/spark-defaults.conf` - -## Prepare OpenSearch Spark PPL Extension - -Create a local build or copy of the OpenSearch Spark PPL extension. Make a note of the -location of the Jar file as well as the name of the Jar file. - -## Run the Spark Cluster - -Need to run a master node and a worker node. For these to communicate, first create a network -for them to use. - -``` -docker network create spark-network -``` - -### Master Node - -The master node can be run with the following command: -``` -docker run \ - -d \ - --name spark \ - --network spark-network \ - -p 8080:8080 \ - -p 7077:7077 \ - -p 4040:4040 \ - -e SPARK_MODE=master \ - -e SPARK_RPC_AUTHENTICATION_ENABLED=no \ - -e SPARK_RPC_ENCRYPTION_ENABLED=no \ - -e SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no \ - -e SPARK_SSL_ENABLED=no \ - -e SPARK_PUBLIC_DNS=localhost \ - -v :/opt/bitnami/spark/conf/spark-defaults.conf \ - -v /:/opt/bitnami/spark/jars/ \ - bitnami/spark:3.5.3 -``` - -* `-d` - Run the container in the background and return to the shell -* `--name spark` - Name the docker container `spark` -* `` - Replace with the path to the Spark configuration file. -* `` - Replace with the path to the directory containing the OpenSearch Spark PPL extension - Jar file. -* `` - Replace with the filename of the OpenSearch Spark PPL extension Jar file. - -### Worker Node - -The worker node can be run with the following command: -``` -docker run \ - -d \ - --name spark-worker \ - --network spark-network \ - -e SPARK_MODE=worker \ - -e SPARK_MASTER_URL=spark://spark:7077 \ - -e SPARK_WORKER_MEMORY=1G \ - -e SPARK_WORKER_CORES=1 \ - -e SPARK_RPC_AUTHENTICATION_ENABLED=no \ - -e SPARK_RPC_ENCRYPTION_ENABLED=no \ - -e SPARK_LOCAL_STORAGE_ENCRYPTION_ENABLED=no \ - -e SPARK_SSL_ENABLED=no \ - -e SPARK_PUBLIC_DNS=localhost \ - -v :/opt/bitnami/spark/conf/spark-defaults.conf \ - -v /:/opt/bitnami/spark/jars/ \ - bitnami/spark:3.5.3 -``` - -* `-d` - Run the container in the background and return to the shell -* `--name spark-worker` - Name the docker container `spark-worker` -* `` - Replace with the path to the Spark configuration file. -* `` - Replace with the path to the directory containing the OpenSearch Spark PPL extension - Jar file. -* `` - Replace with the filename of the OpenSearch Spark PPL extension Jar file. diff --git a/docs/spark-emr-docker.md b/docs/spark-emr-docker.md deleted file mode 100644 index 7eef4d250..000000000 --- a/docs/spark-emr-docker.md +++ /dev/null @@ -1,147 +0,0 @@ -# Running Queries with Spark EMR in Docker - -Spark EMR images are available on the Amazon ECR Public Gallery. These can be modified to -be able to include the OpenSearch Spark PPL extension. With the OpenSearch Spark PPL -extension, the docker image can be used to test PPL commands. - -The Spark EMR image will run an Apache Spark app if one was specified and then shutdown. - -## Prepare OpenSearch Spark PPL Extension - -Create a local build or copy of the OpenSearch Spark PPL extension. Make a note of the -location of the Jar file as well as the name of the Jar file. - -From the root of this repository, build the OpenSearch Spark PPL extension with: - -``` -sbt clean -sbt assembly -``` - -Refer to the [Developer Guide](../DEVELOPER_GUIDE.md) for more information. - -## Using Docker Compose - -There are sample files in this repository at `docker/spark-emr-sample` They can be used to -run the Spark EMR container: - -``` -docker compose up -``` - -Remove the docker resources afterwards with: - -``` -docker compose down -``` - -### Configuration - -There is a file `docker/spark-emr-sample/.env` that can be edited to change some settings. - -| Variable Name | Description | -|----------------|---------------------------------------------------| -| PPL_JAR | Path to the PPL Jar file | - -## Logs - -The logs are available in `/var/log/spark` in the docker container. - -STDERR for the app run is available in `/var/log/spark/user/stderr`. - -STDOUT for the app -run is available in `/var/log/spark/user/stdout`. - -## Manual Setup - -Need to create two directories. These directories will be bound to the directories in the -image. - -Look in `docker/spark-emr-sample` in this repository for samples of the directories -described below. - -### logging-conf -Contains two shell scripts that are run during startup to configure logging. -* `run-adot-collector.sh` -* `run-fluentd-spark.sh` - -Unless you need to make changes to the logging in the docker image, these can both be -empty shell scripts. - -### spark-conf - -Contains the Apache Spark configuration. Need to add three lines to the `spark-defaults.conf` -file: -``` -spark.sql.legacy.createHiveTableByDefault false -spark.sql.extensions org.opensearch.flint.spark.FlintPPLSparkExtensions -spark.sql.catalog.dev org.apache.spark.opensearch.catalog.OpenSearchCatalog -``` - -## Create a Spark App - -An Apache Spark app is needed to provide queries to be run on the Spark EMR instance. -The image has been tested with an app written in Scala. - -An example app is available in this repository in `docker/spark-sample--app`. - -### Bulid the Example App - -The example app can be built using [SBT](https://www.scala-sbt.org/). -``` -cd docker/spark-sample-app -sbt clean package -``` - -This will produce a Jar file in `docker/spark-sample-app/target/scala-2.12` -that can be used with the Spark EMR image. - -## Prepare OpenSearch Spark PPL Extension - -Create a local build or copy of the OpenSearch Spark PPL extension. Make a note of the -location of the Jar file as well as the name of the Jar file. - -## Run the Spark EMR Image - -The Spark EMR image can be run with the following command from the root of this repository: -``` -docker run \ - --name spark-emr \ - -v ./docker/spark-emr-sample/logging-conf:/var/loggingConfiguration/spark \ - -v ./docker/spark-sample-app/target/scala-2.12:/app \ - -v ./docker/spark-emr-sample/spark-conf:/etc/spark/conf \ - -v /:/usr/lib/spark/jars/ \ - public.ecr.aws/emr-serverless/spark/emr-7.5.0:20241125 \ - driver \ - --class MyApp \ - /app/myapp_2.12-1.0.jar -``` - -* `--name spark-emr` - Name the docker container `spark-emr` -* `-v ./docker/spark-emr-sample/logging-conf:/var/loggingConfiguration/spark` - - Bind the directory containing logging shell scripts to the docker image. Needs to bind - to `/var/loggingConfiguration/spark` in the image. -* `-v ./docker/spark-sample-app/target/scala-2.12:/app` - - Bind the directory containing the Apache Spark app Jar file to a location in the - docker image. The directory in the docker image must match the path used in the final - argument. -* `-v ./docker/spark-emr-sample/spark-conf:/etc/spark/conf` - - Bind the directory containing the Apache Spark configuration. Needs to bind to - `/etc/spark/conf` in the image. -* `` - Replace with the path to the directory containing the OpenSearch Spark PPL extension - Jar file. -* `` - Replace with the filename of the OpenSearch Spark PPL extension Jar file. -* `driver` - Start the Spark EMR container as a driver. This will run `spark-submit` to run an - app. -* `--class MyApp` - The main class of the Spark App to run. -* `/app/myapp_2.12-1.0.jar` - The full path within the docker container where the Jar file of the Spark app is - located. diff --git a/flint-core/src/main/java/org/opensearch/flint/core/IRestHighLevelClient.java b/flint-core/src/main/java/org/opensearch/flint/core/IRestHighLevelClient.java index 721685c38..9facd89ef 100644 --- a/flint-core/src/main/java/org/opensearch/flint/core/IRestHighLevelClient.java +++ b/flint-core/src/main/java/org/opensearch/flint/core/IRestHighLevelClient.java @@ -98,15 +98,15 @@ static void recordLatency(String metricNamePrefix, long latencyMilliseconds) { * Otherwise, it increments a general failure metric counter based on the status code category (e.g., 4xx, 5xx). * * @param metricNamePrefix the prefix for the metric name which is used to construct the full metric name for failure - * @param t the exception encountered during the operation, used to determine the type of failure + * @param e the exception encountered during the operation, used to determine the type of failure */ - static void recordOperationFailure(String metricNamePrefix, Throwable t) { - OpenSearchException openSearchException = extractOpenSearchException(t); + static void recordOperationFailure(String metricNamePrefix, Exception e) { + OpenSearchException openSearchException = extractOpenSearchException(e); int statusCode = openSearchException != null ? openSearchException.status().getStatus() : 500; if (openSearchException != null) { CustomLogging.logError(new OperationMessage("OpenSearch Operation failed.", statusCode), openSearchException); } else { - CustomLogging.logError("OpenSearch Operation failed with an exception.", t); + CustomLogging.logError("OpenSearch Operation failed with an exception.", e); } if (statusCode == 403) { String forbiddenErrorMetricName = metricNamePrefix + ".403.count"; diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/FlintOptions.java b/flint-core/src/main/scala/org/opensearch/flint/core/FlintOptions.java index f9d181b70..6ddc6ae9c 100644 --- a/flint-core/src/main/scala/org/opensearch/flint/core/FlintOptions.java +++ b/flint-core/src/main/scala/org/opensearch/flint/core/FlintOptions.java @@ -88,11 +88,7 @@ public class FlintOptions implements Serializable { public static final int DEFAULT_SOCKET_TIMEOUT_MILLIS = 60000; public static final int DEFAULT_INACTIVITY_LIMIT_MILLIS = 3 * 60 * 1000; - - public static final String REQUEST_COMPLETION_DELAY_MILLIS = "request.completionDelayMillis"; - public static final int DEFAULT_REQUEST_COMPLETION_DELAY_MILLIS = 0; - public static final int DEFAULT_AOSS_REQUEST_COMPLETION_DELAY_MILLIS = 2000; - + public static final String DATA_SOURCE_NAME = "spark.flint.datasource.name"; public static final String BATCH_BYTES = "write.batch_bytes"; @@ -182,13 +178,6 @@ public int getSocketTimeoutMillis() { return Integer.parseInt(options.getOrDefault(SOCKET_TIMEOUT_MILLIS, String.valueOf(DEFAULT_SOCKET_TIMEOUT_MILLIS))); } - public int getRequestCompletionDelayMillis() { - int defaultValue = SERVICE_NAME_AOSS.equals(getServiceName()) - ? DEFAULT_AOSS_REQUEST_COMPLETION_DELAY_MILLIS - : DEFAULT_REQUEST_COMPLETION_DELAY_MILLIS; - return Integer.parseInt(options.getOrDefault(REQUEST_COMPLETION_DELAY_MILLIS, String.valueOf(defaultValue))); - } - public String getDataSourceName() { return options.getOrDefault(DATA_SOURCE_NAME, ""); } diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/http/FlintRetryOptions.java b/flint-core/src/main/scala/org/opensearch/flint/core/http/FlintRetryOptions.java index 597f441ec..8f6e2c07e 100644 --- a/flint-core/src/main/scala/org/opensearch/flint/core/http/FlintRetryOptions.java +++ b/flint-core/src/main/scala/org/opensearch/flint/core/http/FlintRetryOptions.java @@ -6,12 +6,8 @@ package org.opensearch.flint.core.http; import static java.time.temporal.ChronoUnit.SECONDS; -import static org.opensearch.flint.core.FlintOptions.SERVICE_NAME; -import static org.opensearch.flint.core.FlintOptions.SERVICE_NAME_AOSS; -import static org.opensearch.flint.core.FlintOptions.SERVICE_NAME_ES; import dev.failsafe.RetryPolicy; -import dev.failsafe.RetryPolicyBuilder; import dev.failsafe.event.ExecutionAttemptedEvent; import dev.failsafe.function.CheckedPredicate; import java.time.Duration; @@ -20,7 +16,6 @@ import java.util.logging.Logger; import org.opensearch.action.bulk.BulkResponse; import org.opensearch.flint.core.http.handler.ExceptionClassNameFailurePredicate; -import org.opensearch.flint.core.http.handler.HttpAOSSResultPredicate; import org.opensearch.flint.core.http.handler.HttpStatusCodeResultPredicate; import java.io.Serializable; @@ -70,7 +65,7 @@ public boolean isRetryEnabled() { * @return Failsafe retry policy */ public RetryPolicy getRetryPolicy() { - RetryPolicyBuilder builder = RetryPolicy.builder() + return RetryPolicy.builder() // Backoff strategy config (can be configurable as needed in future) .withBackoff(1, 30, SECONDS) .withJitter(Duration.ofMillis(100)) @@ -80,11 +75,8 @@ public RetryPolicy getRetryPolicy() { .handleResultIf(new HttpStatusCodeResultPredicate<>(getRetryableHttpStatusCodes())) // Logging listener .onFailedAttempt(FlintRetryOptions::onFailure) - .onRetry(FlintRetryOptions::onRetry); - if (SERVICE_NAME_AOSS.equals(getServiceName())) { - builder.handleResultIf(new HttpAOSSResultPredicate<>()); - } - return builder.build(); + .onRetry(FlintRetryOptions::onRetry) + .build(); } public RetryPolicy getBulkRetryPolicy(CheckedPredicate resultPredicate) { @@ -109,10 +101,6 @@ private static void onRetry(ExecutionAttemptedEvent event) { LOG.warning("Retrying failed request at #" + event.getAttemptCount()); } - private String getServiceName() { - return options.getOrDefault(SERVICE_NAME, SERVICE_NAME_ES); - } - /** * @return maximum retry option value */ diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/http/handler/HttpAOSSResultPredicate.java b/flint-core/src/main/scala/org/opensearch/flint/core/http/handler/HttpAOSSResultPredicate.java deleted file mode 100644 index 8bfb05fa3..000000000 --- a/flint-core/src/main/scala/org/opensearch/flint/core/http/handler/HttpAOSSResultPredicate.java +++ /dev/null @@ -1,66 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.flint.core.http.handler; - -import dev.failsafe.function.CheckedPredicate; -import org.apache.http.HttpEntity; -import org.apache.http.HttpResponse; -import org.apache.http.entity.BufferedHttpEntity; -import org.apache.http.util.EntityUtils; - -import java.util.logging.Logger; - -/** - * Failure handler based on HTTP response from AOSS. - * - * @param result type (supposed to be HttpResponse for OS client) - */ -public class HttpAOSSResultPredicate implements CheckedPredicate { - - private static final Logger LOG = Logger.getLogger(HttpAOSSResultPredicate.class.getName()); - - public static final int BAD_REQUEST_STATUS_CODE = 400; - public static final String RESOURCE_ALREADY_EXISTS_EXCEPTION_MESSAGE = "resource_already_exists_exception"; - - public HttpAOSSResultPredicate() { } - - @Override - public boolean test(T result) throws Throwable { - LOG.info("Checking if response is retryable"); - - int statusCode = ((HttpResponse) result).getStatusLine().getStatusCode(); - if (statusCode != BAD_REQUEST_STATUS_CODE) { - LOG.info("Status code " + statusCode + " is not " + BAD_REQUEST_STATUS_CODE + ". Check result: false"); - return false; - } - - HttpResponse response = (HttpResponse) result; - HttpEntity entity = response.getEntity(); - if (entity == null) { - LOG.info("No response entity found. Check result: false"); - return false; - } - - // Buffer the entity to make it repeatable, so that this retry test does not consume the content stream, - // resulting in the request caller getting empty response - BufferedHttpEntity bufferedEntity = new BufferedHttpEntity(entity); - response.setEntity(bufferedEntity); - - try { - String responseContent = EntityUtils.toString(bufferedEntity); - // Effectively restores the content stream of the response - bufferedEntity.getContent().reset(); - - boolean isRetryable = responseContent.contains(RESOURCE_ALREADY_EXISTS_EXCEPTION_MESSAGE); - - LOG.info("Check retryable response result: " + isRetryable); - return isRetryable; - } catch (Exception e) { - LOG.info("Unable to parse response body. Check result: false"); - return false; - } - } -} diff --git a/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintOpenSearchClient.java b/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintOpenSearchClient.java index 5861ccf22..2bc097bba 100644 --- a/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintOpenSearchClient.java +++ b/flint-core/src/main/scala/org/opensearch/flint/core/storage/FlintOpenSearchClient.java @@ -44,7 +44,6 @@ public void createIndex(String indexName, FlintMetadata metadata) { LOG.info("Creating Flint index " + indexName + " with metadata " + metadata); try { createIndex(indexName, FlintOpenSearchIndexMetadataService.serialize(metadata, false), metadata.indexSettings()); - waitRequestComplete(); // Delay to ensure create is complete before making other requests for the index emitIndexCreationSuccessMetric(metadata.kind()); } catch (IllegalStateException ex) { emitIndexCreationFailureMetric(metadata.kind()); @@ -132,14 +131,6 @@ private String sanitizeIndexName(String indexName) { return OpenSearchClientUtils.sanitizeIndexName(indexName); } - private void waitRequestComplete() { - try { - Thread.sleep(options.getRequestCompletionDelayMillis()); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - } - } - private void emitIndexCreationSuccessMetric(String indexKind) { emitIndexCreationMetric(indexKind, "success"); } diff --git a/flint-core/src/test/scala/org/opensearch/flint/core/http/RetryableHttpAsyncClientSuite.scala b/flint-core/src/test/scala/org/opensearch/flint/core/http/RetryableHttpAsyncClientSuite.scala index 8a8927920..7d3b79a9e 100644 --- a/flint-core/src/test/scala/org/opensearch/flint/core/http/RetryableHttpAsyncClientSuite.scala +++ b/flint-core/src/test/scala/org/opensearch/flint/core/http/RetryableHttpAsyncClientSuite.scala @@ -12,13 +12,11 @@ import java.util.concurrent.{ExecutionException, Future} import scala.collection.JavaConverters.mapAsJavaMapConverter -import org.apache.http.HttpEntity import org.apache.http.HttpResponse import org.apache.http.concurrent.FutureCallback import org.apache.http.impl.nio.client.{CloseableHttpAsyncClient, HttpAsyncClientBuilder} import org.apache.http.nio.protocol.{HttpAsyncRequestProducer, HttpAsyncResponseConsumer} import org.apache.http.protocol.HttpContext -import org.apache.http.util.EntityUtils import org.mockito.ArgumentMatchers.any import org.mockito.Mockito._ import org.mockito.verification.VerificationMode @@ -155,23 +153,6 @@ class RetryableHttpAsyncClientSuite extends AnyFlatSpec with BeforeAndAfter with expectFutureGetTimes = times(0)) } - it should "retry if AOSS response is retryable" in { - retryableClient - .withOption("auth.servicename", "aoss") - .whenResponse( - 400, - "OpenSearchStatusException[OpenSearch exception [type=resource_already_exists_exception,") - .shouldExecute(times(DEFAULT_MAX_RETRIES + 1)) - } - - it should "not apply retry policy for AOSS response if service is not AOSS" in { - retryableClient - .whenResponse( - 400, - "OpenSearchStatusException[OpenSearch exception [type=resource_already_exists_exception,") - .shouldExecute(times(1)) - } - private def retryableClient: AssertionHelper = new AssertionHelper class AssertionHelper { @@ -194,17 +175,6 @@ class RetryableHttpAsyncClientSuite extends AnyFlatSpec with BeforeAndAfter with this } - def whenResponse(statusCode: Int, responseMessage: String): AssertionHelper = { - val entity = mock[HttpEntity](RETURNS_DEEP_STUBS) - mockStatic(classOf[EntityUtils]) - when(EntityUtils.toString(any[HttpEntity])).thenReturn(responseMessage) - val response = mock[HttpResponse](RETURNS_DEEP_STUBS) - when(response.getStatusLine.getStatusCode).thenReturn(statusCode) - when(response.getEntity).thenReturn(entity) - when(future.get()).thenReturn(response) - this - } - def shouldExecute(expectExecuteTimes: VerificationMode): Unit = { shouldExecute(expectExecuteTimes, expectExecuteTimes) } diff --git a/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala b/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala index 364a8a1de..bdcc120c0 100644 --- a/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala +++ b/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/config/FlintSparkConf.scala @@ -201,11 +201,6 @@ object FlintSparkConf { .datasourceOption() .doc("socket duration in milliseconds") .createWithDefault(String.valueOf(FlintOptions.DEFAULT_SOCKET_TIMEOUT_MILLIS)) - val REQUEST_COMPLETION_DELAY_MILLIS = - FlintConfig(s"spark.datasource.flint.${FlintOptions.REQUEST_COMPLETION_DELAY_MILLIS}") - .datasourceOption() - .doc("delay in milliseconds after index creation is completed") - .createOptional() val DATA_SOURCE_NAME = FlintConfig(s"spark.flint.datasource.name") .doc("data source name") @@ -361,8 +356,7 @@ case class FlintSparkConf(properties: JMap[String, String]) extends Serializable REQUEST_INDEX, METADATA_ACCESS_AWS_CREDENTIALS_PROVIDER, EXCLUDE_JOB_IDS, - SCROLL_SIZE, - REQUEST_COMPLETION_DELAY_MILLIS) + SCROLL_SIZE) .map(conf => (conf.optionKey, conf.readFrom(reader))) .flatMap { case (_, None) => None diff --git a/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/datatype/FlintDataType.scala b/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/datatype/FlintDataType.scala index 19fe28a2d..a4b23bd46 100644 --- a/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/datatype/FlintDataType.scala +++ b/flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/datatype/FlintDataType.scala @@ -142,7 +142,6 @@ object FlintDataType { case ByteType => JObject("type" -> JString("byte")) case DoubleType => JObject("type" -> JString("double")) case FloatType => JObject("type" -> JString("float")) - case DecimalType() => JObject("type" -> JString("double")) // Date case TimestampType | _: TimestampNTZType => @@ -154,9 +153,6 @@ object FlintDataType { // objects case st: StructType => serializeJValue(st) - // Serialize maps as empty objects and let the map entries automap - case mt: MapType => serializeJValue(new StructType()) - // array case ArrayType(elementType, _) => serializeField(elementType, Metadata.empty) diff --git a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala index fbc24e93a..68d2409ee 100644 --- a/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala +++ b/flint-spark-integration/src/main/scala/org/opensearch/flint/spark/FlintSpark.scala @@ -510,10 +510,6 @@ class FlintSpark(val spark: SparkSession) extends FlintSparkTransactionSupport w private def isSchedulerModeChanged( originalOptions: FlintSparkIndexOptions, updatedOptions: FlintSparkIndexOptions): Boolean = { - // Altering from manual to auto should not be interpreted as a scheduling mode change. - if (!originalOptions.options.contains(SCHEDULER_MODE.toString)) { - return false - } updatedOptions.isExternalSchedulerEnabled() != originalOptions.isExternalSchedulerEnabled() } diff --git a/flint-spark-integration/src/test/scala/org/apache/spark/FlintSuite.scala b/flint-spark-integration/src/test/scala/org/apache/spark/FlintSuite.scala index 1d301087f..b675265b7 100644 --- a/flint-spark-integration/src/test/scala/org/apache/spark/FlintSuite.scala +++ b/flint-spark-integration/src/test/scala/org/apache/spark/FlintSuite.scala @@ -12,7 +12,6 @@ import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.flint.config.{FlintConfigEntry, FlintSparkConf} import org.apache.spark.sql.flint.config.FlintSparkConf.{EXTERNAL_SCHEDULER_ENABLED, HYBRID_SCAN_ENABLED, METADATA_CACHE_WRITE} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.internal.StaticSQLConf.WAREHOUSE_PATH import org.apache.spark.sql.test.SharedSparkSession trait FlintSuite extends SharedSparkSession { @@ -31,7 +30,6 @@ trait FlintSuite extends SharedSparkSession { .set( FlintSparkConf.CUSTOM_FLINT_SCHEDULER_CLASS.key, "org.opensearch.flint.core.scheduler.AsyncQuerySchedulerBuilderTest$AsyncQuerySchedulerForLocalTest") - .set(WAREHOUSE_PATH.key, s"spark-warehouse/${suiteName}") conf } diff --git a/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/config/FlintSparkConfSuite.scala b/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/config/FlintSparkConfSuite.scala index 594322bae..0cde6ab0f 100644 --- a/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/config/FlintSparkConfSuite.scala +++ b/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/config/FlintSparkConfSuite.scala @@ -114,21 +114,6 @@ class FlintSparkConfSuite extends FlintSuite { } } - test("test request completionDelayMillis default value") { - FlintSparkConf().flintOptions().getRequestCompletionDelayMillis shouldBe 0 - } - - test("test request completionDelayMillis default value for aoss") { - val options = FlintSparkConf(Map("auth.servicename" -> "aoss").asJava).flintOptions() - options.getRequestCompletionDelayMillis shouldBe 2000 - } - - test("test specified request completionDelayMillis") { - val options = - FlintSparkConf(Map("request.completionDelayMillis" -> "1000").asJava).flintOptions() - options.getRequestCompletionDelayMillis shouldBe 1000 - } - test("externalSchedulerIntervalThreshold should return default value when empty") { val options = FlintSparkConf(Map("spark.flint.job.externalScheduler.interval" -> "").asJava) assert(options diff --git a/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/datatype/FlintDataTypeSuite.scala b/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/datatype/FlintDataTypeSuite.scala index 312f3a5a1..94f4839d6 100644 --- a/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/datatype/FlintDataTypeSuite.scala +++ b/flint-spark-integration/src/test/scala/org/apache/spark/sql/flint/datatype/FlintDataTypeSuite.scala @@ -128,35 +128,6 @@ class FlintDataTypeSuite extends FlintSuite with Matchers { |}""".stripMargin) } - test("spark map type serialize") { - val sparkStructType = StructType( - StructField("mapField", MapType(StringType, StringType), true) :: - Nil) - - FlintDataType.serialize(sparkStructType) shouldBe compactJson("""{ - | "properties": { - | "mapField": { - | "properties": { - | } - | } - | } - |}""".stripMargin) - } - - test("spark decimal type serialize") { - val sparkStructType = StructType( - StructField("decimalField", DecimalType(1, 1), true) :: - Nil) - - FlintDataType.serialize(sparkStructType) shouldBe compactJson("""{ - | "properties": { - | "decimalField": { - | "type": "double" - | } - | } - |}""".stripMargin) - } - test("spark varchar and char type serialize") { val flintDataType = """{ | "properties": { diff --git a/integ-test/src/integration/scala/org/apache/spark/sql/FlintJobITSuite.scala b/integ-test/src/integration/scala/org/apache/spark/sql/FlintJobITSuite.scala index 81bf60f5e..11bc7271c 100644 --- a/integ-test/src/integration/scala/org/apache/spark/sql/FlintJobITSuite.scala +++ b/integ-test/src/integration/scala/org/apache/spark/sql/FlintJobITSuite.scala @@ -81,42 +81,36 @@ class FlintJobITSuite extends FlintSparkSuite with JobTest { } } - def createJobOperator(query: String, jobRunId: String): JobOperator = { - val streamingRunningCount = new AtomicInteger(0) - - /* - * Because we cannot test from FlintJob.main() for the reason below, we have to configure - * all Spark conf required by Flint code underlying manually. - */ - spark.conf.set(DATA_SOURCE_NAME.key, dataSourceName) - spark.conf.set(JOB_TYPE.key, FlintJobType.STREAMING) - - val job = JobOperator( - appId, - jobRunId, - spark, - query, - queryId, - dataSourceName, - resultIndex, - FlintJobType.STREAMING, - streamingRunningCount) - job.terminateJVM = false - job - } - def startJob(query: String, jobRunId: String): Future[Unit] = { val prefix = "flint-job-test" val threadPool = ThreadUtils.newDaemonThreadPoolScheduledExecutor(prefix, 1) implicit val executionContext = ExecutionContext.fromExecutor(threadPool) + val streamingRunningCount = new AtomicInteger(0) val futureResult = Future { + /* + * Because we cannot test from FlintJob.main() for the reason below, we have to configure + * all Spark conf required by Flint code underlying manually. + */ + spark.conf.set(DATA_SOURCE_NAME.key, dataSourceName) + spark.conf.set(JOB_TYPE.key, FlintJobType.STREAMING) /** * FlintJob.main() is not called because we need to manually set these variables within a * JobOperator instance to accommodate specific runtime requirements. */ - val job = createJobOperator(query, jobRunId) + val job = + JobOperator( + appId, + jobRunId, + spark, + query, + queryId, + dataSourceName, + resultIndex, + FlintJobType.STREAMING, + streamingRunningCount) + job.terminateJVM = false job.start() } futureResult.onComplete { @@ -297,10 +291,6 @@ class FlintJobITSuite extends FlintSparkSuite with JobTest { } test("create skipping index with non-existent table") { - val prefix = "flint-job-test" - val threadPool = ThreadUtils.newDaemonThreadPoolScheduledExecutor(prefix, 1) - implicit val executionContext = ExecutionContext.fromExecutor(threadPool) - val query = s""" | CREATE SKIPPING INDEX ON testTable @@ -313,9 +303,7 @@ class FlintJobITSuite extends FlintSparkSuite with JobTest { | """.stripMargin val queryStartTime = System.currentTimeMillis() val jobRunId = "00ff4o3b5091080r" - - val job = createJobOperator(query, jobRunId) - threadLocalFuture.set(Future(job.start())) + threadLocalFuture.set(startJob(query, jobRunId)) val validation: REPLResult => Boolean = result => { assert( @@ -327,9 +315,6 @@ class FlintJobITSuite extends FlintSparkSuite with JobTest { assert(result.status == "FAILED", s"expected status is FAILED, but got ${result.status}") assert(!result.error.isEmpty, s"we expect error, but got ${result.error}") - assert( - job.throwableHandler.error.contains("Table spark_catalog.default.testTable is not found"), - "Expected error message to mention 'spark_catalog.default.testTable is not found'") commonAssert(result, jobRunId, query, queryStartTime) true } diff --git a/integ-test/src/integration/scala/org/opensearch/flint/core/FlintOpenSearchClientSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/core/FlintOpenSearchClientSuite.scala index fe3cefef8..a2c2d26f6 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/core/FlintOpenSearchClientSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/core/FlintOpenSearchClientSuite.scala @@ -65,27 +65,6 @@ class FlintOpenSearchClientSuite extends AnyFlatSpec with OpenSearchSuite with M (settings \ "index.number_of_replicas").extract[String] shouldBe "2" } - it should "create index with request completion delay config" in { - val metadata = FlintOpenSearchIndexMetadataService.deserialize("{}") - // Create a dummy index to avoid timing the initial overhead - flintClient.createIndex("dummy", metadata) - - val indexName = "flint_test_without_request_completion_delay" - val elapsedTimeWithoutDelay = timer { - flintClient.createIndex(indexName, metadata) - } - - val delayIndexName = "flint_test_with_request_completion_delay" - val delayOptions = - openSearchOptions + (FlintOptions.REQUEST_COMPLETION_DELAY_MILLIS -> "2000") - val delayFlintOptions = new FlintOptions(delayOptions.asJava) - val delayFlintClient = new FlintOpenSearchClient(delayFlintOptions) - val elapsedTimeWithDelay = timer { - delayFlintClient.createIndex(delayIndexName, metadata) - } - elapsedTimeWithDelay - elapsedTimeWithoutDelay should be >= 1800L // allowing 200ms of wiggle room - } - it should "get all index names with the given index name pattern" in { val metadata = FlintOpenSearchIndexMetadataService.deserialize( """{"properties": {"test": { "type": "integer" } } }""") @@ -241,11 +220,4 @@ class FlintOpenSearchClientSuite extends AnyFlatSpec with OpenSearchSuite with M def createTable(indexName: String, options: FlintOptions): Table = { OpenSearchCluster.apply(indexName, options).asScala.head } - - def timer(block: => Unit): Long = { - val start = System.currentTimeMillis() - block - val end = System.currentTimeMillis() - end - start - } } diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkMaterializedViewSqlITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkMaterializedViewSqlITSuite.scala index bf5e6309e..ae2e53090 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkMaterializedViewSqlITSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkMaterializedViewSqlITSuite.scala @@ -523,45 +523,5 @@ class FlintSparkMaterializedViewSqlITSuite extends FlintSparkSuite { } } - test("create materialized view with decimal and map types") { - val decimalAndMapTable = s"$catalogName.default.mv_test_decimal_map" - val decimalAndMapMv = s"$catalogName.default.mv_test_decimal_map_ser" - withTable(decimalAndMapTable) { - createMapAndDecimalTimeSeriesTable(decimalAndMapTable) - - withTempDir { checkpointDir => - sql(s""" - | CREATE MATERIALIZED VIEW $decimalAndMapMv - | AS - | SELECT - | base_score, mymap - | FROM $decimalAndMapTable - | WITH ( - | auto_refresh = true, - | checkpoint_location = '${checkpointDir.getAbsolutePath}' - | ) - |""".stripMargin) - - // Wait for streaming job complete current micro batch - val flintIndex = getFlintIndexName(decimalAndMapMv) - val job = spark.streams.active.find(_.name == flintIndex) - job shouldBe defined - failAfter(streamingTimeout) { - job.get.processAllAvailable() - } - - flint.describeIndex(flintIndex) shouldBe defined - checkAnswer( - flint.queryIndex(flintIndex).select("base_score", "mymap"), - Seq( - Row(3.1415926, Row(null, null, null, null, "mapvalue1")), - Row(4.1415926, Row("mapvalue2", null, null, null, null)), - Row(5.1415926, Row(null, null, "mapvalue3", null, null)), - Row(6.1415926, Row(null, null, null, "mapvalue4", null)), - Row(7.1415926, Row(null, "mapvalue5", null, null, null)))) - } - } - } - private def timestamp(ts: String): Timestamp = Timestamp.valueOf(ts) } diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala index 7c19cab12..68d370791 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkSuite.scala @@ -445,34 +445,6 @@ trait FlintSparkSuite extends QueryTest with FlintSuite with OpenSearchSuite wit sql(s"INSERT INTO $testTable VALUES (TIMESTAMP '2023-10-01 03:00:00', 'E', 15, 'Vancouver')") } - protected def createMapAndDecimalTimeSeriesTable(testTable: String): Unit = { - // CSV tables do not support MAP types so we use JSON instead - val finalTableType = if (tableType == "CSV") "JSON" else tableType - - sql(s""" - | CREATE TABLE $testTable - | ( - | time TIMESTAMP, - | name STRING, - | age INT, - | base_score DECIMAL(8, 7), - | mymap MAP - | ) - | USING $finalTableType $tableOptions - |""".stripMargin) - - sql( - s"INSERT INTO $testTable VALUES (TIMESTAMP '2023-10-01 00:01:00', 'A', 30, 3.1415926, Map('mapkey1', 'mapvalue1'))") - sql( - s"INSERT INTO $testTable VALUES (TIMESTAMP '2023-10-01 00:10:00', 'B', 20, 4.1415926, Map('mapkey2', 'mapvalue2'))") - sql( - s"INSERT INTO $testTable VALUES (TIMESTAMP '2023-10-01 00:15:00', 'C', 35, 5.1415926, Map('mapkey3', 'mapvalue3'))") - sql( - s"INSERT INTO $testTable VALUES (TIMESTAMP '2023-10-01 01:00:00', 'D', 40, 6.1415926, Map('mapkey4', 'mapvalue4'))") - sql( - s"INSERT INTO $testTable VALUES (TIMESTAMP '2023-10-01 03:00:00', 'E', 15, 7.1415926, Map('mapkey5', 'mapvalue5'))") - } - protected def createTimeSeriesTransactionTable(testTable: String): Unit = { sql(s""" | CREATE TABLE $testTable diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkUpdateIndexITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkUpdateIndexITSuite.scala index f27c0dae9..c9f6c47f7 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkUpdateIndexITSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/FlintSparkUpdateIndexITSuite.scala @@ -618,44 +618,6 @@ class FlintSparkUpdateIndexITSuite extends FlintSparkSuite { flint.queryIndex(testIndex).collect().toSet should have size 2 } - test("update full refresh index to auto refresh should start job with external scheduler") { - setFlintSparkConf(FlintSparkConf.EXTERNAL_SCHEDULER_ENABLED, "true") - - withTempDir { checkpointDir => - // Create full refresh Flint index - flint - .skippingIndex() - .onTable(testTable) - .addPartitions("year", "month") - .options(FlintSparkIndexOptions(Map("auto_refresh" -> "false")), testIndex) - .create() - - spark.streams.active.find(_.name == testIndex) shouldBe empty - flint.queryIndex(testIndex).collect().toSet should have size 0 - val indexInitial = flint.describeIndex(testIndex).get - indexInitial.options.isExternalSchedulerEnabled() shouldBe false - - val updatedIndex = flint - .skippingIndex() - .copyWithUpdate( - indexInitial, - FlintSparkIndexOptions( - Map( - "auto_refresh" -> "true", - "checkpoint_location" -> checkpointDir.getAbsolutePath))) - - val jobId = flint.updateIndex(updatedIndex) - jobId shouldBe empty - val indexFinal = flint.describeIndex(testIndex).get - indexFinal.options.isExternalSchedulerEnabled() shouldBe true - indexFinal.options.autoRefresh() shouldBe true - indexFinal.options.refreshInterval() shouldBe Some( - FlintOptions.DEFAULT_EXTERNAL_SCHEDULER_INTERVAL) - - verifySchedulerIndex(testIndex, 5, "MINUTES") - } - } - test("update incremental refresh index to auto refresh should start job") { withTempDir { checkpointDir => // Create incremental refresh Flint index and wait for complete @@ -705,51 +667,6 @@ class FlintSparkUpdateIndexITSuite extends FlintSparkSuite { } } - test( - "update incremental refresh index to auto refresh should start job with external scheduler") { - setFlintSparkConf(FlintSparkConf.EXTERNAL_SCHEDULER_ENABLED, "true") - - withTempDir { checkpointDir => - // Create incremental refresh Flint index - flint - .skippingIndex() - .onTable(testTable) - .addPartitions("year", "month") - .options( - FlintSparkIndexOptions( - Map( - "incremental_refresh" -> "true", - "checkpoint_location" -> checkpointDir.getAbsolutePath)), - testIndex) - .create() - - spark.streams.active.find(_.name == testIndex) shouldBe empty - flint.queryIndex(testIndex).collect().toSet should have size 0 - val indexInitial = flint.describeIndex(testIndex).get - indexInitial.options.isExternalSchedulerEnabled() shouldBe false - - val updatedIndex = flint - .skippingIndex() - .copyWithUpdate( - indexInitial, - FlintSparkIndexOptions( - Map( - "auto_refresh" -> "true", - "incremental_refresh" -> "false", - "checkpoint_location" -> checkpointDir.getAbsolutePath))) - - val jobId = flint.updateIndex(updatedIndex) - jobId shouldBe empty - val indexFinal = flint.describeIndex(testIndex).get - indexFinal.options.isExternalSchedulerEnabled() shouldBe true - indexFinal.options.autoRefresh() shouldBe true - indexFinal.options.refreshInterval() shouldBe Some( - FlintOptions.DEFAULT_EXTERNAL_SCHEDULER_INTERVAL) - - verifySchedulerIndex(testIndex, 5, "MINUTES") - } - } - test("update auto refresh index to full refresh should stop job") { // Create auto refresh Flint index and wait for complete flint diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCastITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCastITSuite.scala deleted file mode 100644 index a9b01b9e3..000000000 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLCastITSuite.scala +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.flint.spark.ppl - -import java.sql.Date -import java.sql.Timestamp - -import org.apache.spark.sql.{QueryTest, Row} -import org.apache.spark.sql.streaming.StreamTest - -class FlintSparkPPLCastITSuite - extends QueryTest - with LogicalPlanTestUtils - with FlintPPLSuite - with StreamTest { - - /** Test table and index name */ - private val testTable = "spark_catalog.default.flint_ppl_test" - - override def beforeAll(): Unit = { - super.beforeAll() - // Create test table - createNullableJsonContentTable(testTable) - } - - protected override def afterEach(): Unit = { - super.afterEach() - // Stop all streaming jobs if any - spark.streams.active.foreach { job => - job.stop() - job.awaitTermination() - } - } - - test("test cast number to compatible data types") { - val frame = sql(s""" - | source=$testTable | eval - | id_string = cast(id as string), - | id_double = cast(id as double), - | id_long = cast(id as long), - | id_boolean = cast(id as boolean) - | | fields id, id_string, id_double, id_long, id_boolean | head 1 - | """.stripMargin) - - assert( - frame.dtypes.sameElements( - Array( - ("id", "IntegerType"), - ("id_string", "StringType"), - ("id_double", "DoubleType"), - ("id_long", "LongType"), - ("id_boolean", "BooleanType")))) - assertSameRows(Seq(Row(1, "1", 1.0, 1L, true)), frame) - } - - test("test cast string to compatible data types") { - val frame = sql(s""" - | source=$testTable | eval - | id_int = cast(cast(id as string) as integer), - | cast_true = cast("True" as boolean), - | cast_false = cast("false" as boolean), - | cast_timestamp = cast("2024-11-26 23:39:06" as timestamp), - | cast_date = cast("2024-11-26" as date) - | | fields id_int, cast_true, cast_false, cast_timestamp, cast_date | head 1 - | """.stripMargin) - - assert( - frame.dtypes.sameElements( - Array( - ("id_int", "IntegerType"), - ("cast_true", "BooleanType"), - ("cast_false", "BooleanType"), - ("cast_timestamp", "TimestampType"), - ("cast_date", "DateType")))) - assertSameRows( - Seq( - Row( - 1, - true, - false, - Timestamp.valueOf("2024-11-26 23:39:06"), - Date.valueOf("2024-11-26"))), - frame) - } - - test("test cast time related types to compatible data types") { - val frame = sql(s""" - | source=$testTable | eval - | timestamp = cast("2024-11-26 23:39:06" as timestamp), - | ts_str = cast(timestamp as string), - | ts_date = cast(timestamp as date), - | date_str = cast(ts_date as string), - | date_ts = cast(ts_date as timestamp) - | | fields timestamp, ts_str, ts_date, date_str, date_ts | head 1 - | """.stripMargin) - - assert( - frame.dtypes.sameElements( - Array( - ("timestamp", "TimestampType"), - ("ts_str", "StringType"), - ("ts_date", "DateType"), - ("date_str", "StringType"), - ("date_ts", "TimestampType")))) - assertSameRows( - Seq( - Row( - Timestamp.valueOf("2024-11-26 23:39:06"), - "2024-11-26 23:39:06", - Date.valueOf("2024-11-26"), - "2024-11-26", - Timestamp.valueOf("2024-11-26 00:00:00"))), - frame) - } - -} diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFillnullITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFillnullITSuite.scala index ca96c126f..4788aa23f 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFillnullITSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFillnullITSuite.scala @@ -277,26 +277,6 @@ class FlintSparkPPLFillnullITSuite assert(ex.getMessage().contains("Syntax error ")) } - test("test fillnull with null_replacement type mismatch") { - val frame = sql(s""" - | source = $testTable | fillnull with cast(0 as long) in status_code - | """.stripMargin) - - assert(frame.columns.sameElements(Array("id", "request_path", "timestamp", "status_code"))) - val results: Array[Row] = frame.collect() - val expectedResults: Array[Row] = - Array( - Row(1, "/home", null, 200), - Row(2, "/about", "2023-10-01 10:05:00", 0), - Row(3, "/contact", "2023-10-01 10:10:00", 0), - Row(4, null, "2023-10-01 10:15:00", 301), - Row(5, null, "2023-10-01 10:20:00", 200), - Row(6, "/home", null, 403)) - // Compare the results - implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, Int](_.getAs[Int](0)) - assert(results.sorted.sameElements(expectedResults.sorted)) - } - private def fillNullExpectedPlan( nullReplacements: Seq[(String, Expression)], addDefaultProject: Boolean = true): LogicalPlan = { diff --git a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFlattenITSuite.scala b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFlattenITSuite.scala index 7d1b6e437..e714a5f7e 100644 --- a/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFlattenITSuite.scala +++ b/integ-test/src/integration/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLFlattenITSuite.scala @@ -9,7 +9,7 @@ import java.nio.file.Files import org.opensearch.flint.spark.FlattenGenerator import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq -import org.apache.spark.sql.{AnalysisException, QueryTest, Row} +import org.apache.spark.sql.{QueryTest, Row} import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} import org.apache.spark.sql.catalyst.expressions.{Alias, EqualTo, GeneratorOuter, Literal, Or} import org.apache.spark.sql.catalyst.plans.logical._ @@ -347,85 +347,4 @@ class FlintSparkPPLFlattenITSuite val expectedPlan = Project(Seq(UnresolvedStar(None)), flattenMultiValue) comparePlans(logicalPlan, expectedPlan, checkAnalysis = false) } - - test("flatten struct nested table using alias") { - val frame = sql(s""" - | source = $structNestedTable - | | flatten struct_col - | | flatten field1 as subfield_1 - | | flatten struct_col2 as (field1, field2_2) - | | flatten field1 as subfield_2 - | """.stripMargin) - - assert( - frame.columns.sameElements( - Array("int_col", "field2", "subfield_1", "field2_2", "subfield_2"))) - val results: Array[Row] = frame.collect() - implicit val rowOrdering: Ordering[Row] = Ordering.by[Row, Int](_.getAs[Int](0)) - val expectedResults: Array[Row] = - Array( - Row(30, 123, "value1", 23, "valueA"), - Row(40, 123, "value5", 33, "valueB"), - Row(30, 823, "value4", 83, "valueC"), - Row(40, 456, "value2", 46, "valueD"), - Row(50, 789, "value3", 89, "valueE")).sorted - // Compare the results - assert(results.sorted.sameElements(expectedResults)) - - // duplicate alias names - val frame2 = sql(s""" - | source = $structNestedTable - | | flatten struct_col as (field1, field2_2) - | | flatten field1 as subfield_1 - | | flatten struct_col2 as (field1, field2_2) - | | flatten field1 as subfield_2 - | """.stripMargin) - - // alias names duplicate with existing fields - assert( - frame2.columns.sameElements( - Array("int_col", "field2_2", "subfield_1", "field2_2", "subfield_2"))) - assert(frame2.collect().sorted.sameElements(expectedResults)) - - val frame3 = sql(s""" - | source = $structNestedTable - | | flatten struct_col as (field1, field2_2) - | | flatten field1 as int_col - | | flatten struct_col2 as (field1, field2_2) - | | flatten field1 as int_col - | """.stripMargin) - - assert( - frame3.columns.sameElements(Array("int_col", "field2_2", "int_col", "field2_2", "int_col"))) - assert(frame3.collect().sorted.sameElements(expectedResults)) - - // Throw AnalysisException if The number of aliases supplied in the AS clause does not match the - // number of columns output - val except = intercept[AnalysisException] { - sql(s""" - | source = $structNestedTable - | | flatten struct_col as (field1) - | | flatten field1 as int_col - | | flatten struct_col2 as (field1, field2_2) - | | flatten field1 as int_col - | """.stripMargin) - } - assert(except.message.contains( - "The number of aliases supplied in the AS clause does not match the number of columns output by the UDTF")) - - // Throw AnalysisException because of ambiguous - val except2 = intercept[AnalysisException] { - sql(s""" - | source = $structNestedTable - | | flatten struct_col as (field1, field2_2) - | | flatten field1 as int_col - | | flatten struct_col2 as (field1, field2_2) - | | flatten field1 as int_col - | | fields field2_2 - | """.stripMargin) - } - assert(except2.message.contains( - "[AMBIGUOUS_REFERENCE] Reference `field2_2` is ambiguous, could be: [`field2_2`, `field2_2`].")) - } - } diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 index d15f5c8e3..f3c6acda9 100644 --- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 +++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4 @@ -416,9 +416,6 @@ ISPRESENT: 'ISPRESENT'; BETWEEN: 'BETWEEN'; CIDRMATCH: 'CIDRMATCH'; -// Geo Loction -GEOIP: 'GEOIP'; - // FLOWCONTROL FUNCTIONS IFNULL: 'IFNULL'; NULLIF: 'NULLIF'; @@ -428,6 +425,19 @@ TYPEOF: 'TYPEOF'; //OTHER CONDITIONAL EXPRESSIONS COALESCE: 'COALESCE'; +//GEOLOCATION FUNCTIONS +GEOIP: 'GEOIP'; + +//GEOLOCATION PROPERTIES +COUNTRY_ISO_CODE: 'COUNTRY_ISO_CODE'; +COUNTRY_NAME: 'COUNTRY_NAME'; +CONTINENT_NAME: 'CONTINENT_NAME'; +REGION_ISO_CODE: 'REGION_ISO_CODE'; +REGION_NAME: 'REGION_NAME'; +CITY_NAME: 'CITY_NAME'; +LAT: 'LAT'; +LON: 'LON'; + // RELEVANCE FUNCTIONS AND PARAMETERS MATCH: 'MATCH'; MATCH_PHRASE: 'MATCH_PHRASE'; diff --git a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 index 2466a3d23..b15f59b4b 100644 --- a/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 +++ b/ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4 @@ -45,6 +45,7 @@ commands | headCommand | topCommand | rareCommand + | geoipCommand | evalCommand | grokCommand | parseCommand @@ -177,6 +178,10 @@ evalCommand : EVAL evalClause (COMMA evalClause)* ; +geoipCommand + : EVAL fieldExpression EQUAL GEOIP LT_PRTHS (datasource = functionArg COMMA)? ipAddress = functionArg (COMMA properties = geoIpPropertyList)? RT_PRTHS + ; + headCommand : HEAD (number = integerLiteral)? (FROM from = integerLiteral)? ; @@ -237,16 +242,21 @@ fillnullCommand | fillNullWithFieldVariousValues) ; -fillNullWithTheSameValue - : WITH nullReplacement = valueExpression IN nullableFieldList = fieldList - ; + fillNullWithTheSameValue + : WITH nullReplacement IN nullableField (COMMA nullableField)* + ; + + fillNullWithFieldVariousValues + : USING nullableField EQUAL nullReplacement (COMMA nullableField EQUAL nullReplacement)* + ; -fillNullWithFieldVariousValues - : USING nullableReplacementExpression (COMMA nullableReplacementExpression)* + + nullableField + : fieldExpression ; -nullableReplacementExpression - : nullableField = fieldExpression EQUAL nullableReplacement = valueExpression + nullReplacement + : expression ; expandCommand @@ -254,7 +264,7 @@ expandCommand ; flattenCommand - : FLATTEN fieldExpression (AS alias = identifierSeq)? + : FLATTEN fieldExpression ; trendlineCommand @@ -446,7 +456,6 @@ valueExpression | positionFunction # positionFunctionCall | caseFunction # caseExpr | timestampFunction # timestampFunctionCall - | geoipFunction # geoFunctionCall | LT_PRTHS valueExpression RT_PRTHS # parentheticValueExpr | LT_SQR_PRTHS subSearch RT_SQR_PRTHS # scalarSubqueryExpr | ident ARROW expression # lambda @@ -457,7 +466,6 @@ primaryExpression : evalFunctionCall | fieldExpression | literalValue - | dataTypeFunctionCall ; positionFunction @@ -544,11 +552,6 @@ dataTypeFunctionCall : CAST LT_PRTHS expression AS convertedDataType RT_PRTHS ; -// geoip function -geoipFunction - : GEOIP LT_PRTHS (datasource = functionArg COMMA)? ipAddress = functionArg (COMMA properties = stringLiteral)? RT_PRTHS - ; - // boolean functions booleanFunctionCall : conditionFunctionBase LT_PRTHS functionArgs RT_PRTHS @@ -582,7 +585,6 @@ evalFunctionName | cryptographicFunctionName | jsonFunctionName | collectionFunctionName - | geoipFunctionName | lambdaFunctionName ; @@ -913,6 +915,22 @@ coalesceFunctionName : COALESCE ; +geoIpPropertyList + : geoIpProperty (COMMA geoIpProperty)* + ; + +geoIpProperty + : COUNTRY_ISO_CODE + | COUNTRY_NAME + | CONTINENT_NAME + | REGION_ISO_CODE + | REGION_NAME + | CITY_NAME + | TIME_ZONE + | LAT + | LON + ; + // operators comparisonOperator : EQUAL @@ -1039,11 +1057,6 @@ qualifiedName : ident (DOT ident)* # identsAsQualifiedName ; -identifierSeq - : qualifiedName (COMMA qualifiedName)* # identsAsQualifiedNameSeq - | LT_PRTHS qualifiedName (COMMA qualifiedName)* RT_PRTHS # identsAsQualifiedNameSeq - ; - tableQualifiedName : tableIdent (DOT ident)* # identsAsTableQualifiedName ; @@ -1178,6 +1191,7 @@ keywordsCanBeId | FULL | SEMI | ANTI + | GEOIP | BETWEEN | CIDRMATCH | trendlineType diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java index dadf6b968..87e9f1ecb 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java @@ -13,14 +13,12 @@ import org.opensearch.sql.ast.expression.AttributeList; import org.opensearch.sql.ast.expression.Between; import org.opensearch.sql.ast.expression.Case; -import org.opensearch.sql.ast.expression.Cast; import org.opensearch.sql.ast.expression.Cidr; import org.opensearch.sql.ast.expression.Compare; import org.opensearch.sql.ast.expression.EqualTo; import org.opensearch.sql.ast.expression.Field; import org.opensearch.sql.ast.expression.FieldList; import org.opensearch.sql.ast.expression.LambdaFunction; -import org.opensearch.sql.ast.tree.FieldSummary; import org.opensearch.sql.ast.expression.FieldsMapping; import org.opensearch.sql.ast.expression.Function; import org.opensearch.sql.ast.expression.In; @@ -41,6 +39,7 @@ import org.opensearch.sql.ast.expression.When; import org.opensearch.sql.ast.expression.WindowFunction; import org.opensearch.sql.ast.expression.Xor; +import org.opensearch.sql.ast.tree.FieldSummary; import org.opensearch.sql.ast.statement.Explain; import org.opensearch.sql.ast.statement.Query; import org.opensearch.sql.ast.statement.Statement; @@ -48,7 +47,11 @@ import org.opensearch.sql.ast.tree.Correlation; import org.opensearch.sql.ast.tree.Dedupe; import org.opensearch.sql.ast.tree.Eval; +import org.opensearch.sql.ast.tree.Expand; +import org.opensearch.sql.ast.tree.FillNull; import org.opensearch.sql.ast.tree.Filter; +import org.opensearch.sql.ast.tree.Flatten; +import org.opensearch.sql.ast.tree.GeoIp; import org.opensearch.sql.ast.tree.Head; import org.opensearch.sql.ast.tree.Join; import org.opensearch.sql.ast.tree.Kmeans; @@ -62,8 +65,9 @@ import org.opensearch.sql.ast.tree.Sort; import org.opensearch.sql.ast.tree.SubqueryAlias; import org.opensearch.sql.ast.tree.TableFunction; +import org.opensearch.sql.ast.tree.Trendline; import org.opensearch.sql.ast.tree.Values; -import org.opensearch.sql.ast.tree.*; +import org.opensearch.sql.ast.tree.Window; /** AST nodes visitor Defines the traverse path. */ public abstract class AbstractNodeVisitor { @@ -189,10 +193,6 @@ public T visitFunction(Function node, C context) { return visitChildren(node, context); } - public T visitCast(Cast node, C context) { - return visitChildren(node, context); - } - public T visitLambdaFunction(LambdaFunction node, C context) { return visitChildren(node, context); } @@ -338,9 +338,14 @@ public T visitExistsSubquery(ExistsSubquery node, C context) { return visitChildren(node, context); } + public T visitGeoIp(GeoIp node, C context) { + return visitChildren(node, context); + } + public T visitWindow(Window node, C context) { return visitChildren(node, context); } + public T visitCidr(Cidr node, C context) { return visitChildren(node, context); } diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Alias.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Alias.java index 226ff7a8c..7b3078629 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Alias.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Alias.java @@ -13,23 +13,27 @@ import org.opensearch.sql.ast.AbstractNodeVisitor; /** - * Alias abstraction that associate an unnamed expression with a name. - * The name information preserved is useful for semantic analysis and response formatting + * Alias abstraction that associate an unnamed expression with a name and an optional alias. The + * name and alias information preserved is useful for semantic analysis and response formatting * eventually. This can avoid restoring the info in toString() method which is inaccurate because * original info is already lost. */ +@AllArgsConstructor @EqualsAndHashCode(callSuper = false) @Getter @RequiredArgsConstructor @ToString public class Alias extends UnresolvedExpression { - /** The name to be associated with the result of computing delegated expression. */ + /** Original field name. */ private final String name; /** Expression aliased. */ private final UnresolvedExpression delegated; + /** Optional field alias. */ + private String alias; + @Override public T accept(AbstractNodeVisitor nodeVisitor, C context) { return nodeVisitor.visitAlias(this, context); diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Cast.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Cast.java deleted file mode 100644 index 0668fbf7b..000000000 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/Cast.java +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.sql.ast.expression; - -import java.util.Collections; -import java.util.List; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.RequiredArgsConstructor; -import org.opensearch.sql.ast.AbstractNodeVisitor; - -/** - * Expression node of cast - */ -@Getter -@EqualsAndHashCode(callSuper = false) -@RequiredArgsConstructor -public class Cast extends UnresolvedExpression { - private final UnresolvedExpression expression; - private final DataType dataType; - - @Override - public List getChild() { - return Collections.singletonList(expression); - } - - @Override - public R accept(AbstractNodeVisitor nodeVisitor, C context) { - return nodeVisitor.visitCast(this, context); - } - - @Override - public String toString() { - return String.format("CAST(%s AS %s)", expression, dataType); - } -} diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/DataType.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/DataType.java index 6f0de02f5..9843158b4 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/DataType.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/expression/DataType.java @@ -30,8 +30,4 @@ public enum DataType { INTERVAL(ExprCoreType.INTERVAL); @Getter private final ExprCoreType coreType; - - public static DataType fromString(String name) { - return valueOf(name.toUpperCase()); - } } diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Flatten.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Flatten.java index 36c126591..9c57d2adf 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Flatten.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Flatten.java @@ -7,7 +7,6 @@ import org.opensearch.sql.ast.expression.Field; import java.util.List; -import org.opensearch.sql.ast.expression.UnresolvedExpression; @RequiredArgsConstructor public class Flatten extends UnresolvedPlan { @@ -16,8 +15,6 @@ public class Flatten extends UnresolvedPlan { @Getter private final Field field; - @Getter - private final List aliasSequence; @Override public UnresolvedPlan attach(UnresolvedPlan child) { diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java new file mode 100644 index 000000000..8861694d9 --- /dev/null +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/GeoIp.java @@ -0,0 +1,40 @@ +package org.opensearch.sql.ast.tree; + +import com.google.common.collect.ImmutableList; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.ToString; +import org.opensearch.sql.ast.AbstractNodeVisitor; +import org.opensearch.sql.ast.Node; +import org.opensearch.sql.ast.expression.UnresolvedExpression; + +import java.util.Arrays; +import java.util.List; + +@ToString +@Getter +@RequiredArgsConstructor +@EqualsAndHashCode(callSuper = false) +public class GeoIp extends UnresolvedPlan { + private UnresolvedPlan child; + private final UnresolvedExpression datasource; + private final UnresolvedExpression ipAddress; + private final UnresolvedExpression properties; + + @Override + public List getChild() { + return ImmutableList.of(child); + } + + @Override + public T accept(AbstractNodeVisitor nodeVisitor, C context) { + return nodeVisitor.visitGeoIp(this, context); + } + + @Override + public UnresolvedPlan attach(UnresolvedPlan child) { + this.child = child; + return this; + } +} diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java index 2541b3743..619f558c1 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/expression/function/SerializableUdf.java @@ -8,11 +8,13 @@ import inet.ipaddr.AddressStringException; import inet.ipaddr.IPAddressString; import inet.ipaddr.IPAddressStringParameters; + +import scala.Function1; import scala.Function2; import scala.Serializable; +import scala.runtime.AbstractFunction1; import scala.runtime.AbstractFunction2; - public interface SerializableUdf { Function2 cidrFunction = new SerializableAbstractFunction2<>() { @@ -48,8 +50,57 @@ public Boolean apply(String ipAddress, String cidrBlock) { } return parsedCidrBlock.contains(parsedIpAddress); - } - }; + }}; + + Function1 isIpv4 = new SerializableAbstractFunction1<>() { + + IPAddressStringParameters valOptions = new IPAddressStringParameters.Builder() + .allowEmpty(false) + .setEmptyAsLoopback(false) + .allow_inet_aton(false) + .allowSingleSegment(false) + .toParams(); + + @Override + public Boolean apply(String ipAddress) { + + IPAddressString parsedIpAddress = new IPAddressString(ipAddress, valOptions); + + try { + parsedIpAddress.validate(); + } catch (AddressStringException e) { + throw new RuntimeException("The given ipAddress '"+ipAddress+"' is invalid. It must be a valid IPv4 or IPv6 address. Error details: "+e.getMessage()); + } + + return parsedIpAddress.isIPv4(); + }}; + + Function1 ipToInt = new SerializableAbstractFunction1<>() { + + IPAddressStringParameters valOptions = new IPAddressStringParameters.Builder() + .allowEmpty(false) + .setEmptyAsLoopback(false) + .allow_inet_aton(false) + .allowSingleSegment(false) + .toParams(); + + @Override + public Boolean apply(String ipAddress) { + + IPAddressString parsedIpAddress = new IPAddressString(ipAddress, valOptions); + + try { + parsedIpAddress.validate(); + } catch (AddressStringException e) { + throw new RuntimeException("The given ipAddress '"+ipAddress+"' is invalid. It must be a valid IPv4 or IPv6 address. Error details: "+e.getMessage()); + } + + return parsedIpAddress.isIPv4(); + }}; + + abstract class SerializableAbstractFunction1 extends AbstractFunction1 + implements Serializable { + } abstract class SerializableAbstractFunction2 extends AbstractFunction2 implements Serializable { diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystExpressionVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystExpressionVisitor.java index bc14ba9d4..a651f83e9 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystExpressionVisitor.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystExpressionVisitor.java @@ -8,58 +8,31 @@ import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute; import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute$; import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation; +import org.apache.spark.sql.catalyst.analysis.UnresolvedStar; import org.apache.spark.sql.catalyst.analysis.UnresolvedStar$; import org.apache.spark.sql.catalyst.expressions.CaseWhen; -import org.apache.spark.sql.catalyst.expressions.Cast$; -import org.apache.spark.sql.catalyst.expressions.CurrentRow$; import org.apache.spark.sql.catalyst.expressions.Exists$; import org.apache.spark.sql.catalyst.expressions.Expression; import org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual; import org.apache.spark.sql.catalyst.expressions.In$; import org.apache.spark.sql.catalyst.expressions.InSubquery$; import org.apache.spark.sql.catalyst.expressions.LambdaFunction$; -import org.apache.spark.sql.catalyst.expressions.LessThan; import org.apache.spark.sql.catalyst.expressions.LessThanOrEqual; import org.apache.spark.sql.catalyst.expressions.ListQuery$; import org.apache.spark.sql.catalyst.expressions.MakeInterval$; import org.apache.spark.sql.catalyst.expressions.NamedExpression; import org.apache.spark.sql.catalyst.expressions.Predicate; -import org.apache.spark.sql.catalyst.expressions.RowFrame$; import org.apache.spark.sql.catalyst.expressions.ScalaUDF; import org.apache.spark.sql.catalyst.expressions.ScalarSubquery$; import org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable; import org.apache.spark.sql.catalyst.expressions.UnresolvedNamedLambdaVariable$; -import org.apache.spark.sql.catalyst.expressions.SpecifiedWindowFrame; -import org.apache.spark.sql.catalyst.expressions.WindowExpression; -import org.apache.spark.sql.catalyst.expressions.WindowSpecDefinition; import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan; import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.StructField; +import org.apache.spark.unsafe.types.UTF8String; + import org.opensearch.sql.ast.AbstractNodeVisitor; -import org.opensearch.sql.ast.expression.AggregateFunction; -import org.opensearch.sql.ast.expression.Alias; -import org.opensearch.sql.ast.expression.AllFields; -import org.opensearch.sql.ast.expression.And; -import org.opensearch.sql.ast.expression.Between; -import org.opensearch.sql.ast.expression.BinaryExpression; -import org.opensearch.sql.ast.expression.Case; -import org.opensearch.sql.ast.expression.Cast; -import org.opensearch.sql.ast.expression.Compare; -import org.opensearch.sql.ast.expression.DataType; -import org.opensearch.sql.ast.expression.FieldsMapping; -import org.opensearch.sql.ast.expression.Function; -import org.opensearch.sql.ast.expression.In; -import org.opensearch.sql.ast.expression.Interval; -import org.opensearch.sql.ast.expression.IsEmpty; -import org.opensearch.sql.ast.expression.Literal; -import org.opensearch.sql.ast.expression.Not; -import org.opensearch.sql.ast.expression.Or; -import org.opensearch.sql.ast.expression.LambdaFunction; -import org.opensearch.sql.ast.expression.QualifiedName; -import org.opensearch.sql.ast.expression.Span; -import org.opensearch.sql.ast.expression.UnresolvedExpression; -import org.opensearch.sql.ast.expression.When; -import org.opensearch.sql.ast.expression.WindowFunction; -import org.opensearch.sql.ast.expression.Xor; +import org.opensearch.sql.ast.expression.*; import org.opensearch.sql.ast.expression.subquery.ExistsSubquery; import org.opensearch.sql.ast.expression.subquery.InSubquery; import org.opensearch.sql.ast.expression.subquery.ScalarSubquery; @@ -68,9 +41,7 @@ import org.opensearch.sql.ast.tree.FillNull; import org.opensearch.sql.ast.tree.Kmeans; import org.opensearch.sql.ast.tree.RareTopN; -import org.opensearch.sql.ast.tree.Trendline; import org.opensearch.sql.ast.tree.UnresolvedPlan; -import org.opensearch.sql.expression.function.BuiltinFunctionName; import org.opensearch.sql.expression.function.SerializableUdf; import org.opensearch.sql.ppl.utils.AggregatorTransformer; import org.opensearch.sql.ppl.utils.BuiltinFunctionTransformer; @@ -83,6 +54,7 @@ import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.util.Optional; import java.util.Stack; import java.util.function.BiFunction; @@ -144,7 +116,6 @@ public Expression visitBinaryArithmetic(BinaryExpression node, BiFunction()), Option.empty(), @@ -468,16 +439,6 @@ public Expression visitLambdaFunction(LambdaFunction node, CatalystPlanContext c return context.getNamedParseExpressions().push(LambdaFunction$.MODULE$.apply(functionResult, seq(argsResult), false)); } - @Override - public Expression visitCast(Cast node, CatalystPlanContext context) { - analyze(node.getExpression(), context); - Optional ret = context.popNamedParseExpressions(); - if (ret.isEmpty()) { - throw new UnsupportedOperationException( - String.format("Invalid use of expression %s", node.getExpression())); - } - return context.getNamedParseExpressions().push(Cast$.MODULE$.apply(ret.get(), translate(node.getDataType()), false)); - } private List visitExpressionList(List expressionList, CatalystPlanContext context) { return expressionList.isEmpty() diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java index d7f59bae3..3349715cb 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java @@ -5,16 +5,28 @@ package org.opensearch.sql.ppl; +import org.apache.spark.sql.catalyst.AliasIdentifier; import org.apache.spark.sql.catalyst.TableIdentifier; +import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute$; import org.apache.spark.sql.catalyst.analysis.UnresolvedFunction; import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation; +import org.apache.spark.sql.catalyst.analysis.UnresolvedStar; import org.apache.spark.sql.catalyst.analysis.UnresolvedStar$; +import org.apache.spark.sql.catalyst.expressions.And; import org.apache.spark.sql.catalyst.expressions.Ascending$; +import org.apache.spark.sql.catalyst.expressions.AttributeReference; import org.apache.spark.sql.catalyst.expressions.Descending$; +import org.apache.spark.sql.catalyst.expressions.EqualTo; import org.apache.spark.sql.catalyst.expressions.Explode; +import org.apache.spark.sql.catalyst.expressions.ExprId; import org.apache.spark.sql.catalyst.expressions.Expression; import org.apache.spark.sql.catalyst.expressions.GeneratorOuter; +import org.apache.spark.sql.catalyst.expressions.GreaterThan; +import org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual; +import org.apache.spark.sql.catalyst.expressions.LessThan; +import org.apache.spark.sql.catalyst.expressions.LessThanOrEqual; import org.apache.spark.sql.catalyst.expressions.NamedExpression; +import org.apache.spark.sql.catalyst.expressions.ScalaUDF; import org.apache.spark.sql.catalyst.expressions.SortDirection; import org.apache.spark.sql.catalyst.expressions.SortOrder; import org.apache.spark.sql.catalyst.plans.logical.Aggregate; @@ -23,12 +35,14 @@ import org.apache.spark.sql.catalyst.plans.logical.Generate; import org.apache.spark.sql.catalyst.plans.logical.Limit; import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan; -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan$; import org.apache.spark.sql.catalyst.plans.logical.Project$; +import org.apache.spark.sql.catalyst.plans.logical.Union; import org.apache.spark.sql.execution.ExplainMode; import org.apache.spark.sql.execution.command.DescribeTableCommand; import org.apache.spark.sql.execution.command.ExplainCommand; import org.apache.spark.sql.types.DataTypes; +import org.apache.spark.sql.types.Metadata; +import org.apache.spark.sql.types.StructField; import org.apache.spark.sql.util.CaseInsensitiveStringMap; import org.opensearch.flint.spark.FlattenGenerator; import org.opensearch.sql.ast.AbstractNodeVisitor; @@ -37,6 +51,7 @@ import org.opensearch.sql.ast.expression.Argument; import org.opensearch.sql.ast.expression.Field; import org.opensearch.sql.ast.expression.Function; +import org.opensearch.sql.ast.tree.GeoIp; import org.opensearch.sql.ast.expression.In; import org.opensearch.sql.ast.expression.Let; import org.opensearch.sql.ast.expression.Literal; @@ -71,6 +86,7 @@ import org.opensearch.sql.ast.tree.Trendline; import org.opensearch.sql.ast.tree.Window; import org.opensearch.sql.common.antlr.SyntaxCheckException; +import org.opensearch.sql.expression.function.SerializableUdf; import org.opensearch.sql.ppl.utils.FieldSummaryTransformer; import org.opensearch.sql.ppl.utils.ParseTransformer; import org.opensearch.sql.ppl.utils.SortUtils; @@ -83,12 +99,14 @@ import java.util.ArrayList; import java.util.List; +import java.util.Locale; import java.util.Objects; import java.util.Optional; import java.util.stream.Collectors; import static java.util.Collections.emptyList; import static java.util.List.of; +import static org.opensearch.sql.ppl.CatalystPlanContext.findRelation; import static org.opensearch.sql.ppl.utils.DataTypeTransformer.seq; import static org.opensearch.sql.ppl.utils.DedupeTransformer.retainMultipleDuplicateEvents; import static org.opensearch.sql.ppl.utils.DedupeTransformer.retainMultipleDuplicateEventsAndKeepEmpty; @@ -292,7 +310,6 @@ public LogicalPlan visitSubqueryAlias(SubqueryAlias node, CatalystPlanContext co context.withSubqueryAlias(alias); return alias; }); - } @Override @@ -453,30 +470,10 @@ public LogicalPlan visitFillNull(FillNull fillNull, CatalystPlanContext context) Seq projectExpressions = context.retainAllNamedParseExpressions(p -> (NamedExpression) p); // build the plan with the projection step context.apply(p -> new org.apache.spark.sql.catalyst.plans.logical.Project(projectExpressions, p)); - LogicalPlan resultWithoutDuplicatedColumns = context.apply(dropOriginalColumns(p -> p.children().head(), toDrop)); + LogicalPlan resultWithoutDuplicatedColumns = context.apply(logicalPlan -> DataFrameDropColumns$.MODULE$.apply(seq(toDrop), logicalPlan)); return Objects.requireNonNull(resultWithoutDuplicatedColumns, "FillNull operation failed"); } - /** - * This method is used to generate DataFrameDropColumns operator for dropping duplicated columns - * in the original plan. Then achieving similar effect like updating columns. - * - * PLAN_ID_TAG is a mechanism inner Spark that explicitly specify a plan to resolve the - * UnresolvedAttributes. Set toDrop expressions' PLAN_ID_TAG to the same value as that of the - * original plan, so Spark will resolve them correctly by that plan instead of the child. - */ - private java.util.function.Function dropOriginalColumns( - java.util.function.Function findOriginalPlan, - List toDrop) { - return logicalPlan -> { - LogicalPlan originalPlan = findOriginalPlan.apply(logicalPlan); - long planId = logicalPlan.hashCode(); - originalPlan.setTagValue(LogicalPlan$.MODULE$.PLAN_ID_TAG(), planId); - toDrop.forEach(e -> e.setTagValue(LogicalPlan$.MODULE$.PLAN_ID_TAG(), planId)); - return DataFrameDropColumns$.MODULE$.apply(seq(toDrop), logicalPlan); - }; - } - @Override public LogicalPlan visitFlatten(Flatten flatten, CatalystPlanContext context) { visitFirstChild(flatten, context); @@ -485,13 +482,9 @@ public LogicalPlan visitFlatten(Flatten flatten, CatalystPlanContext context) { context.getNamedParseExpressions().push(UnresolvedStar$.MODULE$.apply(Option.>empty())); } Expression field = visitExpression(flatten.getField(), context); - List alias = flatten.getAliasSequence().stream() - .map(aliasNode -> visitExpression(aliasNode, context)) - .collect(Collectors.toList()); context.retainAllNamedParseExpressions(p -> (NamedExpression) p); FlattenGenerator flattenGenerator = new FlattenGenerator(field); - scala.collection.mutable.Seq outputs = alias.isEmpty() ? seq() : seq(alias); - context.apply(p -> new Generate(new GeneratorOuter(flattenGenerator), seq(), true, (Option) None$.MODULE$, outputs, p)); + context.apply(p -> new Generate(new GeneratorOuter(flattenGenerator), seq(), true, (Option) None$.MODULE$, seq(), p)); return context.apply(logicalPlan -> DataFrameDropColumns$.MODULE$.apply(seq(field), logicalPlan)); } @@ -577,6 +570,103 @@ public LogicalPlan visitEval(Eval node, CatalystPlanContext context) { return context.apply(p -> new org.apache.spark.sql.catalyst.plans.logical.Project(projectExpressions, p)); } + @Override + public LogicalPlan visitGeoIp(GeoIp node, CatalystPlanContext context) { + + visitFirstChild(node, context); +// expressionAnalyzer.analyze(node.getDatasource(), context); +// Expression datasourceExpression = context.getNamedParseExpressions().pop(); + Expression ipAddressExpression = visitExpression(node.getIpAddress(), context); +// expressionAnalyzer.analyze(node.getProperties(), context); + +// List attributeList = new ArrayList<>(); +// Expression nextExpression = context.getNamedParseExpressions().peek(); +// while (nextExpression != null && !(nextExpression instanceof UnresolvedStar)) { +// String attributeName = nextExpression.toString(); +// +// if (attributeList.contains(attributeName)) { +// throw new IllegalStateException("Duplicate attribute in GEOIP attribute list"); +// } +// +// attributeList.add(0, attributeName); +// context.getNamedParseExpressions().pop(); +// nextExpression = context.getNamedParseExpressions().peek(); +// } + + ScalaUDF ipInt = new ScalaUDF(SerializableUdf.ipToInt, + DataTypes.BooleanType, + seq(ipAddressExpression), + seq(), + Option.empty(), + Option.apply("ip_to_int"), + false, + true); + + ScalaUDF isIpv4 = new ScalaUDF(SerializableUdf.isIpv4, + DataTypes.BooleanType, + seq(ipAddressExpression), + seq(), + Option.empty(), + Option.apply("is_ipv4"), + false, + true); + + LogicalPlan plan = context.apply(left -> { + LogicalPlan right = new UnresolvedRelation(seq("geoip"), CaseInsensitiveStringMap.empty(), false); + Optional joinCondition = Optional.of(new And( + new And( + new GreaterThanOrEqual( + ipInt, + UnresolvedAttribute$.MODULE$.apply(seq("ip_range_start")) + ), + new LessThan( + ipInt, + UnresolvedAttribute$.MODULE$.apply(seq("ip_range_end")) + ) + ), + new EqualTo( + isIpv4, + UnresolvedAttribute$.MODULE$.apply(seq("ip_type")) + ) + )); + context.retainAllNamedParseExpressions(p -> p); + context.retainAllPlans(p -> p); + return join(left, + right, + Join.JoinType.INNER, + joinCondition, + new Join.JoinHint()); + }); + + System.out.println("Wow I like Pancakes"); + System.out.println(plan); + + return plan; + } + + private StructField[] createGeoIpStructFields(List attributeList) { + List attributeListToUse; + if (attributeList == null || attributeList.isEmpty()) { + attributeListToUse = List.of( + "country_iso_code", + "country_name", + "continent_name", + "region_iso_code", + "region_name", + "city_name", + "time_zone", + "lat", + "lon" + ); + } else { + attributeListToUse = attributeList; + } + + return attributeListToUse.stream() + .map(a -> DataTypes.createStructField(a.toLowerCase(Locale.ROOT), DataTypes.StringType, true)) + .toArray(StructField[]::new); + } + @Override public LogicalPlan visitKmeans(Kmeans node, CatalystPlanContext context) { throw new IllegalStateException("Not Supported operation : Kmeans"); diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java index d4f9ece87..d2242d9b3 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java @@ -340,6 +340,18 @@ public UnresolvedPlan visitEvalCommand(OpenSearchPPLParser.EvalCommandContext ct .collect(Collectors.toList())); } + @Override + public UnresolvedPlan visitGeoipCommand(OpenSearchPPLParser.GeoipCommandContext ctx) { + UnresolvedExpression datasource = + (ctx.datasource != null) ? + internalVisitExpression(ctx.datasource) : + // TODO Make default value var + new Literal("https://geoip.maps.opensearch.org/v1/geolite2-city/manifest.json", DataType.STRING); + UnresolvedExpression ipAddress = internalVisitExpression(ctx.ipAddress); + UnresolvedExpression properties = ctx.properties == null ? new AttributeList(Collections.emptyList()) : internalVisitExpression(ctx.properties); + return new GeoIp(datasource, ipAddress, properties); + } + private List getGroupByList(OpenSearchPPLParser.ByClauseContext ctx) { return ctx.fieldList().fieldExpression().stream() .map(this::internalVisitExpression) @@ -581,18 +593,19 @@ public UnresolvedPlan visitFillnullCommand(OpenSearchPPLParser.FillnullCommandCo FillNullWithFieldVariousValuesContext variousValuesContext = ctx.fillNullWithFieldVariousValues(); if (sameValueContext != null) { // todo consider using expression instead of Literal - UnresolvedExpression replaceNullWithMe = internalVisitExpression(sameValueContext.nullReplacement); - List fieldsToReplace = sameValueContext.nullableFieldList.fieldExpression() + UnresolvedExpression replaceNullWithMe = internalVisitExpression(sameValueContext.nullReplacement().expression()); + List fieldsToReplace = sameValueContext.nullableField() .stream() .map(this::internalVisitExpression) .map(Field.class::cast) .collect(Collectors.toList()); return new FillNull(ofSameValue(replaceNullWithMe, fieldsToReplace)); } else if (variousValuesContext != null) { - List nullableFieldFills = IntStream.range(0, variousValuesContext.nullableReplacementExpression().size()) + List nullableFieldFills = IntStream.range(0, variousValuesContext.nullableField().size()) .mapToObj(index -> { - UnresolvedExpression replaceNullWithMe = internalVisitExpression(variousValuesContext.nullableReplacementExpression(index).nullableReplacement); - Field nullableFieldReference = (Field) internalVisitExpression(variousValuesContext.nullableReplacementExpression(index).nullableField); + variousValuesContext.nullableField(index); + UnresolvedExpression replaceNullWithMe = internalVisitExpression(variousValuesContext.nullReplacement(index).expression()); + Field nullableFieldReference = (Field) internalVisitExpression(variousValuesContext.nullableField(index)); return new NullableFieldFill(nullableFieldReference, replaceNullWithMe); }) .collect(Collectors.toList()); @@ -605,8 +618,7 @@ public UnresolvedPlan visitFillnullCommand(OpenSearchPPLParser.FillnullCommandCo @Override public UnresolvedPlan visitFlattenCommand(OpenSearchPPLParser.FlattenCommandContext ctx) { Field unresolvedExpression = (Field) internalVisitExpression(ctx.fieldExpression()); - List alias = ctx.alias == null ? emptyList() : ((AttributeList) internalVisitExpression(ctx.alias)).getAttrList(); - return new Flatten(unresolvedExpression, alias); + return new Flatten(unresolvedExpression); } /** AD command. */ diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java index 1fe57d13e..e9e4c7cbe 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java @@ -19,13 +19,13 @@ import org.opensearch.sql.ast.expression.AttributeList; import org.opensearch.sql.ast.expression.Between; import org.opensearch.sql.ast.expression.Case; -import org.opensearch.sql.ast.expression.Cast; import org.opensearch.sql.ast.expression.Cidr; import org.opensearch.sql.ast.expression.Compare; import org.opensearch.sql.ast.expression.DataType; import org.opensearch.sql.ast.expression.EqualTo; import org.opensearch.sql.ast.expression.Field; import org.opensearch.sql.ast.expression.Function; +import org.opensearch.sql.ast.tree.GeoIp; import org.opensearch.sql.ast.expression.In; import org.opensearch.sql.ast.expression.Interval; import org.opensearch.sql.ast.expression.IntervalUnit; @@ -45,8 +45,6 @@ import org.opensearch.sql.ast.expression.subquery.ExistsSubquery; import org.opensearch.sql.ast.expression.subquery.InSubquery; import org.opensearch.sql.ast.expression.subquery.ScalarSubquery; -import org.opensearch.sql.ast.tree.Trendline; -import org.opensearch.sql.common.antlr.SyntaxCheckException; import org.opensearch.sql.common.utils.StringUtils; import org.opensearch.sql.ppl.utils.ArgumentFactory; @@ -280,9 +278,9 @@ public UnresolvedExpression visitEvalFunctionCall(OpenSearchPPLParser.EvalFuncti return buildFunction(ctx.evalFunctionName().getText(), ctx.functionArgs().functionArg()); } - @Override public UnresolvedExpression visitDataTypeFunctionCall(OpenSearchPPLParser.DataTypeFunctionCallContext ctx) { - // TODO: for long term consideration, needs to implement DataTypeBuilder/Visitor to parse all data types - return new Cast(this.visit(ctx.expression()), DataType.fromString(ctx.convertedDataType().getText())); + @Override + public UnresolvedExpression visitConvertedDataType(OpenSearchPPLParser.ConvertedDataTypeContext ctx) { + return new Literal(ctx.getText(), DataType.STRING); } @Override @@ -330,11 +328,6 @@ public UnresolvedExpression visitIdentsAsQualifiedName(OpenSearchPPLParser.Ident return visitIdentifiers(ctx.ident()); } - @Override - public UnresolvedExpression visitIdentsAsQualifiedNameSeq(OpenSearchPPLParser.IdentsAsQualifiedNameSeqContext ctx) { - return new AttributeList(ctx.qualifiedName().stream().map(this::visit).collect(Collectors.toList())); - } - @Override public UnresolvedExpression visitIdentsAsTableQualifiedName( OpenSearchPPLParser.IdentsAsTableQualifiedNameContext ctx) { @@ -383,7 +376,8 @@ public UnresolvedExpression visitBooleanLiteral(OpenSearchPPLParser.BooleanLiter public UnresolvedExpression visitBySpanClause(OpenSearchPPLParser.BySpanClauseContext ctx) { String name = ctx.spanClause().getText(); return ctx.alias != null - ? new Alias(StringUtils.unquoteIdentifier(ctx.alias.getText()), visit(ctx.spanClause())) + ? new Alias( + name, visit(ctx.spanClause()), StringUtils.unquoteIdentifier(ctx.alias.getText())) : new Alias(name, visit(ctx.spanClause())); } @@ -450,6 +444,39 @@ public UnresolvedExpression visitLambda(OpenSearchPPLParser.LambdaContext ctx) { return new LambdaFunction(function, arguments); } + @Override + public UnresolvedExpression visitGeoIpPropertyList(OpenSearchPPLParser.GeoIpPropertyListContext ctx) { + ImmutableList.Builder properties = ImmutableList.builder(); + if (ctx != null) { + for (OpenSearchPPLParser.GeoIpPropertyContext property : ctx.geoIpProperty()) { + String propertyName; + if (property.COUNTRY_ISO_CODE() != null) { + propertyName = "COUNTRY_ISO_CODE"; + } else if (property.COUNTRY_NAME() != null) { + propertyName = "COUNTRY_NAME"; + } else if (property.CONTINENT_NAME() != null) { + propertyName = "CONTINENT_NAME"; + } else if (property.REGION_ISO_CODE() != null) { + propertyName = "REGION_ISO_CODE"; + } else if (property.CITY_NAME() != null) { + propertyName = "CITY_NAME"; + } else if (property.TIME_ZONE() != null) { + propertyName = "TIME_ZONE"; + } else if (property.LAT() != null) { + propertyName = "LAT"; + } else if (property.LON() != null) { + propertyName = "LON"; + } else { + continue; + } + + properties.add(new Literal(propertyName, DataType.STRING)); + } + } + + return new AttributeList(properties.build()); + } + private List timestampFunctionArguments( OpenSearchPPLParser.TimestampFunctionCallContext ctx) { List args = diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java index f583d7847..e4defad52 100644 --- a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java @@ -9,7 +9,6 @@ import org.apache.spark.sql.types.BooleanType$; import org.apache.spark.sql.types.ByteType$; import org.apache.spark.sql.types.DataType; -import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.DateType$; import org.apache.spark.sql.types.DoubleType$; import org.apache.spark.sql.types.FloatType$; @@ -50,12 +49,8 @@ static Seq seq(List list) { static DataType translate(org.opensearch.sql.ast.expression.DataType source) { switch (source.getCoreType()) { - case DATE: + case TIME: return DateType$.MODULE$; - case TIMESTAMP: - return DataTypes.TimestampType; - case STRING: - return DataTypes.StringType; case INTEGER: return IntegerType$.MODULE$; case LONG: @@ -73,7 +68,7 @@ static DataType translate(org.opensearch.sql.ast.expression.DataType source) { case UNDEFINED: return NullType$.MODULE$; default: - throw new IllegalArgumentException("Unsupported data type for Spark: " + source); + return StringType$.MODULE$; } } @@ -125,4 +120,4 @@ static String translate(SpanUnit unit) { } return ""; } -} +} \ No newline at end of file diff --git a/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoipCatalystUtils.java b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoipCatalystUtils.java new file mode 100644 index 000000000..a35114140 --- /dev/null +++ b/ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/GeoipCatalystUtils.java @@ -0,0 +1,4 @@ +package org.opensearch.sql.ppl.utils; + +public interface GeoipCatalystUtils { +} diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCastTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCastTestSuite.scala deleted file mode 100644 index 829b7ff1f..000000000 --- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanCastTestSuite.scala +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.flint.spark.ppl - -import org.opensearch.flint.spark.ppl.PlaneUtils.plan -import org.opensearch.sql.common.antlr.SyntaxCheckException -import org.opensearch.sql.ppl.{CatalystPlanContext, CatalystQueryPlanVisitor} -import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq -import org.scalatest.matchers.should.Matchers - -import org.apache.spark.SparkFunSuite -import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedRelation, UnresolvedStar} -import org.apache.spark.sql.catalyst.expressions.{Alias, Cast, Literal} -import org.apache.spark.sql.catalyst.plans.PlanTest -import org.apache.spark.sql.catalyst.plans.logical.Project -import org.apache.spark.sql.types.{IntegerType, StringType} - -class PPLLogicalPlanCastTestSuite - extends SparkFunSuite - with PlanTest - with LogicalPlanTestUtils - with Matchers { - - private val planTransformer = new CatalystQueryPlanVisitor() - private val pplParser = new PPLSyntaxParser() - - test("test cast with case sensitive") { - val table = UnresolvedRelation(Seq("t")) - val expectedPlan = Project( - seq(UnresolvedStar(None)), - Project( - seq(UnresolvedStar(None), Alias(Cast(UnresolvedAttribute("a"), StringType), "a")()), - table)) - - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit(plan(pplParser, """source=t | eval a = cast(a as STRING)"""), context) - comparePlans(expectedPlan, logPlan, false) - - // test case insensitive - val context2 = new CatalystPlanContext - val logPlan2 = - planTransformer.visit( - plan(pplParser, """source=t | eval a = cast(a as string)"""), - context2) - comparePlans(expectedPlan, logPlan2, false) - } - - test("test cast literal") { - val table = UnresolvedRelation(Seq("t")) - val expectedPlan = Project( - seq(UnresolvedStar(None)), - Project( - seq( - UnresolvedStar(None), - Alias(Cast(Cast(Literal("a"), IntegerType), StringType), "a")()), - table)) - - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit( - plan(pplParser, """source=t | eval a = cast(cast("a" as INTEGER) as STRING)"""), - context) - comparePlans(expectedPlan, logPlan, false) - } - - test("test chained cast") { - val table = UnresolvedRelation(Seq("t")) - val expectedPlan = Project( - seq(UnresolvedStar(None)), - Project( - seq( - UnresolvedStar(None), - Alias(Cast(Cast(UnresolvedAttribute("a"), IntegerType), StringType), "a")()), - table)) - - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit( - plan(pplParser, """source=t | eval a = cast(cast(a as INTEGER) as STRING)"""), - context) - comparePlans(expectedPlan, logPlan, false) - } - - test("test cast with unsupported dataType") { - // Unsupported data type for opensearch parser - val context = new CatalystPlanContext - val exception = intercept[SyntaxCheckException] { - planTransformer.visit( - plan(pplParser, """source=t | eval a = cast(a as UNSUPPORTED_DATATYPE)"""), - context) - } - assert( - exception.getMessage.contains( - "Failed to parse query due to offending symbol [UNSUPPORTED_DATATYPE]")) - - // Unsupported data type for Spark - val context2 = new CatalystPlanContext - val exception2 = intercept[IllegalArgumentException] { - planTransformer.visit(plan(pplParser, """source=t | eval a = cast(a as time)"""), context2) - } - assert(exception2.getMessage == "Unsupported data type for Spark: TIME") - } - -} diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanEvalTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanEvalTranslatorTestSuite.scala index ba0d78670..1b61dc98f 100644 --- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanEvalTranslatorTestSuite.scala +++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanEvalTranslatorTestSuite.scala @@ -6,15 +6,17 @@ package org.opensearch.flint.spark.ppl import org.opensearch.flint.spark.ppl.PlaneUtils.plan +import org.opensearch.sql.expression.function.SerializableUdf import org.opensearch.sql.ppl.{CatalystPlanContext, CatalystQueryPlanVisitor} import org.opensearch.sql.ppl.utils.DataTypeTransformer.seq import org.scalatest.matchers.should.Matchers import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} -import org.apache.spark.sql.catalyst.expressions.{Alias, Descending, ExprId, In, Literal, NamedExpression, SortOrder} -import org.apache.spark.sql.catalyst.plans.PlanTest -import org.apache.spark.sql.catalyst.plans.logical.{Project, Sort} +import org.apache.spark.sql.catalyst.expressions.{Alias, And, Descending, EqualTo, ExprId, GreaterThanOrEqual, In, LessThan, Literal, NamedExpression, ScalaUDF, SortOrder} +import org.apache.spark.sql.catalyst.plans.{Inner, PlanTest} +import org.apache.spark.sql.catalyst.plans.logical.{Join, JoinHint, Project, Sort, SubqueryAlias} +import org.apache.spark.sql.types.DataTypes class PPLLogicalPlanEvalTranslatorTestSuite extends SparkFunSuite @@ -25,192 +27,243 @@ class PPLLogicalPlanEvalTranslatorTestSuite private val planTransformer = new CatalystQueryPlanVisitor() private val pplParser = new PPLSyntaxParser() - test("test eval expressions not included in fields expressions") { - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 1 | fields c"), context) - val evalProjectList: Seq[NamedExpression] = - Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "b")()) - val expectedPlan = Project( - seq(UnresolvedAttribute("c")), - Project(evalProjectList, UnresolvedRelation(Seq("t")))) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } - - test("test eval expressions included in fields expression") { - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit( - plan(pplParser, "source=t | eval a = 1, c = 1 | fields a, b, c"), - context) +// test("test eval expressions not included in fields expressions") { +// val context = new CatalystPlanContext +// val logPlan = +// planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 1 | fields c"), context) +// val evalProjectList: Seq[NamedExpression] = +// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "b")()) +// val expectedPlan = Project( +// seq(UnresolvedAttribute("c")), +// Project(evalProjectList, UnresolvedRelation(Seq("t")))) +// comparePlans(expectedPlan, logPlan, checkAnalysis = false) +// } - val evalProjectList: Seq[NamedExpression] = - Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "c")()) - val expectedPlan = Project( - seq(UnresolvedAttribute("a"), UnresolvedAttribute("b"), UnresolvedAttribute("c")), - Project(evalProjectList, UnresolvedRelation(Seq("t")))) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } +// test("test eval expressions included in fields expression") { +// val context = new CatalystPlanContext +// val logPlan = +// planTransformer.visit( +// plan(pplParser, "source=t | eval a = 1, c = 1 | fields a, b, c"), +// context) +// +// val evalProjectList: Seq[NamedExpression] = +// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "c")()) +// val expectedPlan = Project( +// seq(UnresolvedAttribute("a"), UnresolvedAttribute("b"), UnresolvedAttribute("c")), +// Project(evalProjectList, UnresolvedRelation(Seq("t")))) +// comparePlans(expectedPlan, logPlan, checkAnalysis = false) +// } +// +// test("test eval expressions without fields command") { +// val context = new CatalystPlanContext +// val logPlan = +// planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 1"), context) +// +// val evalProjectList: Seq[NamedExpression] = +// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "b")()) +// val expectedPlan = +// Project(seq(UnresolvedStar(None)), Project(evalProjectList, UnresolvedRelation(Seq("t")))) +// comparePlans(expectedPlan, logPlan, checkAnalysis = false) +// } +// +// test("test eval expressions with sort") { +// val context = new CatalystPlanContext +// val logPlan = +// planTransformer.visit( +// plan(pplParser, "source=t | eval a = 1, b = 1 | sort - a | fields b"), +// context) +// +// val evalProjectList: Seq[NamedExpression] = +// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "b")()) +// val evalProject = Project(evalProjectList, UnresolvedRelation(Seq("t"))) +// val sortOrder = SortOrder(UnresolvedAttribute("a"), Descending, Seq.empty) +// val sort = Sort(seq(sortOrder), global = true, evalProject) +// val expectedPlan = Project(seq(UnresolvedAttribute("b")), sort) +// comparePlans(expectedPlan, logPlan, checkAnalysis = false) +// } +// +// test("test eval expressions with multiple recursive sort") { +// val context = new CatalystPlanContext +// val logPlan = +// planTransformer.visit( +// plan(pplParser, "source=t | eval a = 1, a = a | sort - a | fields b"), +// context) +// +// val evalProjectList: Seq[NamedExpression] = +// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(UnresolvedAttribute("a"), "a")()) +// val evalProject = Project(evalProjectList, UnresolvedRelation(Seq("t"))) +// val sortOrder = SortOrder(UnresolvedAttribute("a"), Descending, Seq.empty) +// val sort = Sort(seq(sortOrder), global = true, evalProject) +// val expectedPlan = Project(seq(UnresolvedAttribute("b")), sort) +// comparePlans(expectedPlan, logPlan, checkAnalysis = false) +// } +// +// test("test multiple eval expressions") { +// val context = new CatalystPlanContext +// val logPlan = +// planTransformer.visit( +// plan(pplParser, "source=t | eval a = 1, b = 'hello' | eval b = a | sort - b | fields b"), +// context) +// +// val evalProjectList1: Seq[NamedExpression] = +// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal("hello"), "b")()) +// val evalProjectList2: Seq[NamedExpression] = Seq( +// UnresolvedStar(None), +// Alias(UnresolvedAttribute("a"), "b")(exprId = ExprId(2), qualifier = Seq.empty)) +// val evalProject1 = Project(evalProjectList1, UnresolvedRelation(Seq("t"))) +// val evalProject2 = Project(evalProjectList2, evalProject1) +// val sortOrder = SortOrder(UnresolvedAttribute("b"), Descending, Seq.empty) +// val sort = Sort(seq(sortOrder), global = true, evalProject2) +// val expectedPlan = Project(seq(UnresolvedAttribute("b")), sort) +// comparePlans(expectedPlan, logPlan, checkAnalysis = false) +// } +// +// test("test complex eval expressions - date function") { +// val context = new CatalystPlanContext +// val logPlan = +// planTransformer.visit( +// plan(pplParser, "source=t | eval a = TIMESTAMP('2020-09-16 17:30:00') | fields a"), +// context) +// +// val evalProjectList: Seq[NamedExpression] = Seq( +// UnresolvedStar(None), +// Alias( +// UnresolvedFunction("timestamp", seq(Literal("2020-09-16 17:30:00")), isDistinct = false), +// "a")()) +// val expectedPlan = Project( +// seq(UnresolvedAttribute("a")), +// Project(evalProjectList, UnresolvedRelation(Seq("t")))) +// comparePlans(expectedPlan, logPlan, checkAnalysis = false) +// } +// +// test("test complex eval expressions - math function") { +// val context = new CatalystPlanContext +// val logPlan = +// planTransformer.visit(plan(pplParser, "source=t | eval a = RAND() | fields a"), context) +// +// val evalProjectList: Seq[NamedExpression] = Seq( +// UnresolvedStar(None), +// Alias(UnresolvedFunction("rand", Seq.empty, isDistinct = false), "a")( +// exprId = ExprId(0), +// qualifier = Seq.empty)) +// val expectedPlan = Project( +// seq(UnresolvedAttribute("a")), +// Project(evalProjectList, UnresolvedRelation(Seq("t")))) +// comparePlans(expectedPlan, logPlan, checkAnalysis = false) +// } +// +// test("test complex eval expressions - compound function") { +// val context = new CatalystPlanContext +// val logPlan = +// planTransformer.visit( +// plan(pplParser, "source=t | eval a = if(like(b, '%Hello%'), 'World', 'Hi') | fields a"), +// context) +// +// val evalProjectList: Seq[NamedExpression] = Seq( +// UnresolvedStar(None), +// Alias( +// UnresolvedFunction( +// "if", +// seq( +// UnresolvedFunction( +// "like", +// seq(UnresolvedAttribute("b"), Literal("%Hello%")), +// isDistinct = false), +// Literal("World"), +// Literal("Hi")), +// isDistinct = false), +// "a")()) +// val expectedPlan = Project( +// seq(UnresolvedAttribute("a")), +// Project(evalProjectList, UnresolvedRelation(Seq("t")))) +// comparePlans(expectedPlan, logPlan, checkAnalysis = false) +// } - test("test eval expressions without fields command") { + test("test eval expression - geoip function") { val context = new CatalystPlanContext - val logPlan = - planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 1"), context) - val evalProjectList: Seq[NamedExpression] = - Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "b")()) - val expectedPlan = - Project(seq(UnresolvedStar(None)), Project(evalProjectList, UnresolvedRelation(Seq("t")))) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } + //scalastyle:off + println("Wow I like Pancakes"); + //scalastyle:on - test("test eval expressions with sort") { - val context = new CatalystPlanContext val logPlan = planTransformer.visit( - plan(pplParser, "source=t | eval a = 1, b = 1 | sort - a | fields b"), + plan(pplParser, "source=t | eval a = geoip(lol,ip_address,TIME_ZONE)"), context) - val evalProjectList: Seq[NamedExpression] = - Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(1), "b")()) - val evalProject = Project(evalProjectList, UnresolvedRelation(Seq("t"))) - val sortOrder = SortOrder(UnresolvedAttribute("a"), Descending, Seq.empty) - val sort = Sort(seq(sortOrder), global = true, evalProject) - val expectedPlan = Project(seq(UnresolvedAttribute("b")), sort) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } + //scalastyle:off + println("Wow I like Pancakes"); + //scalastyle:on - test("test eval expressions with multiple recursive sort") { - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit( - plan(pplParser, "source=t | eval a = 1, a = a | sort - a | fields b"), - context) + val ipAddress = UnresolvedAttribute("ip_address") - val evalProjectList: Seq[NamedExpression] = - Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(UnresolvedAttribute("a"), "a")()) - val evalProject = Project(evalProjectList, UnresolvedRelation(Seq("t"))) - val sortOrder = SortOrder(UnresolvedAttribute("a"), Descending, Seq.empty) - val sort = Sort(seq(sortOrder), global = true, evalProject) - val expectedPlan = Project(seq(UnresolvedAttribute("b")), sort) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } + val is_ipv4 = ScalaUDF( + SerializableUdf.isIpv4, + DataTypes.BooleanType, + seq(ipAddress), + seq(), + Option.empty, + Option.apply("is_ipv4") + ) - test("test multiple eval expressions") { - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit( - plan(pplParser, "source=t | eval a = 1, b = 'hello' | eval b = a | sort - b | fields b"), - context) + val ip_int = ScalaUDF( + SerializableUdf.ipToInt, + DataTypes.IntegerType, + seq(ipAddress), + seq(), + Option.empty, + Option.apply("ip_to_int") + ) - val evalProjectList1: Seq[NamedExpression] = - Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal("hello"), "b")()) - val evalProjectList2: Seq[NamedExpression] = Seq( - UnresolvedStar(None), - Alias(UnresolvedAttribute("a"), "b")(exprId = ExprId(2), qualifier = Seq.empty)) - val evalProject1 = Project(evalProjectList1, UnresolvedRelation(Seq("t"))) - val evalProject2 = Project(evalProjectList2, evalProject1) - val sortOrder = SortOrder(UnresolvedAttribute("b"), Descending, Seq.empty) - val sort = Sort(seq(sortOrder), global = true, evalProject2) - val expectedPlan = Project(seq(UnresolvedAttribute("b")), sort) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } + val sourceTable = SubqueryAlias("l", UnresolvedRelation(seq("users"))) + val geoTable = SubqueryAlias("r", UnresolvedRelation(seq("geoip"))) - test("test complex eval expressions - date function") { - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit( - plan(pplParser, "source=t | eval a = TIMESTAMP('2020-09-16 17:30:00') | fields a"), - context) - - val evalProjectList: Seq[NamedExpression] = Seq( - UnresolvedStar(None), - Alias( - UnresolvedFunction("timestamp", seq(Literal("2020-09-16 17:30:00")), isDistinct = false), - "a")()) - val expectedPlan = Project( - seq(UnresolvedAttribute("a")), - Project(evalProjectList, UnresolvedRelation(Seq("t")))) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } + val ipRangeStartCondition = GreaterThanOrEqual(ip_int, UnresolvedAttribute("r.ip_t")) + val ipRangeEndCondition = LessThan(ip_int, UnresolvedAttribute("r.ip")) + val isIpv4Condition = EqualTo(is_ipv4, UnresolvedAttribute("r.ip_type")) - test("test complex eval expressions - math function") { - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit(plan(pplParser, "source=t | eval a = RAND() | fields a"), context) - - val evalProjectList: Seq[NamedExpression] = Seq( - UnresolvedStar(None), - Alias(UnresolvedFunction("rand", Seq.empty, isDistinct = false), "a")( - exprId = ExprId(0), - qualifier = Seq.empty)) - val expectedPlan = Project( - seq(UnresolvedAttribute("a")), - Project(evalProjectList, UnresolvedRelation(Seq("t")))) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } + val joinCondition = And(And(ipRangeStartCondition, ipRangeEndCondition), isIpv4Condition) - test("test complex eval expressions - compound function") { - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit( - plan(pplParser, "source=t | eval a = if(like(b, '%Hello%'), 'World', 'Hi') | fields a"), - context) + val joinPlan = Join(sourceTable, geoTable, Inner, Some(joinCondition), JoinHint.NONE) + val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) - val evalProjectList: Seq[NamedExpression] = Seq( - UnresolvedStar(None), - Alias( - UnresolvedFunction( - "if", - seq( - UnresolvedFunction( - "like", - seq(UnresolvedAttribute("b"), Literal("%Hello%")), - isDistinct = false), - Literal("World"), - Literal("Hi")), - isDistinct = false), - "a")()) - val expectedPlan = Project( - seq(UnresolvedAttribute("a")), - Project(evalProjectList, UnresolvedRelation(Seq("t")))) comparePlans(expectedPlan, logPlan, checkAnalysis = false) } - // Todo fields-excluded command not supported - ignore("test eval expressions with fields-excluded command") { - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 2 | fields - b"), context) - - val projectList: Seq[NamedExpression] = - Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(2), "b")()) - val expectedPlan = Project(projectList, UnresolvedRelation(Seq("t"))) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } - - // Todo fields-included command not supported - ignore("test eval expressions with fields-included command") { - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 2 | fields + b"), context) - - val projectList: Seq[NamedExpression] = - Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(2), "b")()) - val expectedPlan = Project(projectList, UnresolvedRelation(Seq("t"))) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } - - test("test IN expr in eval") { - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit( - plan(pplParser, "source=t | eval in = a in ('Hello', 'World') | fields in"), - context) - - val in = Alias(In(UnresolvedAttribute("a"), Seq(Literal("Hello"), Literal("World"))), "in")() - val eval = Project(Seq(UnresolvedStar(None), in), UnresolvedRelation(Seq("t"))) - val expectedPlan = Project(Seq(UnresolvedAttribute("in")), eval) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } +// // Todo fields-excluded command not supported +// ignore("test eval expressions with fields-excluded command") { +// val context = new CatalystPlanContext +// val logPlan = +// planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 2 | fields - b"), context) +// +// val projectList: Seq[NamedExpression] = +// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(2), "b")()) +// val expectedPlan = Project(projectList, UnresolvedRelation(Seq("t"))) +// comparePlans(expectedPlan, logPlan, checkAnalysis = false) +// } +// +// // Todo fields-included command not supported +// ignore("test eval expressions with fields-included command") { +// val context = new CatalystPlanContext +// val logPlan = +// planTransformer.visit(plan(pplParser, "source=t | eval a = 1, b = 2 | fields + b"), context) +// +// val projectList: Seq[NamedExpression] = +// Seq(UnresolvedStar(None), Alias(Literal(1), "a")(), Alias(Literal(2), "b")()) +// val expectedPlan = Project(projectList, UnresolvedRelation(Seq("t"))) +// comparePlans(expectedPlan, logPlan, checkAnalysis = false) +// } +//// +// test("test IN expr in eval") { +// val context = new CatalystPlanContext +// val logPlan = +// planTransformer.visit( +// plan(pplParser, "source=t | eval in = a in ('Hello', 'World') | fields in"), +// context) +// +// val in = Alias(In(UnresolvedAttribute("a"), Seq(Literal("Hello"), Literal("World"))), "in")() +// val eval = Project(Seq(UnresolvedStar(None), in), UnresolvedRelation(Seq("t"))) +// val expectedPlan = Project(Seq(UnresolvedAttribute("in")), eval) +// comparePlans(expectedPlan, logPlan, checkAnalysis = false) +// } } diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFlattenCommandTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFlattenCommandTranslatorTestSuite.scala index 543e5c05d..58a6c04b3 100644 --- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFlattenCommandTranslatorTestSuite.scala +++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFlattenCommandTranslatorTestSuite.scala @@ -13,9 +13,9 @@ import org.scalatest.matchers.should.Matchers import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.analysis.{UnresolvedAttribute, UnresolvedFunction, UnresolvedRelation, UnresolvedStar} -import org.apache.spark.sql.catalyst.expressions.{Alias, GeneratorOuter, Literal, RegExpExtract} +import org.apache.spark.sql.catalyst.expressions.{Alias, Descending, GeneratorOuter, Literal, NullsLast, RegExpExtract, SortOrder} import org.apache.spark.sql.catalyst.plans.PlanTest -import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, DataFrameDropColumns, Generate, Project} +import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, DataFrameDropColumns, Generate, GlobalLimit, LocalLimit, Project, Sort} import org.apache.spark.sql.types.IntegerType class PPLLogicalPlanFlattenCommandTranslatorTestSuite @@ -153,45 +153,4 @@ class PPLLogicalPlanFlattenCommandTranslatorTestSuite comparePlans(expectedPlan, logPlan, checkAnalysis = false) } - test("test flatten with one alias") { - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit( - plan(pplParser, "source=relation | flatten field_with_array as col1"), - context) - - val relation = UnresolvedRelation(Seq("relation")) - val flattenGenerator = new FlattenGenerator(UnresolvedAttribute("field_with_array")) - val outerGenerator = GeneratorOuter(flattenGenerator) - val generate = - Generate(outerGenerator, seq(), true, None, Seq(UnresolvedAttribute("col1")), relation) - val dropSourceColumn = - DataFrameDropColumns(Seq(UnresolvedAttribute("field_with_array")), generate) - val expectedPlan = Project(seq(UnresolvedStar(None)), dropSourceColumn) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } - - test("test flatten with alias list") { - val context = new CatalystPlanContext - val logPlan = - planTransformer.visit( - plan(pplParser, "source=relation | flatten field_with_array as (col1, col2)"), - context) - - val relation = UnresolvedRelation(Seq("relation")) - val flattenGenerator = new FlattenGenerator(UnresolvedAttribute("field_with_array")) - val outerGenerator = GeneratorOuter(flattenGenerator) - val generate = Generate( - outerGenerator, - seq(), - true, - None, - Seq(UnresolvedAttribute("col1"), UnresolvedAttribute("col2")), - relation) - val dropSourceColumn = - DataFrameDropColumns(Seq(UnresolvedAttribute("field_with_array")), generate) - val expectedPlan = Project(seq(UnresolvedStar(None)), dropSourceColumn) - comparePlans(expectedPlan, logPlan, checkAnalysis = false) - } - } diff --git a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanJoinTranslatorTestSuite.scala b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanJoinTranslatorTestSuite.scala index f4ed397e3..d75de8d9f 100644 --- a/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanJoinTranslatorTestSuite.scala +++ b/ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanJoinTranslatorTestSuite.scala @@ -30,12 +30,30 @@ class PPLLogicalPlanJoinTranslatorTestSuite private val testTable3 = "spark_catalog.default.flint_ppl_test3" private val testTable4 = "spark_catalog.default.flint_ppl_test4" +// test("test two-tables inner join: join condition with aliases") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| JOIN left = l right = r ON l.id = r.id $testTable2 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightPlan = SubqueryAlias("r", table2) +// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) +// val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } + test("test two-tables inner join: join condition with aliases") { val context = new CatalystPlanContext val logPlan = plan( pplParser, s""" - | source = $testTable1| JOIN left = l right = r ON l.id = r.id $testTable2 + | source=users | join left = t1 right = t2 on t1.ip_int>=t2.ip_range_start and t1.ip_int 10 AND lower(r.name) = 'hello' $testTable2 - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val leftPlan = SubqueryAlias("l", table1) - val rightPlan = SubqueryAlias("r", table2) - val joinCondition = And( - And( - EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")), - EqualTo( - Literal("hello"), - UnresolvedFunction.apply( - "lower", - Seq(UnresolvedAttribute("r.name")), - isDistinct = false))), - LessThan(Literal(10), UnresolvedAttribute("l.count"))) - val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test inner join: join condition with table names and predicates") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1| INNER JOIN left = l right = r ON $testTable1.id = $testTable2.id AND $testTable1.count > 10 AND lower($testTable2.name) = 'hello' $testTable2 - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val leftPlan = SubqueryAlias("l", table1) - val rightPlan = SubqueryAlias("r", table2) - val joinCondition = And( - And( - EqualTo(UnresolvedAttribute(s"$testTable1.id"), UnresolvedAttribute(s"$testTable2.id")), - EqualTo( - Literal("hello"), - UnresolvedFunction.apply( - "lower", - Seq(UnresolvedAttribute(s"$testTable2.name")), - isDistinct = false))), - LessThan(Literal(10), UnresolvedAttribute(s"$testTable1.count"))) - val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test left outer join") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1| LEFT OUTER JOIN left = l right = r ON l.id = r.id $testTable2 - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val leftPlan = SubqueryAlias("l", table1) - val rightPlan = SubqueryAlias("r", table2) - val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) - val joinPlan = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition), JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test right outer join") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1| RIGHT JOIN left = l right = r ON l.id = r.id $testTable2 - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val leftPlan = SubqueryAlias("l", table1) - val rightPlan = SubqueryAlias("r", table2) - val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) - val joinPlan = Join(leftPlan, rightPlan, RightOuter, Some(joinCondition), JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test left semi join") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1| LEFT SEMI JOIN left = l right = r ON l.id = r.id $testTable2 - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val leftPlan = SubqueryAlias("l", table1) - val rightPlan = SubqueryAlias("r", table2) - val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) - val joinPlan = Join(leftPlan, rightPlan, LeftSemi, Some(joinCondition), JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test left anti join") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1| LEFT ANTI JOIN left = l right = r ON l.id = r.id $testTable2 - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val leftPlan = SubqueryAlias("l", table1) - val rightPlan = SubqueryAlias("r", table2) - val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) - val joinPlan = Join(leftPlan, rightPlan, LeftAnti, Some(joinCondition), JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test full outer join") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1| FULL JOIN left = l right = r ON l.id = r.id $testTable2 - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val leftPlan = SubqueryAlias("l", table1) - val rightPlan = SubqueryAlias("r", table2) - val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) - val joinPlan = Join(leftPlan, rightPlan, FullOuter, Some(joinCondition), JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test cross join") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1| CROSS JOIN left = l right = r $testTable2 - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val leftPlan = SubqueryAlias("l", table1) - val rightPlan = SubqueryAlias("r", table2) - val joinPlan = Join(leftPlan, rightPlan, Cross, None, JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test cross join with join condition") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1| CROSS JOIN left = l right = r ON l.id = r.id $testTable2 - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val leftPlan = SubqueryAlias("l", table1) - val rightPlan = SubqueryAlias("r", table2) - val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) - val joinPlan = Join(leftPlan, rightPlan, Cross, Some(joinCondition), JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test multiple joins") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1 - | | inner JOIN left = l right = r ON l.id = r.id $testTable2 - | | left JOIN left = l right = r ON l.name = r.name $testTable3 - | | cross JOIN left = l right = r $testTable4 - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3")) - val table4 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test4")) - var leftPlan = SubqueryAlias("l", table1) - var rightPlan = SubqueryAlias("r", table2) - val joinCondition1 = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) - val joinPlan1 = Join(leftPlan, rightPlan, Inner, Some(joinCondition1), JoinHint.NONE) - leftPlan = SubqueryAlias("l", joinPlan1) - rightPlan = SubqueryAlias("r", table3) - val joinCondition2 = EqualTo(UnresolvedAttribute("l.name"), UnresolvedAttribute("r.name")) - val joinPlan2 = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition2), JoinHint.NONE) - leftPlan = SubqueryAlias("l", joinPlan2) - rightPlan = SubqueryAlias("r", table4) - val joinPlan3 = Join(leftPlan, rightPlan, Cross, None, JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test complex join: TPC-H Q13") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | SEARCH source = $testTable1 - | | FIELDS id, name - | | LEFT OUTER JOIN left = c right = o ON c.custkey = o.custkey $testTable2 - | | STATS count(o.orderkey) AS o_count BY c.custkey - | | STATS count(1) AS custdist BY o_count - | | SORT - custdist, - o_count - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val tableC = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val tableO = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val left = SubqueryAlias( - "c", - Project(Seq(UnresolvedAttribute("id"), UnresolvedAttribute("name")), tableC)) - val right = SubqueryAlias("o", tableO) - val joinCondition = - EqualTo(UnresolvedAttribute("o.custkey"), UnresolvedAttribute("c.custkey")) - val join = Join(left, right, LeftOuter, Some(joinCondition), JoinHint.NONE) - val groupingExpression1 = Alias(UnresolvedAttribute("c.custkey"), "c.custkey")() - val aggregateExpressions1 = - Alias( - UnresolvedFunction( - Seq("COUNT"), - Seq(UnresolvedAttribute("o.orderkey")), - isDistinct = false), - "o_count")() - val agg1 = - Aggregate(Seq(groupingExpression1), Seq(aggregateExpressions1, groupingExpression1), join) - val groupingExpression2 = Alias(UnresolvedAttribute("o_count"), "o_count")() - val aggregateExpressions2 = - Alias(UnresolvedFunction(Seq("COUNT"), Seq(Literal(1)), isDistinct = false), "custdist")() - val agg2 = - Aggregate(Seq(groupingExpression2), Seq(aggregateExpressions2, groupingExpression2), agg1) - val sort = Sort( - Seq( - SortOrder(UnresolvedAttribute("custdist"), Descending), - SortOrder(UnresolvedAttribute("o_count"), Descending)), - global = true, - agg2) - val expectedPlan = Project(Seq(UnresolvedStar(None)), sort) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test inner join with relation subquery") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1| JOIN left = l right = r ON l.id = r.id - | [ - | source = $testTable2 - | | where id > 10 and name = 'abc' - | | fields id, name - | | sort id - | | head 10 - | ] - | | stats count(id) as cnt by type - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val leftPlan = SubqueryAlias("l", table1) - val rightSubquery = - GlobalLimit( - Literal(10), - LocalLimit( - Literal(10), - Sort( - Seq(SortOrder(UnresolvedAttribute("id"), Ascending)), - global = true, - Project( - Seq(UnresolvedAttribute("id"), UnresolvedAttribute("name")), - Filter( - And( - GreaterThan(UnresolvedAttribute("id"), Literal(10)), - EqualTo(UnresolvedAttribute("name"), Literal("abc"))), - table2))))) - val rightPlan = SubqueryAlias("r", rightSubquery) - val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) - val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE) - val groupingExpression = Alias(UnresolvedAttribute("type"), "type")() - val aggregateExpression = Alias( - UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedAttribute("id")), isDistinct = false), - "cnt")() - val aggPlan = - Aggregate(Seq(groupingExpression), Seq(aggregateExpression, groupingExpression), joinPlan) - val expectedPlan = Project(Seq(UnresolvedStar(None)), aggPlan) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test left outer join with relation subquery") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1| LEFT JOIN left = l right = r ON l.id = r.id - | [ - | source = $testTable2 - | | where id > 10 and name = 'abc' - | | fields id, name - | | sort id - | | head 10 - | ] - | | stats count(id) as cnt by type - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val leftPlan = SubqueryAlias("l", table1) - val rightSubquery = - GlobalLimit( - Literal(10), - LocalLimit( - Literal(10), - Sort( - Seq(SortOrder(UnresolvedAttribute("id"), Ascending)), - global = true, - Project( - Seq(UnresolvedAttribute("id"), UnresolvedAttribute("name")), - Filter( - And( - GreaterThan(UnresolvedAttribute("id"), Literal(10)), - EqualTo(UnresolvedAttribute("name"), Literal("abc"))), - table2))))) - val rightPlan = SubqueryAlias("r", rightSubquery) - val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) - val joinPlan = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition), JoinHint.NONE) - val groupingExpression = Alias(UnresolvedAttribute("type"), "type")() - val aggregateExpression = Alias( - UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedAttribute("id")), isDistinct = false), - "cnt")() - val aggPlan = - Aggregate(Seq(groupingExpression), Seq(aggregateExpression, groupingExpression), joinPlan) - val expectedPlan = Project(Seq(UnresolvedStar(None)), aggPlan) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test multiple joins with relation subquery") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1 - | | head 10 - | | inner JOIN left = l right = r ON l.id = r.id - | [ - | source = $testTable2 - | | where id > 10 - | ] - | | left JOIN left = l right = r ON l.name = r.name - | [ - | source = $testTable3 - | | fields id - | ] - | | cross JOIN left = l right = r - | [ - | source = $testTable4 - | | sort id - | ] - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3")) - val table4 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test4")) - var leftPlan = SubqueryAlias("l", GlobalLimit(Literal(10), LocalLimit(Literal(10), table1))) - var rightPlan = - SubqueryAlias("r", Filter(GreaterThan(UnresolvedAttribute("id"), Literal(10)), table2)) - val joinCondition1 = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) - val joinPlan1 = Join(leftPlan, rightPlan, Inner, Some(joinCondition1), JoinHint.NONE) - leftPlan = SubqueryAlias("l", joinPlan1) - rightPlan = SubqueryAlias("r", Project(Seq(UnresolvedAttribute("id")), table3)) - val joinCondition2 = EqualTo(UnresolvedAttribute("l.name"), UnresolvedAttribute("r.name")) - val joinPlan2 = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition2), JoinHint.NONE) - leftPlan = SubqueryAlias("l", joinPlan2) - rightPlan = SubqueryAlias( - "r", - Sort(Seq(SortOrder(UnresolvedAttribute("id"), Ascending)), global = true, table4)) - val joinPlan3 = Join(leftPlan, rightPlan, Cross, None, JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test complex join: TPC-H Q13 with relation subquery") { - // select - // c_count, - // count(*) as custdist - // from - // ( - // select - // c_custkey, - // count(o_orderkey) as c_count - // from - // customer left outer join orders on - // c_custkey = o_custkey - // and o_comment not like '%special%requests%' - // group by - // c_custkey - // ) as c_orders - // group by - // c_count - // order by - // custdist desc, - // c_count desc - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | SEARCH source = [ - | SEARCH source = customer - | | LEFT OUTER JOIN left = c right = o ON c_custkey = o_custkey - | [ - | SEARCH source = orders - | | WHERE not like(o_comment, '%special%requests%') - | ] - | | STATS COUNT(o_orderkey) AS c_count BY c_custkey - | ] AS c_orders - | | STATS COUNT(o_orderkey) AS c_count BY c_custkey - | | STATS COUNT(1) AS custdist BY c_count - | | SORT - custdist, - c_count - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val tableC = UnresolvedRelation(Seq("customer")) - val tableO = UnresolvedRelation(Seq("orders")) - val left = SubqueryAlias("c", tableC) - val filterNot = Filter( - Not( - UnresolvedFunction( - Seq("like"), - Seq(UnresolvedAttribute("o_comment"), Literal("%special%requests%")), - isDistinct = false)), - tableO) - val right = SubqueryAlias("o", filterNot) - val joinCondition = - EqualTo(UnresolvedAttribute("o_custkey"), UnresolvedAttribute("c_custkey")) - val join = Join(left, right, LeftOuter, Some(joinCondition), JoinHint.NONE) - val groupingExpression1 = Alias(UnresolvedAttribute("c_custkey"), "c_custkey")() - val aggregateExpressions1 = - Alias( - UnresolvedFunction( - Seq("COUNT"), - Seq(UnresolvedAttribute("o_orderkey")), - isDistinct = false), - "c_count")() - val agg3 = - Aggregate(Seq(groupingExpression1), Seq(aggregateExpressions1, groupingExpression1), join) - val subqueryAlias = SubqueryAlias("c_orders", agg3) - val agg2 = - Aggregate( - Seq(groupingExpression1), - Seq(aggregateExpressions1, groupingExpression1), - subqueryAlias) - val groupingExpression2 = Alias(UnresolvedAttribute("c_count"), "c_count")() - val aggregateExpressions2 = - Alias(UnresolvedFunction(Seq("COUNT"), Seq(Literal(1)), isDistinct = false), "custdist")() - val agg1 = - Aggregate(Seq(groupingExpression2), Seq(aggregateExpressions2, groupingExpression2), agg2) - val sort = Sort( - Seq( - SortOrder(UnresolvedAttribute("custdist"), Descending), - SortOrder(UnresolvedAttribute("c_count"), Descending)), - global = true, - agg1) - val expectedPlan = Project(Seq(UnresolvedStar(None)), sort) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test multiple joins with table alias") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = table1 as t1 - | | JOIN ON t1.id = t2.id - | [ - | source = table2 as t2 - | ] - | | JOIN ON t2.id = t3.id - | [ - | source = table3 as t3 - | ] - | | JOIN ON t3.id = t4.id - | [ - | source = table4 as t4 - | ] - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("table1")) - val table2 = UnresolvedRelation(Seq("table2")) - val table3 = UnresolvedRelation(Seq("table3")) - val table4 = UnresolvedRelation(Seq("table4")) - val joinPlan1 = Join( - SubqueryAlias("t1", table1), - SubqueryAlias("t2", table2), - Inner, - Some(EqualTo(UnresolvedAttribute("t1.id"), UnresolvedAttribute("t2.id"))), - JoinHint.NONE) - val joinPlan2 = Join( - joinPlan1, - SubqueryAlias("t3", table3), - Inner, - Some(EqualTo(UnresolvedAttribute("t2.id"), UnresolvedAttribute("t3.id"))), - JoinHint.NONE) - val joinPlan3 = Join( - joinPlan2, - SubqueryAlias("t4", table4), - Inner, - Some(EqualTo(UnresolvedAttribute("t3.id"), UnresolvedAttribute("t4.id"))), - JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test multiple joins with table and subquery alias") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = table1 as t1 - | | JOIN left = l right = r ON t1.id = t2.id - | [ - | source = table2 as t2 - | ] - | | JOIN left = l right = r ON t2.id = t3.id - | [ - | source = table3 as t3 - | ] - | | JOIN left = l right = r ON t3.id = t4.id - | [ - | source = table4 as t4 - | ] - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("table1")) - val table2 = UnresolvedRelation(Seq("table2")) - val table3 = UnresolvedRelation(Seq("table3")) - val table4 = UnresolvedRelation(Seq("table4")) - val joinPlan1 = Join( - SubqueryAlias("l", SubqueryAlias("t1", table1)), - SubqueryAlias("r", SubqueryAlias("t2", table2)), - Inner, - Some(EqualTo(UnresolvedAttribute("t1.id"), UnresolvedAttribute("t2.id"))), - JoinHint.NONE) - val joinPlan2 = Join( - SubqueryAlias("l", joinPlan1), - SubqueryAlias("r", SubqueryAlias("t3", table3)), - Inner, - Some(EqualTo(UnresolvedAttribute("t2.id"), UnresolvedAttribute("t3.id"))), - JoinHint.NONE) - val joinPlan3 = Join( - SubqueryAlias("l", joinPlan2), - SubqueryAlias("r", SubqueryAlias("t4", table4)), - Inner, - Some(EqualTo(UnresolvedAttribute("t3.id"), UnresolvedAttribute("t4.id"))), - JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test multiple joins without table aliases") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = table1 - | | JOIN ON table1.id = table2.id table2 - | | JOIN ON table1.id = table3.id table3 - | | JOIN ON table2.id = table4.id table4 - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("table1")) - val table2 = UnresolvedRelation(Seq("table2")) - val table3 = UnresolvedRelation(Seq("table3")) - val table4 = UnresolvedRelation(Seq("table4")) - val joinPlan1 = Join( - table1, - table2, - Inner, - Some(EqualTo(UnresolvedAttribute("table1.id"), UnresolvedAttribute("table2.id"))), - JoinHint.NONE) - val joinPlan2 = Join( - joinPlan1, - table3, - Inner, - Some(EqualTo(UnresolvedAttribute("table1.id"), UnresolvedAttribute("table3.id"))), - JoinHint.NONE) - val joinPlan3 = Join( - joinPlan2, - table4, - Inner, - Some(EqualTo(UnresolvedAttribute("table2.id"), UnresolvedAttribute("table4.id"))), - JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test multiple joins with part subquery aliases") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = table1 - | | JOIN left = t1 right = t2 ON t1.name = t2.name table2 - | | JOIN right = t3 ON t1.name = t3.name table3 - | | JOIN right = t4 ON t2.name = t4.name table4 - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("table1")) - val table2 = UnresolvedRelation(Seq("table2")) - val table3 = UnresolvedRelation(Seq("table3")) - val table4 = UnresolvedRelation(Seq("table4")) - val joinPlan1 = Join( - SubqueryAlias("t1", table1), - SubqueryAlias("t2", table2), - Inner, - Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))), - JoinHint.NONE) - val joinPlan2 = Join( - joinPlan1, - SubqueryAlias("t3", table3), - Inner, - Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t3.name"))), - JoinHint.NONE) - val joinPlan3 = Join( - joinPlan2, - SubqueryAlias("t4", table4), - Inner, - Some(EqualTo(UnresolvedAttribute("t2.name"), UnresolvedAttribute("t4.name"))), - JoinHint.NONE) - val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test multiple joins with self join 1") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1 - | | JOIN left = t1 right = t2 ON t1.name = t2.name $testTable2 - | | JOIN right = t3 ON t1.name = t3.name $testTable3 - | | JOIN right = t4 ON t1.name = t4.name $testTable1 - | | fields t1.name, t2.name, t3.name, t4.name - | """.stripMargin) - - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3")) - val joinPlan1 = Join( - SubqueryAlias("t1", table1), - SubqueryAlias("t2", table2), - Inner, - Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))), - JoinHint.NONE) - val joinPlan2 = Join( - joinPlan1, - SubqueryAlias("t3", table3), - Inner, - Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t3.name"))), - JoinHint.NONE) - val joinPlan3 = Join( - joinPlan2, - SubqueryAlias("t4", table1), - Inner, - Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t4.name"))), - JoinHint.NONE) - val expectedPlan = Project( - Seq( - UnresolvedAttribute("t1.name"), - UnresolvedAttribute("t2.name"), - UnresolvedAttribute("t3.name"), - UnresolvedAttribute("t4.name")), - joinPlan3) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test multiple joins with self join 2") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1 - | | JOIN left = t1 right = t2 ON t1.name = t2.name $testTable2 - | | JOIN right = t3 ON t1.name = t3.name $testTable3 - | | JOIN ON t1.name = t4.name - | [ - | source = $testTable1 - | ] as t4 - | | fields t1.name, t2.name, t3.name, t4.name - | """.stripMargin) - - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3")) - val joinPlan1 = Join( - SubqueryAlias("t1", table1), - SubqueryAlias("t2", table2), - Inner, - Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))), - JoinHint.NONE) - val joinPlan2 = Join( - joinPlan1, - SubqueryAlias("t3", table3), - Inner, - Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t3.name"))), - JoinHint.NONE) - val joinPlan3 = Join( - joinPlan2, - SubqueryAlias("t4", table1), - Inner, - Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t4.name"))), - JoinHint.NONE) - val expectedPlan = Project( - Seq( - UnresolvedAttribute("t1.name"), - UnresolvedAttribute("t2.name"), - UnresolvedAttribute("t3.name"), - UnresolvedAttribute("t4.name")), - joinPlan3) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } - - test("test side alias will override the subquery alias") { - val context = new CatalystPlanContext - val logPlan = plan( - pplParser, - s""" - | source = $testTable1 - | | JOIN left = t1 right = t2 ON t1.name = t2.name [ source = $testTable2 as ttt ] as tt - | | fields t1.name, t2.name - | """.stripMargin) - val logicalPlan = planTransformer.visit(logPlan, context) - val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) - val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) - val joinPlan1 = Join( - SubqueryAlias("t1", table1), - SubqueryAlias("t2", SubqueryAlias("tt", SubqueryAlias("ttt", table2))), - Inner, - Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))), - JoinHint.NONE) - val expectedPlan = - Project(Seq(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name")), joinPlan1) - comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) - } +// +// test("test two-tables inner join: join condition with table names") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| JOIN left = l right = r ON $testTable1.id = $testTable2.id $testTable2 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightPlan = SubqueryAlias("r", table2) +// val joinCondition = +// EqualTo(UnresolvedAttribute(s"$testTable1.id"), UnresolvedAttribute(s"$testTable2.id")) +// val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test inner join: join condition without prefix") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| JOIN left = l right = r ON id = name $testTable2 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightPlan = SubqueryAlias("r", table2) +// val joinCondition = +// EqualTo(UnresolvedAttribute("id"), UnresolvedAttribute("name")) +// val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test inner join: join condition with aliases and predicates") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| JOIN left = l right = r ON l.id = r.id AND l.count > 10 AND lower(r.name) = 'hello' $testTable2 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightPlan = SubqueryAlias("r", table2) +// val joinCondition = And( +// And( +// EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")), +// EqualTo( +// Literal("hello"), +// UnresolvedFunction.apply( +// "lower", +// Seq(UnresolvedAttribute("r.name")), +// isDistinct = false))), +// LessThan(Literal(10), UnresolvedAttribute("l.count"))) +// val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test inner join: join condition with table names and predicates") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| INNER JOIN left = l right = r ON $testTable1.id = $testTable2.id AND $testTable1.count > 10 AND lower($testTable2.name) = 'hello' $testTable2 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightPlan = SubqueryAlias("r", table2) +// val joinCondition = And( +// And( +// EqualTo(UnresolvedAttribute(s"$testTable1.id"), UnresolvedAttribute(s"$testTable2.id")), +// EqualTo( +// Literal("hello"), +// UnresolvedFunction.apply( +// "lower", +// Seq(UnresolvedAttribute(s"$testTable2.name")), +// isDistinct = false))), +// LessThan(Literal(10), UnresolvedAttribute(s"$testTable1.count"))) +// val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test left outer join") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| LEFT OUTER JOIN left = l right = r ON l.id = r.id $testTable2 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightPlan = SubqueryAlias("r", table2) +// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) +// val joinPlan = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition), JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test right outer join") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| RIGHT JOIN left = l right = r ON l.id = r.id $testTable2 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightPlan = SubqueryAlias("r", table2) +// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) +// val joinPlan = Join(leftPlan, rightPlan, RightOuter, Some(joinCondition), JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test left semi join") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| LEFT SEMI JOIN left = l right = r ON l.id = r.id $testTable2 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightPlan = SubqueryAlias("r", table2) +// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) +// val joinPlan = Join(leftPlan, rightPlan, LeftSemi, Some(joinCondition), JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test left anti join") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| LEFT ANTI JOIN left = l right = r ON l.id = r.id $testTable2 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightPlan = SubqueryAlias("r", table2) +// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) +// val joinPlan = Join(leftPlan, rightPlan, LeftAnti, Some(joinCondition), JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test full outer join") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| FULL JOIN left = l right = r ON l.id = r.id $testTable2 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightPlan = SubqueryAlias("r", table2) +// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) +// val joinPlan = Join(leftPlan, rightPlan, FullOuter, Some(joinCondition), JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test cross join") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| CROSS JOIN left = l right = r $testTable2 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightPlan = SubqueryAlias("r", table2) +// val joinPlan = Join(leftPlan, rightPlan, Cross, None, JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test cross join with join condition") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| CROSS JOIN left = l right = r ON l.id = r.id $testTable2 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightPlan = SubqueryAlias("r", table2) +// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) +// val joinPlan = Join(leftPlan, rightPlan, Cross, Some(joinCondition), JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test multiple joins") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1 +// | | inner JOIN left = l right = r ON l.id = r.id $testTable2 +// | | left JOIN left = l right = r ON l.name = r.name $testTable3 +// | | cross JOIN left = l right = r $testTable4 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3")) +// val table4 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test4")) +// var leftPlan = SubqueryAlias("l", table1) +// var rightPlan = SubqueryAlias("r", table2) +// val joinCondition1 = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) +// val joinPlan1 = Join(leftPlan, rightPlan, Inner, Some(joinCondition1), JoinHint.NONE) +// leftPlan = SubqueryAlias("l", joinPlan1) +// rightPlan = SubqueryAlias("r", table3) +// val joinCondition2 = EqualTo(UnresolvedAttribute("l.name"), UnresolvedAttribute("r.name")) +// val joinPlan2 = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition2), JoinHint.NONE) +// leftPlan = SubqueryAlias("l", joinPlan2) +// rightPlan = SubqueryAlias("r", table4) +// val joinPlan3 = Join(leftPlan, rightPlan, Cross, None, JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test complex join: TPC-H Q13") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | SEARCH source = $testTable1 +// | | FIELDS id, name +// | | LEFT OUTER JOIN left = c right = o ON c.custkey = o.custkey $testTable2 +// | | STATS count(o.orderkey) AS o_count BY c.custkey +// | | STATS count(1) AS custdist BY o_count +// | | SORT - custdist, - o_count +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val tableC = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val tableO = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val left = SubqueryAlias( +// "c", +// Project(Seq(UnresolvedAttribute("id"), UnresolvedAttribute("name")), tableC)) +// val right = SubqueryAlias("o", tableO) +// val joinCondition = +// EqualTo(UnresolvedAttribute("o.custkey"), UnresolvedAttribute("c.custkey")) +// val join = Join(left, right, LeftOuter, Some(joinCondition), JoinHint.NONE) +// val groupingExpression1 = Alias(UnresolvedAttribute("c.custkey"), "c.custkey")() +// val aggregateExpressions1 = +// Alias( +// UnresolvedFunction( +// Seq("COUNT"), +// Seq(UnresolvedAttribute("o.orderkey")), +// isDistinct = false), +// "o_count")() +// val agg1 = +// Aggregate(Seq(groupingExpression1), Seq(aggregateExpressions1, groupingExpression1), join) +// val groupingExpression2 = Alias(UnresolvedAttribute("o_count"), "o_count")() +// val aggregateExpressions2 = +// Alias(UnresolvedFunction(Seq("COUNT"), Seq(Literal(1)), isDistinct = false), "custdist")() +// val agg2 = +// Aggregate(Seq(groupingExpression2), Seq(aggregateExpressions2, groupingExpression2), agg1) +// val sort = Sort( +// Seq( +// SortOrder(UnresolvedAttribute("custdist"), Descending), +// SortOrder(UnresolvedAttribute("o_count"), Descending)), +// global = true, +// agg2) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), sort) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test inner join with relation subquery") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| JOIN left = l right = r ON l.id = r.id +// | [ +// | source = $testTable2 +// | | where id > 10 and name = 'abc' +// | | fields id, name +// | | sort id +// | | head 10 +// | ] +// | | stats count(id) as cnt by type +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightSubquery = +// GlobalLimit( +// Literal(10), +// LocalLimit( +// Literal(10), +// Sort( +// Seq(SortOrder(UnresolvedAttribute("id"), Ascending)), +// global = true, +// Project( +// Seq(UnresolvedAttribute("id"), UnresolvedAttribute("name")), +// Filter( +// And( +// GreaterThan(UnresolvedAttribute("id"), Literal(10)), +// EqualTo(UnresolvedAttribute("name"), Literal("abc"))), +// table2))))) +// val rightPlan = SubqueryAlias("r", rightSubquery) +// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) +// val joinPlan = Join(leftPlan, rightPlan, Inner, Some(joinCondition), JoinHint.NONE) +// val groupingExpression = Alias(UnresolvedAttribute("type"), "type")() +// val aggregateExpression = Alias( +// UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedAttribute("id")), isDistinct = false), +// "cnt")() +// val aggPlan = +// Aggregate(Seq(groupingExpression), Seq(aggregateExpression, groupingExpression), joinPlan) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), aggPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test left outer join with relation subquery") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1| LEFT JOIN left = l right = r ON l.id = r.id +// | [ +// | source = $testTable2 +// | | where id > 10 and name = 'abc' +// | | fields id, name +// | | sort id +// | | head 10 +// | ] +// | | stats count(id) as cnt by type +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val leftPlan = SubqueryAlias("l", table1) +// val rightSubquery = +// GlobalLimit( +// Literal(10), +// LocalLimit( +// Literal(10), +// Sort( +// Seq(SortOrder(UnresolvedAttribute("id"), Ascending)), +// global = true, +// Project( +// Seq(UnresolvedAttribute("id"), UnresolvedAttribute("name")), +// Filter( +// And( +// GreaterThan(UnresolvedAttribute("id"), Literal(10)), +// EqualTo(UnresolvedAttribute("name"), Literal("abc"))), +// table2))))) +// val rightPlan = SubqueryAlias("r", rightSubquery) +// val joinCondition = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) +// val joinPlan = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition), JoinHint.NONE) +// val groupingExpression = Alias(UnresolvedAttribute("type"), "type")() +// val aggregateExpression = Alias( +// UnresolvedFunction(Seq("COUNT"), Seq(UnresolvedAttribute("id")), isDistinct = false), +// "cnt")() +// val aggPlan = +// Aggregate(Seq(groupingExpression), Seq(aggregateExpression, groupingExpression), joinPlan) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), aggPlan) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test multiple joins with relation subquery") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1 +// | | head 10 +// | | inner JOIN left = l right = r ON l.id = r.id +// | [ +// | source = $testTable2 +// | | where id > 10 +// | ] +// | | left JOIN left = l right = r ON l.name = r.name +// | [ +// | source = $testTable3 +// | | fields id +// | ] +// | | cross JOIN left = l right = r +// | [ +// | source = $testTable4 +// | | sort id +// | ] +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3")) +// val table4 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test4")) +// var leftPlan = SubqueryAlias("l", GlobalLimit(Literal(10), LocalLimit(Literal(10), table1))) +// var rightPlan = +// SubqueryAlias("r", Filter(GreaterThan(UnresolvedAttribute("id"), Literal(10)), table2)) +// val joinCondition1 = EqualTo(UnresolvedAttribute("l.id"), UnresolvedAttribute("r.id")) +// val joinPlan1 = Join(leftPlan, rightPlan, Inner, Some(joinCondition1), JoinHint.NONE) +// leftPlan = SubqueryAlias("l", joinPlan1) +// rightPlan = SubqueryAlias("r", Project(Seq(UnresolvedAttribute("id")), table3)) +// val joinCondition2 = EqualTo(UnresolvedAttribute("l.name"), UnresolvedAttribute("r.name")) +// val joinPlan2 = Join(leftPlan, rightPlan, LeftOuter, Some(joinCondition2), JoinHint.NONE) +// leftPlan = SubqueryAlias("l", joinPlan2) +// rightPlan = SubqueryAlias( +// "r", +// Sort(Seq(SortOrder(UnresolvedAttribute("id"), Ascending)), global = true, table4)) +// val joinPlan3 = Join(leftPlan, rightPlan, Cross, None, JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test complex join: TPC-H Q13 with relation subquery") { +// // select +// // c_count, +// // count(*) as custdist +// // from +// // ( +// // select +// // c_custkey, +// // count(o_orderkey) as c_count +// // from +// // customer left outer join orders on +// // c_custkey = o_custkey +// // and o_comment not like '%special%requests%' +// // group by +// // c_custkey +// // ) as c_orders +// // group by +// // c_count +// // order by +// // custdist desc, +// // c_count desc +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | SEARCH source = [ +// | SEARCH source = customer +// | | LEFT OUTER JOIN left = c right = o ON c_custkey = o_custkey +// | [ +// | SEARCH source = orders +// | | WHERE not like(o_comment, '%special%requests%') +// | ] +// | | STATS COUNT(o_orderkey) AS c_count BY c_custkey +// | ] AS c_orders +// | | STATS COUNT(o_orderkey) AS c_count BY c_custkey +// | | STATS COUNT(1) AS custdist BY c_count +// | | SORT - custdist, - c_count +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val tableC = UnresolvedRelation(Seq("customer")) +// val tableO = UnresolvedRelation(Seq("orders")) +// val left = SubqueryAlias("c", tableC) +// val filterNot = Filter( +// Not( +// UnresolvedFunction( +// Seq("like"), +// Seq(UnresolvedAttribute("o_comment"), Literal("%special%requests%")), +// isDistinct = false)), +// tableO) +// val right = SubqueryAlias("o", filterNot) +// val joinCondition = +// EqualTo(UnresolvedAttribute("o_custkey"), UnresolvedAttribute("c_custkey")) +// val join = Join(left, right, LeftOuter, Some(joinCondition), JoinHint.NONE) +// val groupingExpression1 = Alias(UnresolvedAttribute("c_custkey"), "c_custkey")() +// val aggregateExpressions1 = +// Alias( +// UnresolvedFunction( +// Seq("COUNT"), +// Seq(UnresolvedAttribute("o_orderkey")), +// isDistinct = false), +// "c_count")() +// val agg3 = +// Aggregate(Seq(groupingExpression1), Seq(aggregateExpressions1, groupingExpression1), join) +// val subqueryAlias = SubqueryAlias("c_orders", agg3) +// val agg2 = +// Aggregate( +// Seq(groupingExpression1), +// Seq(aggregateExpressions1, groupingExpression1), +// subqueryAlias) +// val groupingExpression2 = Alias(UnresolvedAttribute("c_count"), "c_count")() +// val aggregateExpressions2 = +// Alias(UnresolvedFunction(Seq("COUNT"), Seq(Literal(1)), isDistinct = false), "custdist")() +// val agg1 = +// Aggregate(Seq(groupingExpression2), Seq(aggregateExpressions2, groupingExpression2), agg2) +// val sort = Sort( +// Seq( +// SortOrder(UnresolvedAttribute("custdist"), Descending), +// SortOrder(UnresolvedAttribute("c_count"), Descending)), +// global = true, +// agg1) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), sort) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test multiple joins with table alias") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = table1 as t1 +// | | JOIN ON t1.id = t2.id +// | [ +// | source = table2 as t2 +// | ] +// | | JOIN ON t2.id = t3.id +// | [ +// | source = table3 as t3 +// | ] +// | | JOIN ON t3.id = t4.id +// | [ +// | source = table4 as t4 +// | ] +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("table1")) +// val table2 = UnresolvedRelation(Seq("table2")) +// val table3 = UnresolvedRelation(Seq("table3")) +// val table4 = UnresolvedRelation(Seq("table4")) +// val joinPlan1 = Join( +// SubqueryAlias("t1", table1), +// SubqueryAlias("t2", table2), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t1.id"), UnresolvedAttribute("t2.id"))), +// JoinHint.NONE) +// val joinPlan2 = Join( +// joinPlan1, +// SubqueryAlias("t3", table3), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t2.id"), UnresolvedAttribute("t3.id"))), +// JoinHint.NONE) +// val joinPlan3 = Join( +// joinPlan2, +// SubqueryAlias("t4", table4), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t3.id"), UnresolvedAttribute("t4.id"))), +// JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test multiple joins with table and subquery alias") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = table1 as t1 +// | | JOIN left = l right = r ON t1.id = t2.id +// | [ +// | source = table2 as t2 +// | ] +// | | JOIN left = l right = r ON t2.id = t3.id +// | [ +// | source = table3 as t3 +// | ] +// | | JOIN left = l right = r ON t3.id = t4.id +// | [ +// | source = table4 as t4 +// | ] +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("table1")) +// val table2 = UnresolvedRelation(Seq("table2")) +// val table3 = UnresolvedRelation(Seq("table3")) +// val table4 = UnresolvedRelation(Seq("table4")) +// val joinPlan1 = Join( +// SubqueryAlias("l", SubqueryAlias("t1", table1)), +// SubqueryAlias("r", SubqueryAlias("t2", table2)), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t1.id"), UnresolvedAttribute("t2.id"))), +// JoinHint.NONE) +// val joinPlan2 = Join( +// SubqueryAlias("l", joinPlan1), +// SubqueryAlias("r", SubqueryAlias("t3", table3)), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t2.id"), UnresolvedAttribute("t3.id"))), +// JoinHint.NONE) +// val joinPlan3 = Join( +// SubqueryAlias("l", joinPlan2), +// SubqueryAlias("r", SubqueryAlias("t4", table4)), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t3.id"), UnresolvedAttribute("t4.id"))), +// JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test multiple joins without table aliases") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = table1 +// | | JOIN ON table1.id = table2.id table2 +// | | JOIN ON table1.id = table3.id table3 +// | | JOIN ON table2.id = table4.id table4 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("table1")) +// val table2 = UnresolvedRelation(Seq("table2")) +// val table3 = UnresolvedRelation(Seq("table3")) +// val table4 = UnresolvedRelation(Seq("table4")) +// val joinPlan1 = Join( +// table1, +// table2, +// Inner, +// Some(EqualTo(UnresolvedAttribute("table1.id"), UnresolvedAttribute("table2.id"))), +// JoinHint.NONE) +// val joinPlan2 = Join( +// joinPlan1, +// table3, +// Inner, +// Some(EqualTo(UnresolvedAttribute("table1.id"), UnresolvedAttribute("table3.id"))), +// JoinHint.NONE) +// val joinPlan3 = Join( +// joinPlan2, +// table4, +// Inner, +// Some(EqualTo(UnresolvedAttribute("table2.id"), UnresolvedAttribute("table4.id"))), +// JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test multiple joins with part subquery aliases") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = table1 +// | | JOIN left = t1 right = t2 ON t1.name = t2.name table2 +// | | JOIN right = t3 ON t1.name = t3.name table3 +// | | JOIN right = t4 ON t2.name = t4.name table4 +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("table1")) +// val table2 = UnresolvedRelation(Seq("table2")) +// val table3 = UnresolvedRelation(Seq("table3")) +// val table4 = UnresolvedRelation(Seq("table4")) +// val joinPlan1 = Join( +// SubqueryAlias("t1", table1), +// SubqueryAlias("t2", table2), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))), +// JoinHint.NONE) +// val joinPlan2 = Join( +// joinPlan1, +// SubqueryAlias("t3", table3), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t3.name"))), +// JoinHint.NONE) +// val joinPlan3 = Join( +// joinPlan2, +// SubqueryAlias("t4", table4), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t2.name"), UnresolvedAttribute("t4.name"))), +// JoinHint.NONE) +// val expectedPlan = Project(Seq(UnresolvedStar(None)), joinPlan3) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test multiple joins with self join 1") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1 +// | | JOIN left = t1 right = t2 ON t1.name = t2.name $testTable2 +// | | JOIN right = t3 ON t1.name = t3.name $testTable3 +// | | JOIN right = t4 ON t1.name = t4.name $testTable1 +// | | fields t1.name, t2.name, t3.name, t4.name +// | """.stripMargin) +// +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3")) +// val joinPlan1 = Join( +// SubqueryAlias("t1", table1), +// SubqueryAlias("t2", table2), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))), +// JoinHint.NONE) +// val joinPlan2 = Join( +// joinPlan1, +// SubqueryAlias("t3", table3), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t3.name"))), +// JoinHint.NONE) +// val joinPlan3 = Join( +// joinPlan2, +// SubqueryAlias("t4", table1), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t4.name"))), +// JoinHint.NONE) +// val expectedPlan = Project( +// Seq( +// UnresolvedAttribute("t1.name"), +// UnresolvedAttribute("t2.name"), +// UnresolvedAttribute("t3.name"), +// UnresolvedAttribute("t4.name")), +// joinPlan3) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test multiple joins with self join 2") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1 +// | | JOIN left = t1 right = t2 ON t1.name = t2.name $testTable2 +// | | JOIN right = t3 ON t1.name = t3.name $testTable3 +// | | JOIN ON t1.name = t4.name +// | [ +// | source = $testTable1 +// | ] as t4 +// | | fields t1.name, t2.name, t3.name, t4.name +// | """.stripMargin) +// +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val table3 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test3")) +// val joinPlan1 = Join( +// SubqueryAlias("t1", table1), +// SubqueryAlias("t2", table2), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))), +// JoinHint.NONE) +// val joinPlan2 = Join( +// joinPlan1, +// SubqueryAlias("t3", table3), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t3.name"))), +// JoinHint.NONE) +// val joinPlan3 = Join( +// joinPlan2, +// SubqueryAlias("t4", table1), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t4.name"))), +// JoinHint.NONE) +// val expectedPlan = Project( +// Seq( +// UnresolvedAttribute("t1.name"), +// UnresolvedAttribute("t2.name"), +// UnresolvedAttribute("t3.name"), +// UnresolvedAttribute("t4.name")), +// joinPlan3) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } +// +// test("test side alias will override the subquery alias") { +// val context = new CatalystPlanContext +// val logPlan = plan( +// pplParser, +// s""" +// | source = $testTable1 +// | | JOIN left = t1 right = t2 ON t1.name = t2.name [ source = $testTable2 as ttt ] as tt +// | | fields t1.name, t2.name +// | """.stripMargin) +// val logicalPlan = planTransformer.visit(logPlan, context) +// val table1 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test1")) +// val table2 = UnresolvedRelation(Seq("spark_catalog", "default", "flint_ppl_test2")) +// val joinPlan1 = Join( +// SubqueryAlias("t1", table1), +// SubqueryAlias("t2", SubqueryAlias("tt", SubqueryAlias("ttt", table2))), +// Inner, +// Some(EqualTo(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name"))), +// JoinHint.NONE) +// val expectedPlan = +// Project(Seq(UnresolvedAttribute("t1.name"), UnresolvedAttribute("t2.name")), joinPlan1) +// comparePlans(expectedPlan, logicalPlan, checkAnalysis = false) +// } } diff --git a/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala b/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala index ad26cf21a..63c120a2c 100644 --- a/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala +++ b/spark-sql-application/src/main/scala/org/apache/spark/sql/FlintJobExecutor.scala @@ -168,12 +168,10 @@ trait FlintJobExecutor { IRestHighLevelClient.recordOperationSuccess( MetricConstants.RESULT_METADATA_WRITE_METRIC_PREFIX) } catch { - case t: Throwable => + case e: Exception => IRestHighLevelClient.recordOperationFailure( MetricConstants.RESULT_METADATA_WRITE_METRIC_PREFIX, - t) - // Re-throw the exception - throw t + e) } } @@ -452,8 +450,7 @@ trait FlintJobExecutor { statusCode.foreach(code => errorDetails.put("StatusCode", code.toString)) val errorJson = mapper.writeValueAsString(errorDetails) - // Record the processed error message - throwableHandler.setError(errorJson) + // CustomLogging will call log4j logger.error() underneath statusCode match { case Some(code) =>