Commit: Merge branch 'main' into ppl-spark-join-command
# Conflicts:
#	build.sbt
#	integ-test/src/test/scala/org/opensearch/flint/spark/LogicalPlanTestUtils.scala
#	ppl-spark-integration/README.md
#	ppl-spark-integration/src/main/antlr4/OpenSearchPPLLexer.g4
#	ppl-spark-integration/src/main/antlr4/OpenSearchPPLParser.g4
#	ppl-spark-integration/src/main/java/org/opensearch/sql/ast/AbstractNodeVisitor.java
#	ppl-spark-integration/src/main/java/org/opensearch/sql/ast/tree/Project.java
#	ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystPlanContext.java
#	ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/CatalystQueryPlanVisitor.java
#	ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstBuilder.java
#	ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/parser/AstExpressionBuilder.java
#	ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/AggregatorTranslator.java
#	ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/ComparatorTransformer.java
#	ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/DataTypeTransformer.java
#	ppl-spark-integration/src/main/java/org/opensearch/sql/ppl/utils/SortUtils.java
#	ppl-spark-integration/src/main/scala/org/opensearch/flint/spark/ppl/FlintSparkPPLParser.scala
#	ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/LogicalPlanTestUtils.scala
#	ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalAdvancedTranslatorTestSuite.scala
#	ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanAggregationQueriesTranslatorTestSuite.scala
#	ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanBasicQueriesTranslatorTestSuite.scala
#	ppl-spark-integration/src/test/scala/org/opensearch/flint/spark/ppl/PPLLogicalPlanFiltersTranslatorTestSuite.scala
YANG-DB committed Oct 6, 2023
2 parents aaa4831 + 6e7c88a commit aee86d6
Showing 78 changed files with 6,109 additions and 1,289 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/backport.yml
@@ -22,7 +22,7 @@ jobs:
installation_id: 22958780

- name: Backport
uses: VachaShah/backport@v2.1.0
uses: VachaShah/backport@v2.2.0
with:
github_token: ${{ steps.github_app_token.outputs.token }}
head_template: backport/backport-<%= number %>-to-<%= base %>
1 change: 1 addition & 0 deletions .github/workflows/snapshot-publish.yml
@@ -29,6 +29,7 @@ jobs:
- name: Publish to Local Maven
run: |
sbt standaloneCosmetic/publishM2
sbt sparkPPLCosmetic/publishM2
sbt sparkSqlApplicationCosmetic/publishM2
- uses: actions/checkout@v3
1 change: 1 addition & 0 deletions .scalafmt.conf
@@ -0,0 +1 @@
version = 2.7.5
2 changes: 1 addition & 1 deletion DEVELOPER_GUIDE.md
@@ -22,7 +22,7 @@ sbt scalafmtAll
```
The code style is automatically checked, but users can also manually check it.
```
sbt sbt scalastyle
sbt scalastyle
```
For IntelliJ user, read more in [scalafmt IntelliJ](https://scalameta.org/scalafmt/docs/installation.html#intellij) to integrate
scalafmt with IntelliJ
29 changes: 28 additions & 1 deletion README.md
@@ -4,10 +4,12 @@ OpenSearch Flint is ... It consists of two modules:

- `flint-core`: a module that contains Flint specification and client.
- `flint-spark-integration`: a module that provides Spark integration for Flint and derived dataset based on it.
- `ppl-spark-integration`: a module that provides PPL query execution on top of Spark. See the [PPL repository](https://github.com/opensearch-project/piped-processing-language).

## Documentation

Please refer to the [Flint Index Reference Manual](./docs/index.md) for more information.
For the PPL language, see the [PPL Reference Manual](https://github.com/opensearch-project/sql/blob/main/docs/user/ppl/index.rst).

## Prerequisites

@@ -17,14 +19,27 @@ Version compatibility:
|---------------|-------------|---------------|---------------|------------|
| 0.1.0 | 11+ | 3.3.1 | 2.12.14 | 2.6+ |

## Usage
## Flint Extension Usage

To use this application, run Spark with the Flint extension:

```
spark-sql --conf "spark.sql.extensions=org.opensearch.flint.FlintSparkExtensions"
```

## PPL Extension Usage

To use the PPL-to-Spark translation, run Spark with the PPL extension:

```
spark-sql --conf "spark.sql.extensions=org.opensearch.flint.FlintPPLSparkExtensions"
```

### Running With Both Extensions
```
spark-sql --conf "spark.sql.extensions='org.opensearch.flint.spark.FlintPPLSparkExtensions, org.opensearch.flint.spark.FlintSparkExtensions'"
```

## Build

To build and run this application with Spark, you can run:
@@ -37,6 +52,18 @@ then add org.opensearch:opensearch-spark_2.12 when run spark application, for ex
bin/spark-shell --packages "org.opensearch:opensearch-spark_2.12:0.1.0-SNAPSHOT"
```

### PPL Build & Run

To build and run PPL in Spark, you can run:

```
sbt clean sparkPPLCosmetic/publishM2
```
then add `org.opensearch:opensearch-spark-ppl_2.12` when running your Spark application, for example:
```
bin/spark-shell --packages "org.opensearch:opensearch-spark-ppl_2.12:0.1.0-SNAPSHOT"
```

## Code of Conduct

This project has adopted an [Open Source Code of Conduct](./CODE_OF_CONDUCT.md).
43 changes: 38 additions & 5 deletions build.sbt
@@ -68,8 +68,6 @@ lazy val pplSparkIntegration = (project in file("ppl-spark-integration"))
name := "ppl-spark-integration",
scalaVersion := scala212,
libraryDependencies ++= Seq(
"com.amazonaws" % "aws-java-sdk" % "1.12.397" % "provided"
exclude ("com.fasterxml.jackson.core", "jackson-databind"),
"org.scalactic" %% "scalactic" % "3.2.15" % "test",
"org.scalatest" %% "scalatest" % "3.2.15" % "test",
"org.scalatest" %% "scalatest-flatspec" % "3.2.15" % "test",
@@ -100,7 +98,7 @@ lazy val pplSparkIntegration = (project in file("ppl-spark-integration"))
assembly / test := (Test / test).value)

lazy val flintSparkIntegration = (project in file("flint-spark-integration"))
.dependsOn(flintCore, pplSparkIntegration)
.dependsOn(flintCore)
.enablePlugins(AssemblyPlugin, Antlr4Plugin)
.settings(
commonSettings,
@@ -151,7 +149,10 @@ lazy val integtest = (project in file("integ-test"))
"org.scalactic" %% "scalactic" % "3.2.15",
"org.scalatest" %% "scalatest" % "3.2.15" % "test",
"com.stephenn" %% "scalatest-json-jsonassert" % "0.2.5" % "test",
"org.testcontainers" % "testcontainers" % "1.18.0" % "test"),
"org.testcontainers" % "testcontainers" % "1.18.0" % "test",
// add opensearch-java client to get node stats
"org.opensearch.client" % "opensearch-java" % "2.6.0" % "test"
exclude ("com.fasterxml.jackson.core", "jackson-databind")),
libraryDependencies ++= deps(sparkVersion),
Test / fullClasspath ++= Seq((flintSparkIntegration / assembly).value, (pplSparkIntegration / assembly).value))

@@ -164,13 +165,37 @@ lazy val standaloneCosmetic = project
Compile / packageBin := (flintSparkIntegration / assembly).value)

lazy val sparkSqlApplication = (project in file("spark-sql-application"))
// dependency will be provided at runtime, so it doesn't need to be included in the assembled JAR
.dependsOn(flintSparkIntegration % "provided")
.settings(
commonSettings,
name := "sql-job",
scalaVersion := scala212,
libraryDependencies ++= Seq(
"org.scalatest" %% "scalatest" % "3.2.15" % "test"),
libraryDependencies ++= deps(sparkVersion))
libraryDependencies ++= deps(sparkVersion),
libraryDependencies += "com.typesafe.play" %% "play-json" % "2.9.2",
// Assembly settings
// The sbt assembly plugin found multiple copies of module-info.class with
// different contents in the jars it was merging for the flintCore dependencies.
// This can happen when multiple dependencies include the same library
// but with different versions.
assemblyPackageScala / assembleArtifact := false,
assembly / assemblyOption ~= {
_.withIncludeScala(false)
},
assembly / assemblyMergeStrategy := {
case PathList(ps@_*) if ps.last endsWith ("module-info.class") =>
MergeStrategy.discard
case PathList("module-info.class") => MergeStrategy.discard
case PathList("META-INF", "versions", xs@_, "module-info.class") =>
MergeStrategy.discard
case x =>
val oldStrategy = (assembly / assemblyMergeStrategy).value
oldStrategy(x)
},
assembly / test := (Test / test).value
)

lazy val sparkSqlApplicationCosmetic = project
.settings(
@@ -180,6 +205,14 @@ lazy val sparkSqlApplicationCosmetic = project
exportJars := true,
Compile / packageBin := (sparkSqlApplication / assembly).value)

lazy val sparkPPLCosmetic = project
.settings(
name := "opensearch-spark-ppl",
commonSettings,
releaseSettings,
exportJars := true,
Compile / packageBin := (pplSparkIntegration / assembly).value)

lazy val releaseSettings = Seq(
publishMavenStyle := true,
publishArtifact := true,
59 changes: 59 additions & 0 deletions docs/PPL-on-Spark.md
@@ -0,0 +1,59 @@
# Running PPL On Spark Reference Manual

## Overview

This module provides support for running [PPL](https://github.com/opensearch-project/piped-processing-language) queries on Spark using direct translation
from PPL's logical plan to Spark's Catalyst logical plan.

### What is PPL?
OpenSearch PPL, or Piped Processing Language, is a query language used with the OpenSearch platform and now Apache Spark.
PPL allows users to retrieve, query, and analyze data using commands that are piped together, making it easier to understand and compose complex queries.
Its syntax is inspired by Unix pipes, which enables chaining of commands to transform and process data.
With PPL, users can filter, aggregate, and visualize data across multiple datasources in a more intuitive manner than with traditional query languages.
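For illustration, a short PPL query that pipes a filter into an aggregation might look like the following (the index name `logs` and the fields `status` and `host` are hypothetical):

```
source = logs | where status = 500 | stats count() by host
```

Each stage reads the output of the previous one: `source` selects the dataset, `where` filters rows, and `stats` aggregates the result.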

### Context

The following goals motivated the introduction of this functionality:
- Transforming PPL to become the OpenSearch default query language (specifically for logs/traces/metrics signals)
- Promoting PPL as a viable candidate for the proposed CNCF Observability universal query language
- Seamlessly interacting with different datasources (such as S3 / Prometheus / data lakes) by leveraging Spark execution
- Using Spark's federative capabilities as a general-purpose query engine to facilitate complex queries, including joins
- Improving and promoting PPL to become an extensible, general-purpose query language adopted by the community


### Running PPL Commands:

In order to run PPL commands, you will need to perform the following tasks:

#### PPL Build & Run

To build and run PPL in Spark, you can run:

```
sbt clean sparkPPLCosmetic/publishM2
```
then add `org.opensearch:opensearch-spark-ppl_2.12` when running your Spark application, for example:
```
bin/spark-shell --packages "org.opensearch:opensearch-spark-ppl_2.12:0.1.0-SNAPSHOT"
```

### PPL Extension Usage

To use the PPL-to-Spark translation, run Spark with the PPL extension:

```
spark-sql --conf "spark.sql.extensions=org.opensearch.flint.FlintPPLSparkExtensions"
```

### Running With Both Flint & PPL Extensions
To use both the Flint and PPL extensions, add both jars (`org.opensearch:opensearch-spark-ppl_2.12:0.1.0-SNAPSHOT`, `org.opensearch:opensearch-spark_2.12:0.1.0-SNAPSHOT`) to the cluster's
classpath.

Next, configure both extensions:
```
spark-sql --conf "spark.sql.extensions='org.opensearch.flint.FlintPPLSparkExtensions, org.opensearch.flint.FlintSparkExtensions'"
```

Once this is done, Spark will let both extensions parse queries (SQL / PPL) and execute them correctly.
In addition, PPL queries benefit from the acceleration capabilities provided by the Flint plugins, as described [here](index.md).
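As a sketch of what usage could look like once the extension is configured as above — the table name `testTable` and its columns are hypothetical, and this assumes PPL text is accepted through the regular `spark.sql` entry point:

```scala
// Sketch only: assumes a Spark session started with the PPL extension
// configured as shown above, and a hypothetical table `testTable`.
val results = spark.sql("source = testTable | where age > 30 | fields name, age")
results.show()
```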

