-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge PR changes of covering index rewriting for Iceberg table
Signed-off-by: Chen Dai <[email protected]>
- Loading branch information
Showing
10 changed files
with
309 additions
and
32 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
35 changes: 35 additions & 0 deletions
35
...tegration/src/main/scala/org/opensearch/flint/spark/source/FlintSparkSourceRelation.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.flint.spark.source | ||
|
||
import org.apache.spark.sql.catalyst.expressions.AttributeReference | ||
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
|
||
/**
 * Uniform view over a table that Flint reads from. Each kind of source data format (Spark
 * built-in File, Delta table, Iceberg, etc.) supplies its own implementation, so callers can work
 * with any of them without depending on the specifics of a data source implementation.
 */
trait FlintSparkSourceRelation {

  /**
   * @return
   *   the concrete logical plan node wrapped by this relation
   */
  def plan: LogicalPlan

  /**
   * @return
   *   the fully qualified name of the table this relation represents
   */
  def tableName: String

  /**
   * @return
   *   the list of output columns produced by this relation
   */
  def output: Seq[AttributeReference]
}
81 changes: 81 additions & 0 deletions
81
...n/src/main/scala/org/opensearch/flint/spark/source/FlintSparkSourceRelationProvider.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.flint.spark.source | ||
|
||
import org.opensearch.flint.spark.source.file.FileSourceRelationProvider | ||
import org.opensearch.flint.spark.source.iceberg.IcebergSourceRelationProvider | ||
|
||
import org.apache.spark.internal.Logging | ||
import org.apache.spark.sql.SparkSession | ||
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
|
||
/**
 * Describes one kind of logical plan that the Flint Spark integration can work with. A provider
 * plays a role similar to a Scala extractor used in a `match` expression: it recognizes a plan
 * and wraps it in a source relation. Providers are kept in a list instead of being matched
 * directly because Flint wants to avoid a hard dependency on some data source code, such as
 * Iceberg; a provider is only put in the list when its 3rd party library is available in the
 * current Spark session.
 */
trait FlintSparkSourceRelationProvider {

  /**
   * @return
   *   the name identifying this source relation provider
   */
  def name(): String

  /**
   * Tells whether this provider can handle the given logical plan.
   *
   * @param plan
   *   the logical plan to examine
   * @return
   *   true if this provider supports the plan, false otherwise
   */
  def isSupported(plan: LogicalPlan): Boolean

  /**
   * Wraps the given logical plan in a source relation.
   *
   * @param plan
   *   the logical plan to wrap
   * @return
   *   the source relation built from the plan
   */
  def getRelation(plan: LogicalPlan): FlintSparkSourceRelation
}
|
||
/**
 * Companion object that assembles the source relation providers usable in a Spark session.
 */
object FlintSparkSourceRelationProvider extends Logging {

  /** Spark conf key inspected to decide whether the Iceberg provider is enabled. */
  private val SessionCatalogConfKey = "spark.sql.catalog.spark_catalog"

  /** Value of [[SessionCatalogConfKey]] that enables the Iceberg provider. */
  private val IcebergSessionCatalogClass = "org.apache.iceberg.spark.SparkSessionCatalog"

  /**
   * Retrieve all supported source relation providers for the given Spark session.
   *
   * The file source provider is always included. The Iceberg provider is appended after it only
   * when the session catalog configured in Spark conf is the Iceberg session catalog.
   *
   * @param spark
   *   the Spark session whose conf is inspected
   * @return
   *   a sequence of source relation providers, built-in ones first
   */
  def getAllProviders(spark: SparkSession): Seq[FlintSparkSourceRelationProvider] = {
    // File source is built-in supported
    val builtInProviders: Seq[FlintSparkSourceRelationProvider] =
      Seq(new FileSourceRelationProvider)

    // Instantiate the Iceberg provider only if it's enabled in Spark conf, so that the
    // Iceberg-dependent class is never touched otherwise
    val icebergProviders: Seq[FlintSparkSourceRelationProvider] =
      if (isIcebergEnabled(spark)) Seq(new IcebergSourceRelationProvider) else Seq.empty

    val providers = builtInProviders ++ icebergProviders
    val providerNames = providers.map(_.name()).mkString(",")
    logInfo(s"Loaded source relation providers [$providerNames]")
    providers
  }

  /** Checks whether Spark conf declares the Iceberg session catalog as the session catalog. */
  private def isIcebergEnabled(spark: SparkSession): Boolean =
    spark.conf.getOption(SessionCatalogConfKey).contains(IcebergSessionCatalogClass)
}
27 changes: 27 additions & 0 deletions
27
...ntegration/src/main/scala/org/opensearch/flint/spark/source/file/FileSourceRelation.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.flint.spark.source.file | ||
|
||
import org.opensearch.flint.spark.source.FlintSparkSourceRelation | ||
|
||
import org.apache.spark.sql.catalyst.expressions.AttributeReference | ||
import org.apache.spark.sql.execution.datasources.LogicalRelation | ||
|
||
/**
 * Concrete source relation implementation for Spark built-in file-based data sources.
 *
 * @param plan
 *   the `LogicalRelation` that represents the plan associated with the File-based table
 */
case class FileSourceRelation(override val plan: LogicalRelation)
    extends FlintSparkSourceRelation {

  /**
   * @return
   *   the qualified name of the catalog table attached to the relation
   * @throws NoSuchElementException
   *   if the relation has no catalog table attached
   */
  override def tableName: String =
    plan.catalogTable
      .map(_.qualifiedName)
      // The catalog table is expected to be present because the source relation provider is
      // meant to pre-check it; fail with a descriptive message instead of a bare `None.get` if
      // that expectation is ever violated.
      .getOrElse(
        throw new NoSuchElementException(
          "Cannot resolve table name: file source relation has no catalog table"))

  /**
   * @return
   *   the output columns of the wrapped `LogicalRelation`
   */
  override def output: Seq[AttributeReference] = plan.output
}
30 changes: 30 additions & 0 deletions
30
...on/src/main/scala/org/opensearch/flint/spark/source/file/FileSourceRelationProvider.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.flint.spark.source.file | ||
|
||
import org.opensearch.flint.spark.source.{FlintSparkSourceRelation, FlintSparkSourceRelationProvider} | ||
|
||
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
import org.apache.spark.sql.execution.datasources.LogicalRelation | ||
|
||
/**
 * Source relation provider for Spark built-in file-based source.
 *
 * @param name
 *   the name of the file source provider
 */
class FileSourceRelationProvider(override val name: String = "file")
    extends FlintSparkSourceRelationProvider {

  /**
   * Accepts a `LogicalRelation` that has a catalog table attached and is not streaming.
   *
   * @param plan
   *   the logical plan to evaluate
   * @return
   *   true if the plan is such a relation, false otherwise
   */
  override def isSupported(plan: LogicalPlan): Boolean = plan match {
    // Typed pattern plus field checks, so the match does not depend on the extractor's arity
    case relation: LogicalRelation => relation.catalogTable.isDefined && !relation.isStreaming
    case _ => false
  }

  /**
   * Wraps the given plan in a [[FileSourceRelation]].
   *
   * @param plan
   *   the logical plan to wrap, expected to be a `LogicalRelation`
   * @return
   *   the file source relation wrapping the plan
   * @throws IllegalArgumentException
   *   if the plan is not a `LogicalRelation`
   */
  override def getRelation(plan: LogicalPlan): FlintSparkSourceRelation = plan match {
    case relation: LogicalRelation => FileSourceRelation(relation)
    case other =>
      // Explicit failure instead of the ClassCastException an unchecked cast would raise
      throw new IllegalArgumentException(
        s"Plan is not a file-based logical relation: ${other.nodeName}")
  }
}
28 changes: 28 additions & 0 deletions
28
...tion/src/main/scala/org/opensearch/flint/spark/source/iceberg/IcebergSourceRelation.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.flint.spark.source.iceberg | ||
|
||
import org.opensearch.flint.spark.source.FlintSparkSourceRelation | ||
|
||
import org.apache.spark.sql.catalyst.expressions.AttributeReference | ||
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation | ||
|
||
/**
 * Concrete implementation of `FlintSparkSourceRelation` for Iceberg-based data sources. This
 * class encapsulates the handling of relations backed by Iceberg tables, which are built on top
 * of Spark's DataSourceV2 and TableProvider interfaces.
 *
 * @param plan
 *   the `DataSourceV2Relation` that represents the plan associated with the Iceberg table.
 */
case class IcebergSourceRelation(override val plan: DataSourceV2Relation)
    extends FlintSparkSourceRelation {

  /**
   * @return
   *   the string form of the relation's identifier
   * @throws NoSuchElementException
   *   if the relation carries no identifier
   */
  override def tableName: String =
    plan.identifier
      .map(_.toString())
      // The identifier is optional on the relation; fail with a descriptive message instead of
      // a bare `None.get` when it is absent.
      .getOrElse(
        throw new NoSuchElementException(
          s"Cannot resolve table name: Iceberg relation ${plan.table.name()} has no identifier"))

  /**
   * @return
   *   the output columns of the wrapped `DataSourceV2Relation`
   */
  override def output: Seq[AttributeReference] = plan.output
}
40 changes: 40 additions & 0 deletions
40
.../main/scala/org/opensearch/flint/spark/source/iceberg/IcebergSourceRelationProvider.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
/* | ||
* Copyright OpenSearch Contributors | ||
* SPDX-License-Identifier: Apache-2.0 | ||
*/ | ||
|
||
package org.opensearch.flint.spark.source.iceberg | ||
|
||
import org.apache.iceberg.spark.source.SparkTable | ||
import org.opensearch.flint.spark.source.{FlintSparkSourceRelation, FlintSparkSourceRelationProvider} | ||
|
||
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan | ||
import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2ScanRelation} | ||
|
||
/**
 * Source relation provider for Apache Iceberg-based source.
 *
 * @param name
 *   the name of the Iceberg source provider
 */
class IcebergSourceRelationProvider(override val name: String = "iceberg")
    extends FlintSparkSourceRelationProvider {

  /**
   * Extracts the `DataSourceV2Relation` backed by an Iceberg `SparkTable` from a plan, whether
   * the plan is the relation itself or a `DataSourceV2ScanRelation` wrapping it. Undefined for
   * every other plan.
   */
  private val icebergRelation: PartialFunction[LogicalPlan, DataSourceV2Relation] = {
    case v2Relation @ DataSourceV2Relation(_: SparkTable, _, _, _, _) =>
      v2Relation
    case DataSourceV2ScanRelation(
          v2Relation @ DataSourceV2Relation(_: SparkTable, _, _, _, _),
          _,
          _,
          _) =>
      v2Relation
  }

  /**
   * @param plan
   *   the logical plan to evaluate
   * @return
   *   true if an Iceberg-backed relation can be extracted from the plan, false otherwise
   */
  override def isSupported(plan: LogicalPlan): Boolean =
    icebergRelation.isDefinedAt(plan)

  /**
   * @param plan
   *   the logical plan to wrap; must be a plan accepted by `isSupported`
   * @return
   *   an [[IcebergSourceRelation]] wrapping the extracted `DataSourceV2Relation`
   */
  override def getRelation(plan: LogicalPlan): FlintSparkSourceRelation =
    // Applying the partial function to an unsupported plan raises MatchError
    IcebergSourceRelation(icebergRelation(plan))
}
Oops, something went wrong.