Commit: Merge branch 'main' into improve-error-handling
Showing 9 changed files with 541 additions and 43 deletions.
@@ -0,0 +1,60 @@

# OpenSearch Table

## Overview

The `OpenSearchCatalog` class integrates Apache Spark with OpenSearch, allowing Spark to interact with OpenSearch indices as tables. The integration currently supports read operations (see Limitation below), enabling seamless data processing and querying across Spark and OpenSearch.

## Configuration Parameters

To configure the `OpenSearchCatalog`, set the following parameters in your Spark session:

- **`opensearch.port`**: The port to connect to OpenSearch. Default is `9200`.
- **`opensearch.scheme`**: The scheme to use for the connection. Default is `http`. Valid values are `[http, https]`.
- **`opensearch.auth`**: The authentication method to use. Default is `noauth`. Valid values are `[noauth, sigv4, basic]`.
- **`opensearch.auth.username`**: The username for basic authentication.
- **`opensearch.auth.password`**: The password for basic authentication.
- **`opensearch.region`**: The AWS region to use for SigV4 authentication. Default is `us-west-2`. Used only when `opensearch.auth` is `sigv4`.
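
For example, SigV4 authentication against an AWS-managed domain might combine these parameters as in the following sketch (the catalog name `dev`, the port `443`, and the region are illustrative values, not defaults):

```scala
spark.conf.set("spark.sql.catalog.dev", "org.apache.spark.opensearch.catalog.OpenSearchCatalog")
spark.conf.set("spark.sql.catalog.dev.opensearch.port", "443")
spark.conf.set("spark.sql.catalog.dev.opensearch.scheme", "https")
spark.conf.set("spark.sql.catalog.dev.opensearch.auth", "sigv4")
spark.conf.set("spark.sql.catalog.dev.opensearch.region", "us-east-1")
```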

## Usage

### Initializing the Catalog

To configure and initialize the catalog in your Spark session, set the following configurations:

```scala
spark.conf.set("spark.sql.catalog.dev", "org.apache.spark.opensearch.catalog.OpenSearchCatalog")
spark.conf.set("spark.sql.catalog.dev.opensearch.port", "9200")
spark.conf.set("spark.sql.catalog.dev.opensearch.scheme", "http")
spark.conf.set("spark.sql.catalog.dev.opensearch.auth", "noauth")
```
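
For basic authentication, a variant of the same sketch (the username and password values are placeholders):

```scala
spark.conf.set("spark.sql.catalog.dev.opensearch.auth", "basic")
spark.conf.set("spark.sql.catalog.dev.opensearch.auth.username", "<username>")
spark.conf.set("spark.sql.catalog.dev.opensearch.auth.password", "<password>")
```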

### Querying Data

Once the catalog is configured, you can use Spark SQL to query OpenSearch indices as tables:

- The namespace **MUST** be `default`.
- When using a wildcard index name, it **MUST** be wrapped in backticks.

Example:

```scala
val df = spark.sql("SELECT * FROM dev.default.my_index")
df.show()
```

Using a wildcard index name:

```scala
val df = spark.sql("SELECT * FROM dev.default.`my_index*`")
df.show()
```
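
The catalog also splits a table name on commas (see `OpenSearchCatalog.indexNames` below), so several indices can be addressed in one query, again wrapped in backticks. A sketch, assuming the indices have compatible schemas, since only the first index's schema is used:

```scala
val df = spark.sql("SELECT * FROM dev.default.`index1,index2`")
df.show()
```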

## Limitation

### Catalog Operations

- List Tables: Not supported.
- Create Table: Not supported.
- Alter Table: Not supported.
- Drop Table: Not supported.
- Rename Table: Not supported.

### Table Operations

- Tables support read operations only, for instance `SELECT` and `DESCRIBE`.
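
For instance, a read-only metadata query (a sketch, assuming the `dev` catalog configured above):

```scala
spark.sql("DESCRIBE TABLE dev.default.my_index").show()
```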
132 changes: 132 additions & 0 deletions
...rk-integration/src/main/scala/org/apache/spark/opensearch/catalog/OpenSearchCatalog.scala
@@ -0,0 +1,132 @@

/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.apache.spark.opensearch.catalog

import org.apache.spark.internal.Logging
import org.apache.spark.opensearch.catalog.OpenSearchCatalog.OPENSEARCH_PREFIX
import org.apache.spark.sql.catalyst.analysis.{NoSuchNamespaceException, NoSuchTableException}
import org.apache.spark.sql.connector.catalog._
import org.apache.spark.sql.connector.expressions.Transform
import org.apache.spark.sql.flint.FlintReadOnlyTable
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

/**
 * A Spark TableCatalog implementation that wraps an OpenSearch domain as a catalog.
 *
 * <p> Configuration parameters for OpenSearchCatalog:
 *
 * <ul>
 * <li><code>opensearch.port</code>: Default is 9200.</li>
 * <li><code>opensearch.scheme</code>: Default is http. Valid values are [http, https].</li>
 * <li><code>opensearch.auth</code>: Default is noauth. Valid values are [noauth, sigv4, basic].</li>
 * <li><code>opensearch.auth.username</code>: Basic auth username.</li>
 * <li><code>opensearch.auth.password</code>: Basic auth password.</li>
 * <li><code>opensearch.region</code>: Default is us-west-2. Only used when auth is sigv4.</li>
 * </ul>
 */
class OpenSearchCatalog extends CatalogPlugin with TableCatalog with Logging {

  private var catalogName: String = _
  private var options: CaseInsensitiveStringMap = _

  override def initialize(name: String, options: CaseInsensitiveStringMap): Unit = {
    this.catalogName = name
    this.options = options
  }

  override def name(): String = catalogName

  @throws[NoSuchNamespaceException]
  override def listTables(namespace: Array[String]): Array[Identifier] = {
    throw new UnsupportedOperationException("OpenSearchCatalog does not support listTables")
  }

  @throws[NoSuchTableException]
  override def loadTable(ident: Identifier): Table = {
    logInfo(s"Loading table ${ident.name()}")
    // Only the reserved `default` namespace is accepted.
    if (!ident.namespace().exists(n => OpenSearchCatalog.isDefaultNamespace(n))) {
      throw new NoSuchTableException(ident.namespace().mkString("."), ident.name())
    }

    // Strip the `opensearch.` prefix from the catalog options and pass the index
    // name (possibly a wildcard or a comma-separated list) through as `path`.
    val conf = new java.util.HashMap[String, String](
      removePrefixFromMap(options.asCaseSensitiveMap(), OPENSEARCH_PREFIX))
    conf.put("path", ident.name())

    new FlintReadOnlyTable(conf, Option.empty)
  }

  override def createTable(
      ident: Identifier,
      schema: StructType,
      partitions: Array[Transform],
      properties: java.util.Map[String, String]): Table = {
    throw new UnsupportedOperationException("OpenSearchCatalog does not support createTable")
  }

  override def alterTable(ident: Identifier, changes: TableChange*): Table = {
    throw new UnsupportedOperationException("OpenSearchCatalog does not support alterTable")
  }

  override def dropTable(ident: Identifier): Boolean = {
    throw new UnsupportedOperationException("OpenSearchCatalog does not support dropTable")
  }

  override def renameTable(oldIdent: Identifier, newIdent: Identifier): Unit = {
    throw new UnsupportedOperationException("OpenSearchCatalog does not support renameTable")
  }

  // Copies the map, dropping the given prefix from any key that carries it.
  private def removePrefixFromMap(
      map: java.util.Map[String, String],
      prefix: String): java.util.Map[String, String] = {
    val result = new java.util.HashMap[String, String]()
    map.forEach { (key, value) =>
      if (key.startsWith(prefix)) {
        val newKey = key.substring(prefix.length)
        result.put(newKey, value)
      } else {
        result.put(key, value)
      }
    }
    result
  }
}

object OpenSearchCatalog {

  /**
   * The reserved namespace.
   */
  val RESERVED_DEFAULT_NAMESPACE: String = "default"

  /**
   * The prefix for OpenSearch-related configuration keys.
   */
  val OPENSEARCH_PREFIX: String = "opensearch."

  /**
   * Checks if the given namespace is the reserved default namespace.
   *
   * @param namespace
   *   The namespace to check.
   * @return
   *   True if the namespace is the reserved default namespace, false otherwise.
   */
  def isDefaultNamespace(namespace: String): Boolean = {
    RESERVED_DEFAULT_NAMESPACE.equalsIgnoreCase(namespace)
  }

  /**
   * Splits the table name into index names.
   *
   * @param tableName
   *   The name of the table, potentially containing comma-separated index names.
   * @return
   *   An array of index names.
   */
  def indexNames(tableName: String): Array[String] = {
    tableName.split(",")
  }
}
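
A brief sketch of how the catalog above resolves identifiers (the index names are hypothetical; `dev` is the catalog name from the configuration):

```scala
// Resolved via OpenSearchCatalog.loadTable; the namespace must be `default`.
val single = spark.table("dev.default.my_index")
// Wildcards and comma-separated lists are passed through as the table's `path` option.
val multi = spark.table("dev.default.`my_index*`")
// Any other namespace raises NoSuchTableException.
```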
65 changes: 65 additions & 0 deletions
flint-spark-integration/src/main/scala/org/apache/spark/sql/flint/FlintReadOnlyTable.scala
@@ -0,0 +1,65 @@

/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */

package org.apache.spark.sql.flint

import java.util

import scala.collection.JavaConverters._

import org.opensearch.flint.core.FlintClientBuilder

import org.apache.spark.opensearch.catalog.OpenSearchCatalog
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.connector.catalog.{SupportsRead, Table, TableCapability}
import org.apache.spark.sql.connector.catalog.TableCapability.BATCH_READ
import org.apache.spark.sql.connector.read.ScanBuilder
import org.apache.spark.sql.flint.config.FlintSparkConf
import org.apache.spark.sql.flint.datatype.FlintDataType
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

/**
 * A read-only Spark Table backed by one or more OpenSearch indices.
 *
 * @param conf
 *   configuration
 * @param userSpecifiedSchema
 *   optional user-specified schema; when empty, the schema is derived from index metadata
 */
class FlintReadOnlyTable(
    val conf: util.Map[String, String],
    val userSpecifiedSchema: Option[StructType])
    extends Table
    with SupportsRead {

  lazy val sparkSession = SparkSession.active

  lazy val flintSparkConf: FlintSparkConf = FlintSparkConf(conf)

  lazy val name: String = flintSparkConf.tableName()

  // TODO: currently the schema of the first index is used when multiple indices
  // are queried; the StructTypes should be merged to widen the types.
  lazy val schema: StructType = {
    userSpecifiedSchema.getOrElse {
      FlintClientBuilder
        .build(flintSparkConf.flintOptions())
        .getAllIndexMetadata(OpenSearchCatalog.indexNames(name): _*)
        .values()
        .asScala
        .headOption
        .map(m => FlintDataType.deserialize(m.getContent))
        .getOrElse(StructType(Nil))
    }
  }

  override def capabilities(): util.Set[TableCapability] =
    util.EnumSet.of(BATCH_READ)

  override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = {
    FlintScanBuilder(name, schema, flintSparkConf)
  }
}
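
Since `capabilities()` advertises only `BATCH_READ`, the table accepts scans but no writes; a sketch of what that implies for callers (assuming the `dev` catalog from the README):

```scala
// Reads resolve through newScanBuilder:
spark.sql("SELECT count(*) FROM dev.default.my_index").show()
// Writes fail analysis because BATCH_WRITE is not advertised, e.g.:
// spark.sql("INSERT INTO dev.default.my_index VALUES (...)")  // AnalysisException
```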