Skip to content

Commit

Permalink
Add a java based shared table reader that uses Kernel APIs
Browse files Browse the repository at this point in the history
  • Loading branch information
vkorukanti committed Aug 28, 2023
1 parent ea6b2c0 commit 29a91e3
Show file tree
Hide file tree
Showing 6 changed files with 467 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1026,4 +1026,62 @@ class DeltaSharingRestClientSuite extends DeltaSharingIntegrationTest {
}
}
}

integrationTest("kernel:getFiles") {
  // Fetches the table state and scan files in kernel response format from the
  // sharing server, then reads the data locally through the Delta Kernel APIs.
  val client = new DeltaSharingRestClient(
    testProfileProvider,
    sslTrustAll = true,
    responseFormat = DeltaSharingRestClient.RESPONSE_FORMAT_KERNEL
  )
  try {
    val tableFiles =
      client.getFiles(
        Table(name = "hackathon_dv_table", schema = "default", share = "share1"),
        Nil,
        None,
        None,
        None,
        None
      )

    // The first element is the serialized scan state; the rest are scan files.
    val scanStateJson = tableFiles.kernelStateAndScanFiles.head
    val scanFilesJson = tableFiles.kernelStateAndScanFiles.drop(1)

    val hadoopConf = new Configuration() {
      {
        // Fix: a plain Hadoop Configuration does not understand the
        // "spark.hadoop." prefix (that prefix is only stripped by Spark when
        // it copies its conf into a Hadoop conf), so the S3A credentials
        // provider key must be set directly.
        set("fs.s3a.aws.credentials.provider",
          "com.amazonaws.auth.EnvironmentVariableCredentialsProvider")
        set("fs.s3a.endpoint", "s3.us-west-2.amazonaws.com")
      }
    }
    val tableClient = DefaultTableClient.create(hadoopConf)

    val scanState = KernelUtils.deserializeRowFromJson(tableClient, scanStateJson)
    val scanFiles = scanFilesJson.map { scanFileJson =>
      KernelUtils.deserializeRowFromJson(tableClient, scanFileJson)
    }

    var readRecordCount = 0
    val maxRowCount = 100
    val data = Scan.readData(
      tableClient,
      scanState,
      KernelUtils.convertToCloseableIterator(scanFiles),
      Optional.empty())
    breakable {
      try {
        while (data.hasNext) {
          val dataReadResult = data.next()
          readRecordCount += KernelUtils.printData(dataReadResult, maxRowCount - readRecordCount)
          if (readRecordCount >= maxRowCount) {
            break() // stop once the row budget is exhausted
          }
        }
      } finally {
        data.asInstanceOf[Closeable].close()
      }
    }
  } finally {
    // Fix: close the client even when the request or the read fails;
    // previously an exception here leaked the underlying HTTP connection.
    client.close()
  }
}
}
99 changes: 99 additions & 0 deletions examples/java/kernel-based-java-client/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
<?xml version="1.0" encoding="UTF-8"?>

<!--Copyright (2023) The Delta Lake Project Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.-->

<!-- Example standalone Java client that reads Delta Sharing tables through
     the Delta Kernel APIs. Build with: mvn package -Dstaging.repo.url=<url> -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>org.example</groupId>
<artifactId>kernel-based-java-client</artifactId>
<version>0.1-SNAPSHOT</version>

<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<!-- Placeholder; SNAPSHOT dependencies below are resolved from a staging
     repository supplied on the command line via -Dstaging.repo.url=... -->
<staging.repo.url>""</staging.repo.url>
<delta-kernel.version>3.0.0-SNAPSHOT</delta-kernel.version>
<delta-sharing.version>1.0.0-SNAPSHOT</delta-sharing.version>
<hadoop.version>3.3.1</hadoop.version>
</properties>

<repositories>
<repository>
<id>staging-repo</id>
<url>${staging.repo.url}</url>
</repository>
</repositories>

<dependencies>
<!-- Delta Kernel: public API plus the default (Hadoop-based) implementation. -->
<dependency>
<groupId>io.delta</groupId>
<artifactId>delta-kernel-api</artifactId>
<version>${delta-kernel.version}</version>
</dependency>

<dependency>
<groupId>io.delta</groupId>
<artifactId>delta-kernel-defaults</artifactId>
<version>${delta-kernel.version}</version>
</dependency>

<!-- Delta Sharing client is published for Scala 2.12 only, hence the
     explicit scala-library and jackson-module-scala_2.12 entries below. -->
<dependency>
<groupId>io.delta</groupId>
<artifactId>delta-sharing-client_2.12</artifactId>
<version>${delta-sharing.version}</version>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-runtime</artifactId>
<version>${hadoop.version}</version>
</dependency>

<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client-api</artifactId>
<version>${hadoop.version}</version>
</dependency>

<!-- Command line option parsing for the example main classes. -->
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.5.0</version>
</dependency>

<dependency>
<groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-databind</artifactId>
<version>2.13.5</version>
</dependency>

<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.12.11</version>
</dependency>

<dependency>
<groupId>com.fasterxml.jackson.module</groupId>
<artifactId>jackson-module-scala_2.12</artifactId>
<version>2.13.5</version>
</dependency>

</dependencies>
</project>

Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
* Copyright (2023) The Delta Lake Project Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.delta.sharing.examples;

import java.util.Arrays;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;

import io.delta.kernel.data.ColumnVector;
import io.delta.kernel.data.ColumnarBatch;
import io.delta.kernel.data.DataReadResult;
import io.delta.kernel.defaults.internal.data.vector.VectorUtils;
import io.delta.kernel.types.StructType;

/**
* Base class for reading Delta Lake tables using the Delta Kernel APIs.
*/
/**
 * Base class for reading Delta Lake tables using the Delta Kernel APIs.
 * <p>
 * Provides static helpers for printing the rows/schema of a
 * {@link ColumnarBatch} and for parsing the command line options shared by
 * the concrete reader entry points. All members are static; subclasses only
 * supply a {@code main} method.
 */
public abstract class BaseTableReader
{
    /**
     * Prints up to {@code maxRowsToPrint} selected rows from the result.
     * Rows whose selection-vector entry is {@code false} are skipped
     * (they were removed, e.g. by a deletion vector).
     *
     * @param dataReadResult data plus optional selection vector from a kernel scan
     * @param maxRowsToPrint upper bound on rows to print; values &lt;= 0 print nothing
     * @return the number of rows actually printed
     */
    protected static int printData(DataReadResult dataReadResult, int maxRowsToPrint)
    {
        // Guard against a non-positive budget. Without it, the equality check
        // below could never fire (printedRowCount starts above the limit only
        // after the first print), so every row in the batch would be printed.
        if (maxRowsToPrint <= 0) {
            return 0;
        }
        int printedRowCount = 0;
        ColumnarBatch data = dataReadResult.getData();
        Optional<ColumnVector> selectionVector = dataReadResult.getSelectionVector();
        for (int rowId = 0; rowId < data.getSize(); rowId++) {
            // An absent selection vector means every row is selected.
            if (!selectionVector.isPresent() || selectionVector.get().getBoolean(rowId)) {
                printRow(data, rowId);
                printedRowCount++;
                // >= (not ==) so the loop still terminates if the count ever
                // overshoots the limit.
                if (printedRowCount >= maxRowsToPrint) {
                    break;
                }
            }
        }
        return printedRowCount;
    }

    /**
     * Prints the schema's field names as a single column-aligned header row.
     */
    protected static void printSchema(StructType schema)
    {
        System.out.printf(formatter(schema.length()), schema.fieldNames().toArray(new String[0]));
    }

    /**
     * Prints one row of the batch, one column-aligned cell per column.
     */
    protected static void printRow(ColumnarBatch batch, int rowId)
    {
        int numCols = batch.getSchema().length();
        Object[] rowValues = IntStream.range(0, numCols).mapToObj(colOrdinal -> {
            ColumnVector columnVector = batch.getColumnVector(colOrdinal);
            return VectorUtils.getValueAsObject(columnVector, rowId);
        }).toArray();

        // TODO: Need to handle the Row, Map, Array, Timestamp, Date types specially to
        // print them in the format they need. Copy this code from Spark CLI.

        System.out.printf(formatter(numCols), rowValues);
    }

    /**
     * Minimum command line options for any implementation of this reader:
     * a required table path, an optional column list and an optional row limit.
     */
    protected static Options baseOptions()
    {
        return new Options()
            .addRequiredOption("t", "table", true, "Fully qualified table path")
            .addOption("c", "columns", true,
                "Comma separated list of columns to read from the table. " +
                    "Ex. --columns=id,name,address")
            .addOption(
                Option.builder()
                    .option("l")
                    .longOpt("limit")
                    .hasArg(true)
                    .desc("Maximum number of rows to read from the table (default 20).")
                    .type(Number.class)
                    .build()
            );
    }

    /**
     * Parses the command line arguments against {@code options}.
     * <p>
     * NOTE: on a parse failure this prints usage and terminates the JVM via
     * {@code System.exit(-1)}; it never returns {@code null} to the caller.
     *
     * @param mainClassName used in the printed usage string
     */
    protected static CommandLine parseArgs(String mainClassName, Options options, String[] args)
    {
        CommandLineParser cliParser = new DefaultParser();

        try {
            return cliParser.parse(options, args);
        }
        catch (ParseException parseException) {
            new HelpFormatter().printHelp(
                "java " + mainClassName,
                options,
                true
            );
        }
        System.exit(-1);
        return null; // unreachable; keeps the compiler happy
    }

    /**
     * Splits the comma-separated value of {@code optionName} into a column
     * list; empty when the option was not given.
     */
    protected static Optional<List<String>> parseColumnList(CommandLine cli, String optionName)
    {
        return Optional.ofNullable(cli.getOptionValue(optionName))
            .map(colString -> Arrays.asList(colString.split(",[ ]*")));
    }

    /**
     * Returns the int value of {@code optionName}, or {@code defaultValue}
     * when the option was not given.
     *
     * @throws ParseException if the option value cannot be parsed as a number
     */
    protected static int parseInt(CommandLine cli, String optionName, int defaultValue)
        throws ParseException
    {
        return Optional.ofNullable(cli.getParsedOptionValue(optionName))
            .map(Number.class::cast)
            .map(Number::intValue)
            .orElse(defaultValue);
    }

    /** Builds a printf format of {@code length} 20-char cells joined by '|'. */
    private static String formatter(int length)
    {
        return IntStream.range(0, length)
            .mapToObj(i -> "%20s")
            .collect(Collectors.joining("|")) + "\n";
    }
}

Loading

0 comments on commit 29a91e3

Please sign in to comment.