From 971b476db29a780b9bfddfe6f8d48c7ec2d884cf Mon Sep 17 00:00:00 2001
From: Maximilian Osenberg
Date: Fri, 17 Jun 2022 14:23:35 +0200
Subject: [PATCH] Add samples demonstrating Hyper's native S3 capabilities

With this commit we add samples from the Tableau Conference Hands-on Training
"Use Hyper as your Cloud Lake Engine" that show how Hyper can natively read
CSV and Parquet files stored on Amazon S3 without the need for external tools.
---
 Community-Supported/native-s3/README.md     | 72 +++++++++++++++++++
 .../native-s3/join-parquet-and-csv-on-s3.py | 55 ++++++++++++++
 .../native-s3/parquet-on-s3-to-hyper.py     | 26 +++++++
 .../native-s3/query-csv-on-s3.py            | 35 +++++++++
 .../native-s3/requirements.txt              |  1 +
 Community-Supported/s3-to-hyper/README.md   |  1 +
 6 files changed, 190 insertions(+)
 create mode 100644 Community-Supported/native-s3/README.md
 create mode 100644 Community-Supported/native-s3/join-parquet-and-csv-on-s3.py
 create mode 100644 Community-Supported/native-s3/parquet-on-s3-to-hyper.py
 create mode 100644 Community-Supported/native-s3/query-csv-on-s3.py
 create mode 100644 Community-Supported/native-s3/requirements.txt

diff --git a/Community-Supported/native-s3/README.md b/Community-Supported/native-s3/README.md
new file mode 100644
index 0000000..a92c832
--- /dev/null
+++ b/Community-Supported/native-s3/README.md
@@ -0,0 +1,72 @@
+# native-s3
+## __native_s3__

+![Community Supported](https://img.shields.io/badge/Support%20Level-Community%20Supported-53bd92.svg)

+__Current Version__: 1.0

+These samples show how Hyper can natively interact with Amazon S3, without the need to install any external dependencies such as boto3 or the AWS CLI.
+They originate from the Tableau Conference 2022 Hands-on Training "Use Hyper as your Cloud Lake Engine" - you can [check out the slides here](https://mkt.tableau.com/tc22/sessions/live/428-HOT-D1_Hands-onUseTheHyperAPI.pdf).

+# Get started

+## __Prerequisites__

+To run the scripts, you will need:

+- a computer running Windows, macOS, or Linux

+- Python 3.9+

+- the dependencies from the `requirements.txt` file installed (`pip install -r requirements.txt`)

+## Run the samples

+The following instructions assume that you have set up a virtual environment for Python. For more information on
+creating virtual environments, see [venv - Creation of virtual environments](https://docs.python.org/3/library/venv.html)
+in the Python Standard Library.

+1. Open a terminal and activate the Python virtual environment (`venv`).

+1. Navigate to the folder where you installed the samples.

+1. Follow the steps below to run one of the samples.

+**Create a `.hyper` file from a Parquet file on S3**
+Run the Python script:
+```bash
+$ python parquet-on-s3-to-hyper.py
+```

+This script reads the Parquet file from `s3://nyc-tlc/trip%20data/yellow_tripdata_2021-06.parquet` (visit [AWS OpenData](https://registry.opendata.aws/nyc-tlc-trip-records-pds/) for more details and the license of the dataset) and inserts the records into a table named `taxi_rides` which is stored in a `.hyper` database file.

+This database file can then be opened directly with Tableau Desktop or Tableau Prep, or it can be published to Tableau Online and Tableau Server as shown in [this example](https://github.com/tableau/hyper-api-samples/tree/main/Community-Supported/publish-hyper).
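
+The heart of `parquet-on-s3-to-hyper.py` is a single `CREATE TABLE ... AS` statement over Hyper's `EXTERNAL` function. The following is a condensed sketch of that pattern, taken from the full sample below (bucket path and table name are the ones used in the sample):

+```python
+from tableauhyperapi import HyperProcess, Connection, Telemetry, CreateMode, TableName, escape_string_literal
+
+TAXI_DATASET = escape_string_literal("s3://nyc-tlc/trip%20data/yellow_tripdata_2021-06.parquet")
+
+# S3 connectivity has to be enabled explicitly as it is still an experimental feature
+with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU,
+                  parameters={"experimental_external_s3": "true"}) as hyper:
+    with Connection(endpoint=hyper.endpoint, database="taxi-rides-2021-06.hyper",
+                    create_mode=CreateMode.CREATE_AND_REPLACE) as connection:
+        taxi_rides = TableName("public", "taxi_rides")
+        # The schema is read from the Parquet file itself, so no column definitions are needed
+        connection.execute_command(
+            f"CREATE TABLE {taxi_rides} AS (SELECT * FROM EXTERNAL(S3_LOCATION({TAXI_DATASET}), FORMAT => 'parquet'))")
+```
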
+**Live query against a `.csv` file stored on AWS S3**
+Run the Python script:
+```bash
+$ python query-csv-on-s3.py
+```

+This script performs a live query on a CSV file stored in a public S3 bucket (`s3://hyper-dev-us-west-2-bucket/tc22-demo/orders_small.csv`) and prints the total sales per product category.

+**Live query with multiple `.parquet` and `.csv` files stored on AWS S3**
+Run the Python script:
+```bash
+$ python join-parquet-and-csv-on-s3.py
+```

+This script performs a live query across multiple `.parquet` files stored on AWS S3. It shows how to use the [`ARRAY` syntax](https://help.tableau.com/current/api/hyper_api/en-us/reference/sql/functions-srf.html#FUNCTIONS-SRF-EXTERNAL) to union multiple `.parquet` files, and how `.parquet` files can be joined with `.csv` files - just as you would expect from normal database tables stored inside a `.hyper` file.

+## __Resources__
+Check out these resources to learn more:

+- [Hyper API docs](https://help.tableau.com/current/api/hyper_api/en-us/index.html)

+- [Tableau Hyper API Reference (Python)](https://help.tableau.com/current/api/hyper_api/en-us/reference/py/index.html)

+- [The EXTERNAL function in the Hyper API SQL Reference](https://help.tableau.com/current/api/hyper_api/en-us/reference/sql/functions-srf.html#FUNCTIONS-SRF-EXTERNAL)

+- [AWS command line tools documentation](https://docs.aws.amazon.com/cli/latest/reference/s3/cp.html), e.g. if you want to download some of the sample files to your local machine and explore them
\ No newline at end of file
diff --git a/Community-Supported/native-s3/join-parquet-and-csv-on-s3.py b/Community-Supported/native-s3/join-parquet-and-csv-on-s3.py
new file mode 100644
index 0000000..80edc4a
--- /dev/null
+++ b/Community-Supported/native-s3/join-parquet-and-csv-on-s3.py
@@ -0,0 +1,55 @@
+from tableauhyperapi import HyperProcess, Connection, Telemetry, CreateMode, SqlType, TableDefinition, TableName, Nullability, Inserter, escape_string_literal

+ORDERS_DATASET_2018 = escape_string_literal("s3://hyper-dev-us-west-2-bucket/tc22-demo/orders_2018.parquet")
+ORDERS_DATASET_2019 = escape_string_literal("s3://hyper-dev-us-west-2-bucket/tc22-demo/orders_2019.parquet")
+ORDERS_DATASET_2020 = escape_string_literal("s3://hyper-dev-us-west-2-bucket/tc22-demo/orders_2020.parquet")
+ORDERS_DATASET_2021 = escape_string_literal("s3://hyper-dev-us-west-2-bucket/tc22-demo/orders_2021.parquet")

+# CSV file which contains the orders that were returned by the customers
+RETURNS_DATASET = escape_string_literal("s3://hyper-dev-us-west-2-bucket/tc22-demo/returns.csv")

+# We need to manually enable S3 connectivity as this is still an experimental feature
+with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU, parameters={"experimental_external_s3": "true"}) as hyper:
+    # Create a connection to the Hyper process - we do not connect to a database
+    with Connection(endpoint=hyper.endpoint) as connection:

+        # We use the `ARRAY` syntax in the CREATE TEMP EXTERNAL TABLE statement to specify multiple files to be unioned
+        create_ext_orders_table = f"""
+            CREATE TEMP EXTERNAL TABLE orders
+            FOR ARRAY[ S3_LOCATION({ORDERS_DATASET_2018}, REGION => 'us-west-2'),
+                       S3_LOCATION({ORDERS_DATASET_2019}, REGION => 'us-west-2'),
+                       S3_LOCATION({ORDERS_DATASET_2020}, REGION => 'us-west-2'),
+                       S3_LOCATION({ORDERS_DATASET_2021}, REGION => 'us-west-2')]
+            WITH (FORMAT => 'parquet')
+        """
+        connection.execute_command(create_ext_orders_table)
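
+        # Optional sanity check (illustrative addition, not part of the original sample):
+        # count the rows that were unioned from the four yearly Parquet files into the single `orders` table
+        order_count = connection.execute_scalar_query("SELECT COUNT(*) FROM orders")
+        print(f"The unioned orders table contains {order_count} rows")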

+        # Create the `returns` table, also as an external table (backed by a CSV file on S3)
+        create_ext_returns_table = f"""
+            CREATE TEMP EXTERNAL TABLE returns(
+                returned TEXT,
+                order_id TEXT
+            )
+            FOR S3_LOCATION({RETURNS_DATASET}, REGION => 'us-west-2')
+            WITH (FORMAT => 'csv', HEADER => 'true', DELIMITER => ';')
+        """
+        connection.execute_command(create_ext_returns_table)

+        # Select the total sales amount per category from the Parquet-backed orders table
+        # and drill down by whether the orders were returned or not (using the CSV-backed returns table)
+        query = """SELECT category,
+                          (CASE WHEN returned IS NULL THEN 'Not Returned' ELSE 'Returned' END) AS return_info,
+                          SUM(sales)
+                   FROM orders
+                   LEFT OUTER JOIN returns ON orders.order_id = returns.order_id
+                   GROUP BY 1, 2
+                   ORDER BY 1, 2"""

+        # Execute the query with `execute_list_query`
+        result = connection.execute_list_query(query)

+        # Iterate over all rows in the result and print them
+        print(f"{'Category':<20} {'Status':<20} Sales")
+        print(f"{'--------':<20} {'------':<20} -----")
+        for row in result:
+            print(f"{row[0]:<20} {row[1]:<20} {row[2]:,.2f} USD")
diff --git a/Community-Supported/native-s3/parquet-on-s3-to-hyper.py b/Community-Supported/native-s3/parquet-on-s3-to-hyper.py
new file mode 100644
index 0000000..c2c562c
--- /dev/null
+++ b/Community-Supported/native-s3/parquet-on-s3-to-hyper.py
@@ -0,0 +1,26 @@
+from tableauhyperapi import HyperProcess, Connection, Telemetry, CreateMode, SqlType, TableDefinition, TableName, Nullability, Inserter, escape_string_literal

+# Details and license of the dataset: https://registry.opendata.aws/nyc-tlc-trip-records-pds/
+TAXI_DATASET = escape_string_literal("s3://nyc-tlc/trip%20data/yellow_tripdata_2021-06.parquet")  # the May release fixes a bug so that %20 no longer needs to be escaped manually

+# We need to manually enable S3 connectivity as this is still an experimental feature
+with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU, parameters={"experimental_external_s3": "true"}) as hyper:
+    # Create a connection to the Hyper process and let it create a database file - if it already exists, it is overwritten
+    with Connection(endpoint=hyper.endpoint, database="taxi-rides-2021-06.hyper", create_mode=CreateMode.CREATE_AND_REPLACE) as connection:

+        # Use `TableName` so we do not have to worry about escaping in the SQL query we generate below
+        # Note: This line does not create a table in Hyper, it just defines a name
+        taxi_rides = TableName("public", "taxi_rides")

+        # Ingest the data from the Parquet file into a Hyper table
+        # Since the schema is stored inside the Parquet file, we don't need to specify it explicitly here
+        cmd = f"CREATE TABLE {taxi_rides}" \
+              f" AS ( SELECT * FROM EXTERNAL(S3_LOCATION({TAXI_DATASET}), FORMAT => 'parquet'))"

+        # We use `execute_command` to send the CREATE TABLE statement to Hyper
+        # This may take some time depending on your network connectivity to AWS S3
+        connection.execute_command(cmd)

+        # Let's check how many rows we loaded
+        ride_count = connection.execute_scalar_query(f"SELECT COUNT(*) FROM {taxi_rides}")
+        print(f"Loaded {ride_count} taxi rides")
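
+        # Illustrative follow-up (not part of the original sample; the column name `trip_distance`
+        # is assumed from the public NYC yellow taxi schema): the data now lives in the local
+        # `.hyper` file, so this query no longer touches S3
+        avg_distance = connection.execute_scalar_query(f"SELECT AVG(trip_distance) FROM {taxi_rides}")
+        print(f"Average trip distance: {avg_distance:.2f} miles")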
diff --git a/Community-Supported/native-s3/query-csv-on-s3.py b/Community-Supported/native-s3/query-csv-on-s3.py
new file mode 100644
index 0000000..c42a39e
--- /dev/null
+++ b/Community-Supported/native-s3/query-csv-on-s3.py
@@ -0,0 +1,35 @@
+from tableauhyperapi import HyperProcess, Connection, Telemetry, CreateMode, SqlType, TableDefinition, TableName, Nullability, Inserter, escape_string_literal

+ORDERS_DATASET_S3 = escape_string_literal("s3://hyper-dev-us-west-2-bucket/tc22-demo/orders_small.csv")

+# We need to manually enable S3 connectivity as this is still an experimental feature
+with HyperProcess(telemetry=Telemetry.SEND_USAGE_DATA_TO_TABLEAU, parameters={"experimental_external_s3": "true"}) as hyper:
+    # Create a connection to the Hyper process - we do not connect to a database
+    with Connection(endpoint=hyper.endpoint) as connection:

+        # Use the CREATE TEMP EXTERNAL TABLE syntax - this allows us to use the CSV file like a normal table name in SQL queries
+        # We do not need to specify credentials as the S3 bucket is publicly accessible; this may be different when used with your own data
+        create_external_table = f"""
+            CREATE TEMP EXTERNAL TABLE orders(
+                order_date DATE,
+                product_id TEXT,
+                category TEXT,
+                sales DOUBLE PRECISION
+            )
+            FOR S3_LOCATION({ORDERS_DATASET_S3}, REGION => 'us-west-2')
+            WITH (FORMAT => 'csv', HEADER => true)
+        """
+        # Create the external table using `execute_command` which sends an instruction to the database - we don't expect a result value
+        connection.execute_command(create_external_table)

+        # Select the total sales amount per category from the external table
+        query = """SELECT category, SUM(sales)
+                   FROM orders
+                   GROUP BY category"""

+        # Execute the query with `execute_list_query` as we expect multiple rows (one row per category) and two columns (category name and sum of sales)
+        result = connection.execute_list_query(query)

+        # Iterate over all rows in the result and print the category name and the sum of sales for that category
+        for row in result:
+            print(f"{row[0]}: {row[1]} USD")
diff --git a/Community-Supported/native-s3/requirements.txt b/Community-Supported/native-s3/requirements.txt
new file mode 100644
index 0000000..330d725
--- /dev/null
+++ b/Community-Supported/native-s3/requirements.txt
@@ -0,0 +1 @@
+tableauhyperapi>=0.0.14946
\ No newline at end of file
diff --git a/Community-Supported/s3-to-hyper/README.md b/Community-Supported/s3-to-hyper/README.md
index 7264961..d9070c4 100644
--- a/Community-Supported/s3-to-hyper/README.md
+++ b/Community-Supported/s3-to-hyper/README.md
@@ -14,6 +14,7 @@ This sample demonstrates how to, with little modification, leverage the Hyper AP
 It should serve as a starting point for anyone looking to automate the publishing process of datasources based on contents of S3 buckets. The advantage of leveraging this sample is that an end user should not need to open the Python script, instead simply edit the configuration file and the code handles the rest automatically.

+**Note:** As an alternative to using Boto3, you can also check whether [Hyper's native S3 capabilities](https://github.com/tableau/hyper-api-samples/tree/main/Community-Supported/native-s3/README.md) are applicable to your use case for ingesting data from AWS S3 into Hyper.

 # Get started