-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #9 from kameshsampath/de-snowpark-py-update
(fix): Simplify and Context
- Loading branch information
Showing 5 changed files with 125 additions and 62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,70 +1,97 @@ | ||
#------------------------------------------------------------------------------ | ||
# ------------------------------------------------------------------------------ | ||
# Hands-On Lab: Data Engineering with Snowpark | ||
# Script: 02_load_raw.py | ||
# Author: Jeremiah Hansen, Caleb Baechtold | ||
# Last Updated: 1/9/2023 | ||
#------------------------------------------------------------------------------ | ||
# ------------------------------------------------------------------------------ | ||
|
||
import time | ||
from snowflake.snowpark import Session | ||
|
||
|
||
# Tables found under the "pos" folder of the raw stage.
POS_TABLES = [
    "country",
    "franchise",
    "location",
    "menu",
    "truck",
    "order_header",
    "order_detail",
]

# Tables found under the "customer" folder of the raw stage.
CUSTOMER_TABLES = ["customer_loyalty"]

# Maps each stage folder to the schema its tables are loaded into and the
# list of tables to load from that folder.
TABLE_DICT = {
    "pos": {"schema": "RAW_POS", "tables": POS_TABLES},
    "customer": {"schema": "RAW_CUSTOMER", "tables": CUSTOMER_TABLES},
}
|
||
# SNOWFLAKE ADVANTAGE: Schema detection | ||
# SNOWFLAKE ADVANTAGE: Data ingestion with COPY | ||
# SNOWFLAKE ADVANTAGE: Snowflake Tables (not file-based) | ||
|
||
|
||
def load_raw_table(session, tname=None, s3dir=None, year=None, schema=None):
    """Copy one raw Parquet data set from the external stage into a table.

    Args:
        session: Snowpark session used to read the stage and write the table.
        tname: Name of the target table; also the folder name under ``s3dir``.
        s3dir: Top-level folder inside ``@external.frostbyte_raw_stage``.
        year: Optional ``year=<year>`` sub-folder to load. When ``None`` the
            whole table folder is loaded.
        schema: Schema made current on the session before loading.
    """
    session.use_schema(schema)

    # Build the stage path once; the year partition is an optional suffix.
    location = "@external.frostbyte_raw_stage/{}/{}".format(s3dir, tname)
    if year is not None:
        print("\tLoading year {}".format(year))
        location = "{}/year={}".format(location, year)

    # we can infer schema using the parquet read option
    df = session.read.option("compression", "snappy").parquet(location)
    df.copy_into_table("{}".format(tname))
|
||
|
||
# SNOWFLAKE ADVANTAGE: Warehouse elasticity (dynamic scaling) | ||
|
||
|
||
def load_all_raw_tables(session):
    """Load every table listed in ``TABLE_DICT`` from the raw stage.

    The ``HOL_WH`` warehouse is resized to XLARGE before loading and resized
    back to XSMALL afterwards. The resize back happens in a ``finally`` block
    so a failed load does not leave the warehouse at the larger size; the
    original exception still propagates to the caller.

    Args:
        session: Snowpark session used to run the SQL and the loads.
    """
    # Only load the first 3 years of data for the order tables at this point
    # We will load the 2022 data later in the lab
    yearly_tables = ("order_header", "order_detail")
    initial_years = ("2019", "2020", "2021")

    session.sql(
        "ALTER WAREHOUSE HOL_WH SET WAREHOUSE_SIZE = XLARGE WAIT_FOR_COMPLETION = TRUE"
    ).collect()

    try:
        for s3dir, data in TABLE_DICT.items():
            schema = data["schema"]
            for tname in data["tables"]:
                print("Loading {}".format(tname))
                if tname in yearly_tables:
                    for year in initial_years:
                        load_raw_table(
                            session, tname=tname, s3dir=s3dir, year=year, schema=schema
                        )
                else:
                    load_raw_table(session, tname=tname, s3dir=s3dir, schema=schema)
    finally:
        # Always scale back down, even when a load raised.
        session.sql("ALTER WAREHOUSE HOL_WH SET WAREHOUSE_SIZE = XSMALL").collect()
|
||
|
||
def validate_raw_tables(session):
    """Print the column names of every loaded raw table.

    Iterates ``TABLE_DICT`` so the schema/table pairs checked here are the
    same ones the loader writes to (``RAW_POS`` tables first, then
    ``RAW_CUSTOMER``).

    Args:
        session: Snowpark session used to look up each table.
    """
    # check column names from the inferred schema
    for data in TABLE_DICT.values():
        schema = data["schema"]
        for tname in data["tables"]:
            qualified_name = "{}.{}".format(schema, tname)
            print("{}: \n\t{}\n".format(tname, session.table(qualified_name).columns))
|
||
|
||
# For local debugging
if __name__ == "__main__":
    # Create a local Snowpark session; the context manager closes it on exit
    with Session.builder.getOrCreate() as session:
        # Set the right database context to use
        session.use_database("HOL_DB")
        load_all_raw_tables(session)
        validate_raw_tables(session)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
snowflake-snowpark-python | ||
snowflake-snowpark-python | ||
snowflake |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters