# Databricks notebook source
# DBTITLE 1,Authentication credentials to read events from PubSub
client_id_secret = dbutils.secrets.get(scope = "gcp-pubsub", key = "client_id_1")
client_email_secret = dbutils.secrets.get(scope = "gcp-pubsub", key = "client_email_1")
private_key_secret = dbutils.secrets.get(scope = "gcp-pubsub", key = "private_key_1")
private_key_id_secret = dbutils.secrets.get(scope = "gcp-pubsub", key = "private_key_id_1")

authOptions = {"client_id": client_id_secret,
               "client_email": client_email_secret,
               "private_key": private_key_secret,
               "private_key_id": private_key_id_secret}

# COMMAND ----------
# DBTITLE 1,Spark structured streaming ingestion from PubSub topic
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import *

shipingInputDF = spark.readStream.format("pubsub") \
    .option("subscriptionId", "shipment_subscription") \
    .option("topicId", "shipping-notification") \
    .option("projectId", "datamesh-2") \
    .option("numFetchPartitions", "3") \
    .options(**authOptions) \
    .load()

# COMMAND ----------
# DBTITLE 1,Schema for the shipping events
shipingDetailsSchema = (
    StructType()
    .add("shipmentId", "string")
    .add("orderId", "string")
    .add("paymentId", "string")
    .add("userId", "string")
    .add("firstName", "string")
    .add("lastName", "string")
    .add("address", "string")
    .add("emailId", "string")
    .add("mobileNumber", "string")
    .add("productId", "string")
    .add("brand", "string")
    .add("quantity", "integer")
    .add("basePrice", "float")
    .add("subTotal", "float")
    .add("total", "float")
    .add("tax", "float")
    .add("totalTax", "float")
)

# COMMAND ----------
# DBTITLE 1,Spark data frame for the input shipping event
shipingDetailDF = (
    shipingInputDF
    .select(
        from_json(
            col("payload").cast("string"),
            shipingDetailsSchema
        )
        .alias("shipingdata")
    )
    .select(
        "shipingdata.shipmentId",
        "shipingdata.orderId",
        "shipingdata.paymentId",
        "shipingdata.userId",
        "shipingdata.firstName",
        "shipingdata.lastName",
        "shipingdata.address",
        "shipingdata.emailId",
        "shipingdata.mobileNumber",
        "shipingdata.productId",
        "shipingdata.brand",
        "shipingdata.quantity",
        "shipingdata.basePrice",
        "shipingdata.subTotal",
        "shipingdata.total",
        "shipingdata.tax",
        "shipingdata.totalTax"
    )
)

# COMMAND ----------
# DBTITLE 1,Writing streaming raw shipping data frame to the delta lake table (Bronze table)
shipingDetailDF.writeStream.format("delta") \
    .outputMode("append") \
    .partitionBy("brand") \
    .option("checkpointLocation", "/dbfs/pubsub-shippment-checkpoint-38/") \
    .trigger(processingTime = '3 seconds') \
    .table("main.car_demo_data_lake.shipping_bronze")

# COMMAND ----------
# DBTITLE 1,Reading streaming shipment events from the bronze table
silverDF = spark.readStream.table("main.car_demo_data_lake.shipping_bronze")

# COMMAND ----------
# DBTITLE 1,Creating encryption key
from cryptography.fernet import Fernet

encryptionKey = Fernet.generate_key()

# COMMAND ----------
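# DBTITLE 1,Note on key management (optional sketch)
# Fernet.generate_key() above produces a fresh key on every run, so ciphertext
# written by earlier runs could not be decrypted later. A minimal alternative,
# shown only as a sketch, is to keep one key in a Databricks secret scope and
# read it here; the scope and key names below are hypothetical placeholders.
# encryptionKey = dbutils.secrets.get(scope = "encryption", key = "fernet_key").encode("utf-8")

# COMMAND ----------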
# DBTITLE 1,Create Spark UDFs in python for encrypting a value
def encrypt_val(clear_text, MASTER_KEY):
    # Encrypt a single value with the given Fernet key and return the ciphertext as text
    from cryptography.fernet import Fernet
    f = Fernet(MASTER_KEY)
    clear_text_b = bytes(clear_text, 'utf-8')
    cipher_text = f.encrypt(clear_text_b)
    cipher_text = str(cipher_text.decode('ascii'))
    return cipher_text

# COMMAND ----------
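# DBTITLE 1,Matching decrypt UDF (optional sketch)
# A minimal sketch of the inverse operation, assuming the same Fernet key used
# for encryption is available. It is not used by the pipeline below; it only
# illustrates how the encrypted PII values could be read back where permitted.
def decrypt_val(cipher_text, MASTER_KEY):
    from cryptography.fernet import Fernet
    f = Fernet(MASTER_KEY)
    clear_text_b = f.decrypt(bytes(cipher_text, 'utf-8'))
    return clear_text_b.decode('utf-8')

# COMMAND ----------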
# DBTITLE 1,Use the UDF in a dataframe to encrypt the PII columns
from pyspark.sql.functions import udf, lit, md5
from pyspark.sql.types import StringType

encrypt = udf(encrypt_val, StringType())

encryptedDF = silverDF.withColumn("userId", encrypt("userId", lit(encryptionKey))) \
    .withColumn("firstName", encrypt("firstName", lit(encryptionKey))) \
    .withColumn("lastName", encrypt("lastName", lit(encryptionKey))) \
    .withColumn("address", encrypt("address", lit(encryptionKey))) \
    .withColumn("emailId", encrypt("emailId", lit(encryptionKey))) \
    .withColumn("mobileNumber", encrypt("mobileNumber", lit(encryptionKey)))

# COMMAND ----------
# DBTITLE 1,Writing transformed silver data frame to the silver table
encryptedDF.writeStream.format("delta") \
    .outputMode("append") \
    .option("checkpointLocation", "/dbfs/pubsub-shippment-sliver-checkpoint-38/") \
    .partitionBy("brand") \
    .trigger(processingTime = '2 seconds') \
    .table("main.car_demo_data_lake.shipping_sliver")

# COMMAND ----------
# DBTITLE 1,Reading streaming data frame from silver table
goldDF = spark.readStream \
    .table("main.car_demo_data_lake.shipping_sliver")

# COMMAND ----------

# DBTITLE 1,Aggregate shipped car quantity and selling price for each brand
group_cols = ["brand"]
vechileGoldDF = goldDF.groupBy(group_cols) \
    .agg(sum("quantity").alias("total_quantity_shipped"), sum("subTotal").alias("total_selling_price_inr"))

# COMMAND ----------
# DBTITLE 1,Writing aggregated result to Gold table
(
    vechileGoldDF.writeStream
    .format("delta")
    .outputMode("complete")
    .partitionBy("brand")
    .option("checkpointLocation", "/dbfs/pubsub-shipping-gold-38/")
    .trigger(processingTime = '1 seconds')
    .table("main.car_demo_all_brands.shipping_data_gold")
)
# Car demo data infrastructure and analytics

For analytical purposes, the insightful data generated by the microservices is brought into the Databricks
Lakehouse platform.
The microservices publish data to Pub/Sub topics. A streaming data pipeline receives the streaming events from
Cloud Pub/Sub and writes them to several Databricks Delta Lake tables. The pipeline follows the medallion architecture:
a Bronze table keeps the raw data; some basic transformations are applied to the raw data, mainly encryption of the PII columns,
and the result is written to the Silver table; an aggregation is then computed from the Silver table, and the aggregated
result is written to the Gold table, which is used for visualization and reporting.

![img.png](img.png)

A streaming pipeline ([PubSub-shipping-ingestion.py](PubSub-shipping-ingestion.py)) built with Spark Structured Streaming reads shipment events from the Pub/Sub topic
and writes them to the different Delta Lake tables.
### Prerequisite
1. A Databricks workspace created with the data plane residing in the GCP account that runs Cloud Pub/Sub and the other services.
   Follow the [document](https://docs.gcp.databricks.com/en/administration-guide/workspace/create-workspace.html) to create a workspace on GCP.
2. Create a [Unity Catalog metastore](https://docs.gcp.databricks.com/en/data-governance/unity-catalog/create-metastore.html).
   Note: Unity Catalog provides centralized access control, auditing, and data discovery capabilities across Databricks workspaces,
   so Unity Catalog is used for data governance.
   ![img_1.png](img_1.png)
3. All the microservices are deployed on GKE and publish events to a Pub/Sub topic. The streaming pipeline ([PubSub-shipping-ingestion.py](PubSub-shipping-ingestion.py))
   reads events from the shipping-notification topic through a subscription, so the topic and the subscription must be created beforehand.
4. GCP credentials are needed to connect Databricks to GCP Pub/Sub. The credentials should be stored in the Databricks workspace as secrets.
   Follow this [document](https://docs.gcp.databricks.com/en/security/secrets/secret-scopes.html) to create a secret scope. The following credentials, taken from the GCP service account JSON key, are read from the secret scope:
   ```
   client_id_secret = dbutils.secrets.get(scope = "gcp-pubsub", key = "client_id")
   client_email_secret = dbutils.secrets.get(scope = "gcp-pubsub", key = "client_email")
   private_key_secret = dbutils.secrets.get(scope = "gcp-pubsub", key = "private_key")
   private_key_id_secret = dbutils.secrets.get(scope = "gcp-pubsub", key = "private_key_id")
   ```
5. Create a small [compute cluster](https://docs.gcp.databricks.com/en/compute/configure.html) on Databricks.
6. For visualization, install Power BI Desktop on your machine.
   To connect Power BI Desktop to a Databricks cluster, follow this [document](https://docs.gcp.databricks.com/en/partners/bi/power-bi.html).
### How to run
1. Import the pipeline file ([PubSub-shipping-ingestion.py](PubSub-shipping-ingestion.py)) into a Databricks notebook.
   Follow the [document](https://docs.gcp.databricks.com/en/notebooks/notebook-export-import.html#import-a-notebook) to import it.
2. [Attach](https://docs.gcp.databricks.com/en/notebooks/notebook-ui.html#attach) the cluster created earlier to the notebook.
3. [Run](https://docs.gcp.databricks.com/en/notebooks/run-notebook.html) the notebook.
### Result

The pipeline creates three tables in Unity Catalog:
a) main.car_demo_data_lake.shipping_bronze
b) main.car_demo_data_lake.shipping_sliver
c) main.car_demo_all_brands.shipping_data_gold

Note: these Delta Lake tables are created automatically.

![img_2.png](img_2.png)
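As an optional sanity check, once the streams are running the gold table can be queried from a notebook cell; the column names below come from the aggregation step in the pipeline notebook:
```
display(
    spark.table("main.car_demo_all_brands.shipping_data_gold")
         .select("brand", "total_quantity_shipped", "total_selling_price_inr")
         .orderBy("total_quantity_shipped", ascending=False)
)
```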
The gold table is connected to Power BI Desktop for visualization.
![img_3.png](img_3.png)

#### Data governance
All data assets (tables, views, databases) are registered in Unity Catalog. So granting and revoking permissions for a user
or a service principal is done in Databricks Unity Catalog, and data discovery is also done there.
![img_4.png](img_4.png)
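For example, a grant on the gold table could be issued from a notebook cell as sketched below; the data_analysts group name is only a placeholder:
```
spark.sql("""
  GRANT SELECT ON TABLE main.car_demo_all_brands.shipping_data_gold
  TO `data_analysts`
""")
```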