-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.py
98 lines (78 loc) · 3.27 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import streamlit as st
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import GreaterThan
import plotly.express as px
import os
os.environ['AWS_DEFAULT_REGION'] = os.getenv('AWS_DEFAULT_REGION')
os.environ['AWS_ACCESS_KEY_ID'] = os.getenv('AWS_ACCESS_KEY_ID')
os.environ['AWS_SECRET_ACCESS_KEY'] = os.getenv('AWS_SECRET_ACCESS_KEY')
# Set up Streamlit app
st.set_page_config(page_title="PyIceberg Streamlit Example", layout="wide")
# Load db & tables from Iceberg catalog
st.title(':blue[Apache Iceberg] and :orange[DuckDB] :duck:')
st.image('st_news.png', width=1500)
st.markdown("<h3></h3><h3></h3>", unsafe_allow_html=True)
st.markdown("""
This Streamlit app is designed to play around with the Apache Iceberg Table format. You can
further run your own SQL queries & analyze data using DuckDB
""")
st.markdown("<h3></h3>", unsafe_allow_html=True)
catalog = load_catalog("glue", **{"type": "glue"})
st.subheader('Enter your :blue[Apache Iceberg] Table Name')
db = st.text_input('', 'test' + '.' + 'churn')
ice_table = catalog.load_table(db)
table_iden = ice_table.identifier
table_name = table_iden[1]
@st.cache_data
def iceberg_func(db):
scan = ice_table.scan()
scan = scan.to_pandas()
return scan
if table_name=='churn':
scan_new = iceberg_func(db)
st.write(scan_new)
st.write(f"Number of rows: {scan_new.shape[0]}")
st.markdown("<h3></h3>", unsafe_allow_html=True)
churn_count = scan_new['Churn'].value_counts().reset_index()
churn_count.columns = ['Churn', 'Count']
col1, col2 = st.columns(2)
with col1:
state_churn_count = scan_new.groupby(['State', 'Churn']).size().reset_index()
state_churn_count.columns = ['State', 'Churn', 'Count']
fig1 = px.bar(state_churn_count, x='State', y='Count', color='Churn')
st.plotly_chart(fig1, use_container_width=True)
with col2:
fig2 = px.scatter(scan_new, x='Total_day_charge', y='Total_night_charge', color='Churn')
st.plotly_chart(fig2)
col3, col4 = st.columns(2)
with col3:
fig3 = px.bar(churn_count, x='Churn', y='Count', color='Churn')
st.plotly_chart(fig3, use_container_width=True)
with col4:
fig4 = px.box(scan_new, x='Churn', y='Account_length')
st.plotly_chart(fig4, use_container_width=True)
else:
scan_new = iceberg_func(db)
st.write(scan_new)
st.write(f"Number of rows: {scan_new.shape[0]}")
st.markdown("<h3></h3>", unsafe_allow_html=True)
product_count = scan_new['product'].value_counts().reset_index()
product_count.columns = ['product', 'Count']
fig5 = px.bar(product_count, x='product', y='Count')
st.plotly_chart(fig5)
# Expression operator:
# filter = st.text_input('Account Length greater than? ', '')
# sc_new = ice_table.scan(row_filter=GreaterThan("Account_length", filter)).to_pandas()
# Use the table scan to convert it into an in-memory DuckDB table
con = ice_table.scan().to_duckdb(table_name=table_name)
# Running a query using DuckDB with aggregations
st.subheader('Enter your :orange[Duck DB] Query')
query = st.text_input('', '')
if query=='':
print("no value entered")
else:
duck_val = con.execute(
query
).df()
st.write(duck_val)
# "SELECT Churn , AVG(CAST(Customer_service_calls AS INT)) AS avg_calls FROM churn GROUP BY Churn"