-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
51 lines (36 loc) · 980 Bytes
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
"""
Main entry point for HR_Attrition analysis
"""
from mylib.lib import (
start_spark,
end_spark,
extract,
load_data,
describe,
example_transform,
log_output,
reset_log, # reset_log
)
def main():
    """Run the HR_Attrition pipeline from start to finish.

    Resets the log, calls ``extract()``, starts a Spark session named
    "HR_Attrition", loads the data through ``load_data``, then describes it,
    runs the Attrition query for "Yes" and for "No", and applies
    ``example_transform``. Once the session has been started it is always
    handed to ``end_spark``, even when one of the steps raises.
    """
    reset_log()  # Clear log at the beginning to avoid duplication
    extract()
    spark = start_spark("HR_Attrition")
    try:
        df = load_data(spark)
        describe(df)
        for attrition_value in ("Yes", "No"):
            run_query(spark, df, attrition_value)
        example_transform(df)
    finally:
        # Shut the session down on failure too, so a crashed run does not
        # leave a Spark session behind; the exception still propagates.
        end_spark(spark)
def run_query(spark, df, attrition_value):
    """Select the rows with the given Attrition value, log them, and show them."""
    # Register the DataFrame under the view name the SQL text refers to.
    df.createOrReplaceTempView("HR_Attrition")

    sql_text = f"SELECT * FROM HR_Attrition WHERE Attrition = '{attrition_value}'"
    matches = spark.sql(sql_text)

    # The log entry carries a heading, the rows as a markdown table, and the
    # query that produced them.
    heading = f"Results for Attrition = '{attrition_value}'"
    markdown_table = matches.toPandas().to_markdown()
    log_output(heading, markdown_table, sql_text)

    matches.show(truncate=False)
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()