diff --git a/README.md b/README.md index 8ebc8b2..bab0be9 100644 --- a/README.md +++ b/README.md @@ -1,25 +1,30 @@ [![CI](https://github.com/nogibjj/Mobasserul_Haque_MiniProject5/actions/workflows/cicd.yml/badge.svg)](https://github.com/nogibjj/Mobasserul_Haque_MiniProject5/actions/workflows/cicd.yml) -# Airline Safety Database ETL and Query Tool +# Graduate Employment Salary ETL Query Pipeline using Databricks -This project provides an ETL (Extract, Transform, Load) and querying tool for managing and analyzing the Airline Safety Database. It is built using Python and SQLite, enabling users to perform various operations on airline safety records, including extraction, loading, updating, deleting, creating, and querying records. +This project provides an ETL (Extract, Transform, Load) and querying tool designed to analyze critical employment statistics for both undergraduate and graduate students. The analysis focuses on employment rates, unemployment rates, and salary premiums, leveraging data from the **RecentGradsDB** and **GradStudentsDB** datasets. + +The pipeline is built using Python and Databricks, offering users the capability to efficiently extract data from various sources, transform and clean it for analysis, and load it into a Databricks table for further processing. Users can perform complex SQL queries that utilize JOINs, aggregations, filtering, and sorting to gain insights into employment trends, average salaries, and the effectiveness of various degree programs in securing employment for graduates. + +By utilizing this pipeline, educators, policymakers, and students can better understand the labor market dynamics and the value of different degrees, ultimately aiding in informed decision-making regarding education and career paths. ## Features - **ETL Operations**: - - Extract data from a source. - - Transform and load data into the SQLite database. + - Extract data from CSV files. + - Transform and load data into Databricks tables for analysis. -- **Query Operations**: - - Update existing records in the database. - - Delete records based on a unique identifier. - - Create new records in the database. - - Execute custom SQL queries. - - Read a limited number of records from the database. +- **Data Transformation**: Cleaning and preprocessing of data to ensure consistency and accuracy, including handling missing values and converting data types. +- **Data Loading**: Efficient loading of transformed data into a Databricks table, enabling scalable querying and analysis. + +- **Query Operations**: + - Execute complex SQL queries using JOINs, GROUP BY, HAVING, and UNION. + - Filter and sort data by employment rates, salary differences, and other attributes. + - **Logging and Output**: - - All executed queries are logged in a markdown file for reference. - - Query results are outputted in a formatted markdown file for easier readability. + - Query results are outputted in a structured format for easy interpretation. + - Errors and exceptions are logged during ETL and querying processes. ## Directory Structure @@ -30,7 +35,8 @@ This project provides an ETL (Extract, Transform, Load) and querying tool for ma ├── .github/ │ └── workflows/cicd.yml ├── data/ -│ └── airline_safety.csv +│ ├── airline_safety.csv + └── airline_safety.csv ├── myLib/ │ ├── __init__.py │ ├── __pycache__/ diff --git a/myLib/__pycache__/transform_load.cpython-312.pyc b/myLib/__pycache__/transform_load.cpython-312.pyc index 57acaad..1170dad 100644 Binary files a/myLib/__pycache__/transform_load.cpython-312.pyc and b/myLib/__pycache__/transform_load.cpython-312.pyc differ diff --git a/myLib/transform_load.py b/myLib/transform_load.py index c4d5da0..b52d2aa 100644 --- a/myLib/transform_load.py +++ b/myLib/transform_load.py @@ -4,7 +4,7 @@ from dotenv import load_dotenv def load_data(recent_grads_path="data/recent-grads.csv", grad_students_path="data/grad-students.csv"): - print("Starting the data loading process...") + print("Loading data to Databricks...") # Add this line # Load recent_grads.csv try: diff --git a/query_log.md b/query_log.md index 0437fec..e7a03e1 100644 --- a/query_log.md +++ b/query_log.md @@ -42,3 +42,47 @@ SELECT [Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL AND MANUFACTURING ENGINEERING', Undergrad_Employed=15604, Grad_Employed=66432, Undergrad_Unemployment_Rate=0.04287554323673248, Grad_Unemployment_Rate=0.03862462192773819, Salary_Premium=41000), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='INDUSTRIAL PRODUCTION TECHNOLOGIES', Undergrad_Employed=4428, Grad_Employed=14752, Undergrad_Unemployment_Rate=0.028308097273111343, Grad_Unemployment_Rate=0.03927059471607208, Salary_Premium=38500), Row(Major='MEDICAL ASSISTING SERVICES', Undergrad_Employed=9168, Grad_Employed=14499, Undergrad_Unemployment_Rate=0.04250652715563774, Grad_Unemployment_Rate=0.025342833250761032, Salary_Premium=38000), Row(Major='MEDICAL ASSISTING SERVICES', Undergrad_Employed=9168, Grad_Employed=14499, Undergrad_Unemployment_Rate=0.04250652715563774, Grad_Unemployment_Rate=0.025342833250761032, Salary_Premium=38000), Row(Major='MEDICAL ASSISTING SERVICES', Undergrad_Employed=9168, Grad_Employed=14499, Undergrad_Unemployment_Rate=0.04250652715563774, Grad_Unemployment_Rate=0.025342833250761032, Salary_Premium=38000), Row(Major='MEDICAL ASSISTING SERVICES', Undergrad_Employed=9168, Grad_Employed=14499, Undergrad_Unemployment_Rate=0.04250652715563774, Grad_Unemployment_Rate=0.025342833250761032, Salary_Premium=38000), Row(Major='MEDICAL ASSISTING SERVICES', Undergrad_Employed=9168, Grad_Employed=14499, Undergrad_Unemployment_Rate=0.04250652715563774, Grad_Unemployment_Rate=0.025342833250761032, Salary_Premium=38000), Row(Major='MEDICAL ASSISTING SERVICES', Undergrad_Employed=9168, Grad_Employed=14499, Undergrad_Unemployment_Rate=0.04250652715563774, Grad_Unemployment_Rate=0.025342833250761032, Salary_Premium=38000), Row(Major='MEDICAL ASSISTING SERVICES', Undergrad_Employed=9168, Grad_Employed=14499, Undergrad_Unemployment_Rate=0.04250652715563774, Grad_Unemployment_Rate=0.025342833250761032, Salary_Premium=38000), Row(Major='MEDICAL ASSISTING SERVICES', Undergrad_Employed=9168, Grad_Employed=14499, Undergrad_Unemployment_Rate=0.04250652715563774, Grad_Unemployment_Rate=0.025342833250761032, Salary_Premium=38000), Row(Major='MEDICAL ASSISTING SERVICES', Undergrad_Employed=9168, Grad_Employed=14499, Undergrad_Unemployment_Rate=0.04250652715563774, Grad_Unemployment_Rate=0.025342833250761032, Salary_Premium=38000), Row(Major='MEDICAL ASSISTING SERVICES', Undergrad_Employed=9168, Grad_Employed=14499, Undergrad_Unemployment_Rate=0.04250652715563774, Grad_Unemployment_Rate=0.025342833250761032, Salary_Premium=38000), Row(Major='MEDICAL ASSISTING SERVICES', Undergrad_Employed=9168, Grad_Employed=14499, Undergrad_Unemployment_Rate=0.04250652715563774, Grad_Unemployment_Rate=0.025342833250761032, Salary_Premium=38000), Row(Major='MEDICAL ASSISTING SERVICES', Undergrad_Employed=9168, Grad_Employed=14499, Undergrad_Unemployment_Rate=0.04250652715563774, Grad_Unemployment_Rate=0.025342833250761032, Salary_Premium=38000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='NURSING', Undergrad_Employed=180903, Grad_Employed=437115, Undergrad_Unemployment_Rate=0.04486272484064102, Grad_Unemployment_Rate=0.02126673422753811, Salary_Premium=36000), Row(Major='MEDICAL TECHNOLOGIES TECHNICIANS', Undergrad_Employed=13150, Grad_Employed=37639, Undergrad_Unemployment_Rate=0.036982789635658264, Grad_Unemployment_Rate=0.025552736595273018, Salary_Premium=31000), Row(Major='MEDICAL TECHNOLOGIES TECHNICIANS', Undergrad_Employed=13150, Grad_Employed=37639, Undergrad_Unemployment_Rate=0.036982789635658264, Grad_Unemployment_Rate=0.025552736595273018, Salary_Premium=31000), Row(Major='MEDICAL TECHNOLOGIES TECHNICIANS', Undergrad_Employed=13150, Grad_Employed=37639, Undergrad_Unemployment_Rate=0.036982789635658264, Grad_Unemployment_Rate=0.025552736595273018, Salary_Premium=31000), Row(Major='MEDICAL TECHNOLOGIES TECHNICIANS', Undergrad_Employed=13150, Grad_Employed=37639, Undergrad_Unemployment_Rate=0.036982789635658264, Grad_Unemployment_Rate=0.025552736595273018, Salary_Premium=31000), Row(Major='MEDICAL TECHNOLOGIES TECHNICIANS', Undergrad_Employed=13150, Grad_Employed=37639, Undergrad_Unemployment_Rate=0.036982789635658264, Grad_Unemployment_Rate=0.025552736595273018, Salary_Premium=31000), Row(Major='MEDICAL TECHNOLOGIES TECHNICIANS', Undergrad_Employed=13150, Grad_Employed=37639, Undergrad_Unemployment_Rate=0.036982789635658264, Grad_Unemployment_Rate=0.025552736595273018, Salary_Premium=31000), Row(Major='MEDICAL TECHNOLOGIES TECHNICIANS', Undergrad_Employed=13150, Grad_Employed=37639, Undergrad_Unemployment_Rate=0.036982789635658264, Grad_Unemployment_Rate=0.025552736595273018, Salary_Premium=31000), Row(Major='MEDICAL TECHNOLOGIES TECHNICIANS', Undergrad_Employed=13150, Grad_Employed=37639, Undergrad_Unemployment_Rate=0.036982789635658264, Grad_Unemployment_Rate=0.025552736595273018, Salary_Premium=31000), Row(Major='MEDICAL TECHNOLOGIES TECHNICIANS', Undergrad_Employed=13150, Grad_Employed=37639, Undergrad_Unemployment_Rate=0.036982789635658264, Grad_Unemployment_Rate=0.025552736595273018, Salary_Premium=31000), Row(Major='MEDICAL TECHNOLOGIES TECHNICIANS', Undergrad_Employed=13150, Grad_Employed=37639, Undergrad_Unemployment_Rate=0.036982789635658264, Grad_Unemployment_Rate=0.025552736595273018, Salary_Premium=31000), Row(Major='MEDICAL TECHNOLOGIES TECHNICIANS', Undergrad_Employed=13150, Grad_Employed=37639, Undergrad_Unemployment_Rate=0.036982789635658264, Grad_Unemployment_Rate=0.025552736595273018, Salary_Premium=31000), Row(Major='MEDICAL TECHNOLOGIES TECHNICIANS', Undergrad_Employed=13150, Grad_Employed=37639, Undergrad_Unemployment_Rate=0.036982789635658264, Grad_Unemployment_Rate=0.025552736595273018, Salary_Premium=31000), Row(Major='GENERAL AGRICULTURE', Undergrad_Employed=8884, Grad_Employed=28930, Undergrad_Unemployment_Rate=0.019642462953925133, Grad_Unemployment_Rate=0.02932492271065712, Salary_Premium=28000), Row(Major='GENERAL AGRICULTURE', Undergrad_Employed=8884, Grad_Employed=28930, Undergrad_Unemployment_Rate=0.019642462953925133, Grad_Unemployment_Rate=0.02932492271065712, Salary_Premium=28000), Row(Major='GENERAL AGRICULTURE', Undergrad_Employed=8884, Grad_Employed=28930, Undergrad_Unemployment_Rate=0.019642462953925133, Grad_Unemployment_Rate=0.02932492271065712, Salary_Premium=28000), Row(Major='GENERAL AGRICULTURE', Undergrad_Employed=8884, Grad_Employed=28930, Undergrad_Unemployment_Rate=0.019642462953925133, Grad_Unemployment_Rate=0.02932492271065712, Salary_Premium=28000), Row(Major='GENERAL AGRICULTURE', Undergrad_Employed=8884, Grad_Employed=28930, Undergrad_Unemployment_Rate=0.019642462953925133, Grad_Unemployment_Rate=0.02932492271065712, Salary_Premium=28000), Row(Major='GENERAL AGRICULTURE', Undergrad_Employed=8884, Grad_Employed=28930, Undergrad_Unemployment_Rate=0.019642462953925133, Grad_Unemployment_Rate=0.02932492271065712, Salary_Premium=28000), Row(Major='GENERAL AGRICULTURE', Undergrad_Employed=8884, Grad_Employed=28930, Undergrad_Unemployment_Rate=0.019642462953925133, Grad_Unemployment_Rate=0.02932492271065712, Salary_Premium=28000), Row(Major='GENERAL AGRICULTURE', Undergrad_Employed=8884, Grad_Employed=28930, Undergrad_Unemployment_Rate=0.019642462953925133, Grad_Unemployment_Rate=0.02932492271065712, Salary_Premium=28000), Row(Major='GENERAL AGRICULTURE', Undergrad_Employed=8884, Grad_Employed=28930, Undergrad_Unemployment_Rate=0.019642462953925133, Grad_Unemployment_Rate=0.02932492271065712, Salary_Premium=28000), Row(Major='GENERAL AGRICULTURE', Undergrad_Employed=8884, Grad_Employed=28930, Undergrad_Unemployment_Rate=0.019642462953925133, Grad_Unemployment_Rate=0.02932492271065712, Salary_Premium=28000), Row(Major='GENERAL AGRICULTURE', Undergrad_Employed=8884, Grad_Employed=28930, Undergrad_Unemployment_Rate=0.019642462953925133, Grad_Unemployment_Rate=0.02932492271065712, Salary_Premium=28000), Row(Major='GENERAL AGRICULTURE', Undergrad_Employed=8884, Grad_Employed=28930, Undergrad_Unemployment_Rate=0.019642462953925133, Grad_Unemployment_Rate=0.02932492271065712, Salary_Premium=28000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000), Row(Major='COURT REPORTING', Undergrad_Employed=930, Grad_Employed=1008, Undergrad_Unemployment_Rate=0.011689691804349422, Grad_Unemployment_Rate=0.0, Salary_Premium=21000)] ``` +```sql + + SELECT + rg.Major, + rg.Employed AS Undergrad_Employed, + gs.Grad_employed AS Grad_Employed, + rg.Unemployment_rate AS Undergrad_Unemployment_Rate, + gs.Grad_unemployment_rate AS Grad_Unemployment_Rate, + (gs.Grad_median - rg.Median) AS Salary_Premium + FROM RecentGradsDB rg + JOIN GradStudentsDB gs ON rg.Major_code = gs.Major_code + WHERE rg.Unemployment_rate < 0.05 + AND gs.Grad_unemployment_rate < 0.05 + ORDER BY Salary_Premium DESC + LIMIT 5; + +``` + +```response from databricks +[Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000)] +``` + +```sql + + SELECT + rg.Major, + rg.Employed AS Undergrad_Employed, + gs.Grad_employed AS Grad_Employed, + rg.Unemployment_rate AS Undergrad_Unemployment_Rate, + gs.Grad_unemployment_rate AS Grad_Unemployment_Rate, + (gs.Grad_median - rg.Median) AS Salary_Premium + FROM RecentGradsDB rg + JOIN GradStudentsDB gs ON rg.Major_code = gs.Major_code + WHERE rg.Unemployment_rate < 0.05 + AND gs.Grad_unemployment_rate < 0.05 + ORDER BY Salary_Premium DESC + LIMIT 5; + +``` + +```response from databricks +[Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000), Row(Major='OPERATIONS LOGISTICS AND E-COMMERCE', Undergrad_Employed=10027, Grad_Employed=12659, Undergrad_Unemployment_Rate=0.04785870388150215, Grad_Unemployment_Rate=0.02284832112491131, Salary_Premium=44000)] +``` + diff --git a/test_main.py b/test_main.py index 6ec3fdd..8920448 100644 --- a/test_main.py +++ b/test_main.py @@ -1,63 +1,72 @@ -""" -Test for ETL and complex SQL queries -""" - import subprocess def test_extract(): - """tests extract()""" + """Test extractData()""" result = subprocess.run( ["python", "main.py", "extract"], capture_output=True, text=True, check=True, ) - assert result.returncode == 0 - assert "Extracting data..." in result.stdout + assert ( + result.returncode == 0 + ), f"Extract failed with return code {result.returncode}" + assert ( + "Extracting data..." in result.stdout + ), "Expected 'Extracting data...' in output" + print("Extract Test Passed!") -def test_transform_load(): - """tests transform_load""" +def test_load(): + """Test loadData()""" result = subprocess.run( ["python", "main.py", "load"], capture_output=True, text=True, check=True, - ) - assert result.returncode == 0 - assert "Transforming data..." in result.stdout + ) + + if result.returncode != 0: + print(f"Load failed with return code {result.returncode}") + print(f"Error output: {result.stderr}") # Print the error output + assert result.returncode == 0 # Reassert to ensure the test fails + + assert ( + "Loading data to Databricks..." in result.stdout + ), "Expected 'Loading data to Databricks...' in output" + print("Load Test Passed!") def test_general_query(): - """tests general_query""" + """Test general_query() with a complex SQL query""" + query_string = """ + SELECT + rg.Major, + rg.Employed AS Undergrad_Employed, + gs.Grad_employed AS Grad_Employed, + rg.Unemployment_rate AS Undergrad_Unemployment_Rate, + gs.Grad_unemployment_rate AS Grad_Unemployment_Rate, + (gs.Grad_median - rg.Median) AS Salary_Premium + FROM RecentGradsDB rg + JOIN GradStudentsDB gs ON rg.Major_code = gs.Major_code + WHERE rg.Unemployment_rate < 0.05 + AND gs.Grad_unemployment_rate < 0.05 + ORDER BY Salary_Premium DESC + LIMIT 5; + """ + result = subprocess.run( - [ - "python", - "main.py", - "query", - """SELECT - rg.Major, - rg.Employed AS Undergrad_Employed, - gs.Grad_employed AS Grad_Employed, - rg.Unemployment_rate AS Undergrad_Unemployment_Rate, - gs.Grad_unemployment_rate AS Grad_Unemployment_Rate, - (gs.Grad_median - rg.Median) AS Salary_Premium - FROM RecentGradsDB rg - JOIN GradStudentsDB gs - ON rg.Major_code = gs.Major_code - WHERE rg.Unemployment_rate < 0.05 -- High undergraduate employment rate - AND gs.Grad_unemployment_rate < 0.05 -- High graduate employment rate - ORDER BY Salary_Premium DESC;""" - ], + ["python", "main.py", "query", query_string], capture_output=True, text=True, check=True, ) - assert result.returncode == 0 + assert result.returncode == 0, f"Query failed with return code {result.returncode}" + print("General Query Test Passed!") if __name__ == "__main__": test_extract() - test_transform_load() + test_load() test_general_query()