fixed test_load

nogibjj · Oct 20, 2024 · f661303 · f661303
1 parent 5c66447
commit f661303
Show file tree

Hide file tree

Showing 5 changed files with 106 additions and 47 deletions.
diff --git a/README.md b/README.md
@@ -1,25 +1,30 @@
 [![CI](https://github.com/nogibjj/Mobasserul_Haque_MiniProject5/actions/workflows/cicd.yml/badge.svg)](https://github.com/nogibjj/Mobasserul_Haque_MiniProject5/actions/workflows/cicd.yml)
 
-# Airline Safety Database ETL and Query Tool
+# Graduate Employment Salary ETL Query Pipeline using Databricks
 
-This project provides an ETL (Extract, Transform, Load) and querying tool for managing and analyzing the Airline Safety Database. It is built using Python and SQLite, enabling users to perform various operations on airline safety records, including extraction, loading, updating, deleting, creating, and querying records.
+This project provides an ETL (Extract, Transform, Load) and querying tool designed to analyze critical employment statistics for both undergraduate and graduate students. The analysis focuses on employment rates, unemployment rates, and salary premiums, leveraging data from the **RecentGradsDB** and **GradStudentsDB** datasets.
+
+The pipeline is built using Python and Databricks, offering users the capability to efficiently extract data from various sources, transform and clean it for analysis, and load it into a Databricks table for further processing. Users can perform complex SQL queries that utilize JOINs, aggregations, filtering, and sorting to gain insights into employment trends, average salaries, and the effectiveness of various degree programs in securing employment for graduates.
+
+By utilizing this pipeline, educators, policymakers, and students can better understand the labor market dynamics and the value of different degrees, ultimately aiding in informed decision-making regarding education and career paths.
 
 ## Features
 
 - **ETL Operations**: 
-  - Extract data from a source.
-  - Transform and load data into the SQLite database.
+  - Extract data from CSV files.
+  - Transform and load data into Databricks tables for analysis.
 
-- **Query Operations**:
-  - Update existing records in the database.
-  - Delete records based on a unique identifier.
-  - Create new records in the database.
-  - Execute custom SQL queries.
-  - Read a limited number of records from the database.
+- **Data Transformation**: Cleaning and preprocessing of data to ensure consistency and accuracy, including handling missing values and converting data types.
 
+- **Data Loading**: Efficient loading of transformed data into a Databricks table, enabling scalable querying and analysis.
+
+- **Query Operations**:
+  - Execute complex SQL queries using JOINs, GROUP BY, HAVING, and UNION.
+  - Filter and sort data by employment rates, salary differences, and other attributes.
+
 - **Logging and Output**:
-  - All executed queries are logged in a markdown file for reference.
-  - Query results are outputted in a formatted markdown file for easier readability.
+  - Query results are output in a structured format for easy interpretation.
+  - Errors and exceptions are logged during ETL and querying processes.
 
 ## Directory Structure
 
@@ -30,7 +35,8 @@ This project provides an ETL (Extract, Transform, Load) and querying tool for ma
 ├── .github/
 │   └── workflows/cicd.yml
 ├── data/
-│   └── airline_safety.csv
+│   ├── recent-grads.csv
+│   └── grad-students.csv
 ├── myLib/
 │   ├── __init__.py
 │   ├── __pycache__/

diff --git a/myLib/__pycache__/transform_load.cpython-312.pyc b/myLib/__pycache__/transform_load.cpython-312.pyc
diff --git a/myLib/transform_load.py b/myLib/transform_load.py
@@ -4,7 +4,7 @@
 from dotenv import load_dotenv
 
 def load_data(recent_grads_path="data/recent-grads.csv", grad_students_path="data/grad-students.csv"):
-    print("Starting the data loading process...")
+    print("Loading data to Databricks...")  # Add this line
 
     # Load recent_grads.csv
     try:

diff --git a/query_log.md b/query_log.md
diff --git a/test_main.py b/test_main.py
@@ -1,63 +1,72 @@
-"""
-Test for ETL and complex SQL queries
-"""
-
 import subprocess
 
 
 def test_extract():
-    """tests extract()"""
+    """Test extractData()"""
     result = subprocess.run(
         ["python", "main.py", "extract"],
         capture_output=True,
         text=True,
         check=True,
     )
-    assert result.returncode == 0
-    assert "Extracting data..." in result.stdout
+    assert (
+        result.returncode == 0
+    ), f"Extract failed with return code {result.returncode}"
+    assert (
+        "Extracting data..." in result.stdout
+    ), "Expected 'Extracting data...' in output"
+    print("Extract Test Passed!")
 
 
-def test_transform_load():
-    """tests transform_load"""
+def test_load():
+    """Test loadData()"""
     result = subprocess.run(
         ["python", "main.py", "load"],
         capture_output=True,
         text=True,
         check=True,
-    )
-    assert result.returncode == 0
-    assert "Transforming data..." in result.stdout
+    )  
+
+    if result.returncode != 0:
+        print(f"Load failed with return code {result.returncode}")
+        print(f"Error output: {result.stderr}")  # Print the error output
+        assert result.returncode == 0  # Reassert to ensure the test fails
+
+    assert (
+        "Loading data to Databricks..." in result.stdout
+    ), "Expected 'Loading data to Databricks...' in output"
+    print("Load Test Passed!")
 
 
 def test_general_query():
-    """tests general_query"""
+    """Test general_query() with a complex SQL query"""
+    query_string = """
+        SELECT 
+            rg.Major, 
+            rg.Employed AS Undergrad_Employed, 
+            gs.Grad_employed AS Grad_Employed,
+            rg.Unemployment_rate AS Undergrad_Unemployment_Rate,
+            gs.Grad_unemployment_rate AS Grad_Unemployment_Rate,
+            (gs.Grad_median - rg.Median) AS Salary_Premium
+        FROM RecentGradsDB rg
+        JOIN GradStudentsDB gs ON rg.Major_code = gs.Major_code
+        WHERE rg.Unemployment_rate < 0.05  
+          AND gs.Grad_unemployment_rate < 0.05  
+        ORDER BY Salary_Premium DESC
+        LIMIT 5;
+    """
+
     result = subprocess.run(
-        [
-            "python",
-            "main.py",
-            "query",
-            """SELECT 
-                rg.Major, 
-                rg.Employed AS Undergrad_Employed, 
-                gs.Grad_employed AS Grad_Employed,
-                rg.Unemployment_rate AS Undergrad_Unemployment_Rate,
-                gs.Grad_unemployment_rate AS Grad_Unemployment_Rate,
-                (gs.Grad_median - rg.Median) AS Salary_Premium
-            FROM RecentGradsDB rg
-            JOIN GradStudentsDB gs
-                ON rg.Major_code = gs.Major_code
-            WHERE rg.Unemployment_rate < 0.05  -- High undergraduate employment rate
-              AND gs.Grad_unemployment_rate < 0.05  -- High graduate employment rate
-            ORDER BY Salary_Premium DESC;"""
-        ],
+        ["python", "main.py", "query", query_string],
         capture_output=True,
         text=True,
         check=True,
     )
-    assert result.returncode == 0
+    assert result.returncode == 0, f"Query failed with return code {result.returncode}"
+    print("General Query Test Passed!")
 
 
 if __name__ == "__main__":
     test_extract()
-    test_transform_load()
+    test_load()
     test_general_query()