Merge pull request #43 from shyan0903/main

Reorganize structure and change makefile
PANDASANG1231 · Dec 4, 2021 · 221c3a2 · 221c3a2
2 parents 38cdd2c + 2f94e77
commit 221c3a2
Show file tree

Hide file tree

Showing 32 changed files with 4,004 additions and 2,657 deletions.
diff --git a/Makefile b/Makefile
@@ -1,12 +1,13 @@
 # Makefile
-# Allyson Stoll, Dec 2021
+# Allyson Stoll, Irene Yan
+# Dec 4, 2021
 
 # This Makefile runs the full analysis pipeline for a ramen rating model
 # based on ramen ratings from The Ramen Rater: data download, train/test
 # split, EDA figures, preprocessing, model training, prediction, and report.
 
 # Example usage:
-# make data/raw/ramen_ratings.csv
+# make all
 
 # build the final report and everything it depends on
 all : doc/Report.html
@@ -17,57 +18,59 @@ data/raw/ramen_ratings.csv : src/download_data.py
 	--out_file="data/raw/ramen_ratings.csv"
 
 # split the dataset
-data/process/train_df.csv data/process/test_df.csv : data/raw/ramen_ratings.csv src/split.py
-	python3.9 src/split.py --path="data/raw/ramen_ratings.csv" --out_file_train="data/process/train_df.csv" \
-	--out_file_test="data/process/test_df.csv"
+data/processed/train_df.csv data/processed/test_df.csv : data/raw/ramen_ratings.csv src/split.py
+	python3.9 src/split.py --path="data/raw/ramen_ratings.csv" --out_file_train="data/processed/train_df.csv" \
+	--out_file_test="data/processed/test_df.csv"
 
 # add country codes to the dataset
-data/process/train_codes_df.csv : data/process/train_df.csv src/get_countrycode_data.py
-	python3.9 src/get_countrycode_data.py --path="data/process/train_df.csv" \
-	--out_file="data/eda/train_codes_df.csv"
+data/processed/train_codes_df.csv : data/processed/train_df.csv src/get_countrycode_data.py
+	python3.9 src/get_countrycode_data.py --path="data/processed/train_df.csv" \
+	--out_file="data/processed/train_codes_df.csv"
 
 # complete EDA figures
 results/figures/stars_histogram.png results/figures/type_histogram.png \
-results/figures/variety_wordcloud.png results/figures/ramen_map.png : data/eda/train_codes_df.csv src/generate_EDA_figures.py
-	python3.9 src/generate_EDA_figures.py --path="data/eda/train_codes_df.csv" --out_path="results/figures/"
+results/figures/variety_wordcloud.png results/figures/ramen_map.png : data/processed/train_codes_df.csv src/generate_EDA_figures.py
+	python3.9 src/generate_EDA_figures.py --path="data/processed/train_codes_df.csv" --out_path="results/figures/"
 
 # preprocess data for modeling
-data/process/train_process.csv data/process/test_process.csv : \
-data/process/train_df.csv data/process/test_df.csv src/preprocess.py
-	python3.9 src/preprocess.py --train_path="data/process/train_df.csv" --test_path="data/process/test_df.csv" \
-	--out_file_train="data/process/train_process.csv" --out_file_test="data/process/test_process.csv"
+data/processed/train_processed.csv data/processed/test_processed.csv : \
+data/processed/train_df.csv data/processed/test_df.csv src/preprocess.py
+	python3.9 src/preprocess.py --train_path="data/processed/train_df.csv" --test_path="data/processed/test_df.csv" \
+	--out_file_train="data/processed/train_processed.csv" --out_file_test="data/processed/test_processed.csv"
 
 # create model and supporting figures
 results/best_model.pkl results/train_metrics.jpg results/Top_20_Good_features.csv results/Top_20_Bad_features.csv : \
-data/process/train_process.csv src/train_model.py
-	python3.9 src/train_model.py --train_file="data/process/train_process.csv" --out_file_train="results/best_model.pkl" \
+data/processed/train_processed.csv src/train_model.py
+	python3.9 src/train_model.py --train_file="data/processed/train_processed.csv" --out_file_train="results/best_model.pkl" \
 	--out_file_result="results/"
 
 # run the trained model on the test set and save predictions and test metrics
-results/prediction/prediction.csv results/test_metrics.jpg : data/process/test_process.csv results/best_model.pkl src/predict.py
-	python3.9 src/predict.py --test_file="data/process/test_process.csv" --model_file="results/best_model.pkl" \
+results/prediction/prediction.csv results/prediction/test_metrics.jpg : data/processed/test_processed.csv results/best_model.pkl src/predict.py
+	python3.9 src/predict.py --test_file="data/processed/test_processed.csv" --model_file="results/best_model.pkl" \
 	--out_file_result="results/prediction/"
 
 # write the report
-doc/Report.html : results/prediction/prediction.csv results/test_metrics.jpg \
+doc/Report.html : results/prediction/prediction.csv results/prediction/test_metrics.jpg \
 results/figures/stars_histogram.png results/figures/type_histogram.png \
 results/figures/variety_wordcloud.png results/figures/ramen_map.png
 	Rscript -e "rmarkdown::render('doc/Report.Rmd')"
 
+# remove all generated data, figures, model outputs, and the rendered report
 clean :
 	rm -rf data/raw/ramen_ratings.csv
-	rm -rf data/process/train_df.csv
-	rm -rf data/process/test_df.csv
-	rm -rf data/process/train_codes_df.csv 
+	rm -rf data/processed/train_df.csv
+	rm -rf data/processed/test_df.csv
+	rm -rf data/processed/train_codes_df.csv 
 	rm -rf results/figures/stars_histogram.png
 	rm -rf results/figures/type_histogram.png
 	rm -rf results/figures/variety_wordcloud.png
 	rm -rf results/figures/ramen_map.png
-	rm -rf data/process/train_process.csv
-	rm -rf data/process/test_process.csv
+	rm -rf data/processed/train_processed.csv
+	rm -rf data/processed/test_processed.csv
 	rm -rf results/best_model.pkl
 	rm -rf results/train_metrics.jpg
 	rm -rf results/Top_20_Good_features.csv
 	rm -rf results/Top_20_Bad_features.csv
 	rm -rf results/prediction/prediction.csv
-	rm -rf results/test_metrics.jpg
+	rm -rf results/prediction/test_metrics.jpg
+	rm -rf doc/Report.html