Skip to content

Commit

Permalink
Merge pull request #16 from leaf-ai/7DMA
Browse files Browse the repository at this point in the history
#15 Compute a 7 days moving average diff metric called Diff7DMA
  • Loading branch information
ofrancon authored Sep 18, 2020
2 parents ccd515b + 8085e25 commit 0617d98
Showing 1 changed file with 102 additions and 58 deletions.
160 changes: 102 additions & 58 deletions robojudge.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -172,26 +172,6 @@
"# Get actual cases between these dates"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def get_actual_cases(df, start_date, end_date):\n",
" # 1 day earlier to compute the daily diff\n",
" start_date_for_diff = start_date - pd.offsets.Day(1)\n",
" actual_df = df[[\"CountryName\", \"RegionName\", \"Date\", \"ConfirmedCases\"]]\n",
" # Filter out the data set to include all the data needed to compute the diff\n",
" actual_df = actual_df[(actual_df.Date >= start_date_for_diff) & (actual_df.Date <= end_date)]\n",
" actual_df.sort_values(by=[\"CountryName\",\"RegionName\",\"Date\"], inplace=True)\n",
" # Compute the diff\n",
" actual_df[\"ActualDailyNewCases\"] = actual_df.groupby([\"CountryName\", \"RegionName\"])[\"ConfirmedCases\"].diff().fillna(0)\n",
" # Return only the data between start_date and end_date\n",
" actual_df = actual_df[(actual_df.Date >= start_date) & (actual_df.Date <= end_date)]\n",
" return actual_df"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -208,28 +188,23 @@
"metadata": {},
"outputs": [],
"source": [
"# def get_actual_cases_new(df, start_date, end_date):\n",
"# actual_df = df[[\"CountryName\", \"RegionName\", \"Date\", \"ConfirmedCases\"]].reset_index(drop=True)\n",
"# # Filter out the data set but make sure to include data need for 7MA and Diff computations\n",
"# start_date_for_diff = start_date - pd.offsets.Day(NUM_PREV_DAYS_TO_INCLUDE)\n",
"# actual_df = actual_df[(actual_df.Date >= start_date_for_diff) & (actual_df.Date <= end_date)]\n",
"# # actual_df.sort_values(by=[\"CountryName\",\"RegionName\",\"Date\"], inplace=True)\n",
"# actual_df = actual_df.sort_values(by=[\"CountryName\",\"RegionName\",\"Date\"]).reset_index(drop=True)\n",
" \n",
"# # Compute the diff\n",
"# actual_df[\"ActualDailyNewCases\"] = actual_df.groupby(\n",
"# [\"CountryName\", \"RegionName\"])[\"ConfirmedCases\"].diff().fillna(0).reset_index(0, drop=True)\n",
"# # # Make sure daily cases are not negative, which happens in case of corrections\n",
"# # actual_df['ActualDailyNewCases'] = actual_df['ActualDailyNewCases'].clip(lower=0)\n",
" \n",
"# # 7 day moving average\n",
"# actual_df[\"7DayActualDailyNewCases\"] = actual_df.groupby(\n",
"# [\"CountryName\", \"RegionName\"])[\"ActualDailyNewCases\"].rolling(WINDOW_SIZE, center=False).mean().reset_index(0, drop=True)\n",
"# # ma_df = actual_df[\"7DayActualDailyNewCases\"] = actual_df.groupby([\"CountryName\", \"RegionName\"])[\"ActualDailyNewCases\"].rolling(7, center=False).mean().reset_index(0, drop=True)\n",
"\n",
"def get_actual_cases(df, start_date, end_date):\n",
" # Start WINDOW_SIZE days earlier so there is enough history for the daily diff and the moving average\n",
" start_date_for_diff = start_date - pd.offsets.Day(WINDOW_SIZE)\n",
" actual_df = df[[\"CountryName\", \"RegionName\", \"Date\", \"ConfirmedCases\"]]\n",
" # Keep only the rows needed to compute the diff and the moving average\n",
" actual_df = actual_df[(actual_df.Date >= start_date_for_diff) & (actual_df.Date <= end_date)]\n",
" actual_df['GeoID'] = actual_df['CountryName'] + '__' + actual_df['RegionName'].astype(str)\n",
" actual_df.sort_values(by=[\"GeoID\",\"Date\"], inplace=True)\n",
" # Compute the diff\n",
" actual_df[\"ActualDailyNewCases\"] = actual_df.groupby(\"GeoID\")[\"ConfirmedCases\"].diff().fillna(0)\n",
" # Compute the 7 day moving average\n",
" actual_df[\"ActualDailyNewCases7DMA\"] = actual_df.groupby(\n",
" \"GeoID\")['ActualDailyNewCases'].rolling(\n",
" WINDOW_SIZE, center=False).mean().reset_index(0, drop=True)\n",
"# # Return only the data between start_date and end_date\n",
"# # actual_df = actual_df[(actual_df.Date >= start_date) & (actual_df.Date <= end_date)]\n",
"# return actual_df"
"# actual_df = actual_df[(actual_df.Date >= start_date) & (actual_df.Date <= end_date)]\n",
" return actual_df"
]
},
{
Expand All @@ -251,21 +226,24 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"cell_type": "markdown",
"metadata": {},
"outputs": [],
"source": [
"# ma_df = actual_df.groupby([\"CountryName\", \"RegionName\"])[\"ActualDailyNewCases\"].rolling(7, center=False).mean().reset_index()\n",
"# ma_df.head(12)\n"
"# Get historical data for the 7-day moving average calculation\n",
"To compute the 7-day moving average, we need the actual daily new cases for the 7 days preceding the start date."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
"ma_df = actual_df[actual_df[\"Date\"] < start_date]\n",
"ma_df = ma_df[[\"CountryName\", \"RegionName\", \"Date\", \"ActualDailyNewCases\"]]\n",
"ma_df = ma_df.rename(columns={\"ActualDailyNewCases\": \"PredictedDailyNewCases\"})\n",
"ma_df.head()"
]
},
{
"cell_type": "markdown",
Expand All @@ -280,13 +258,29 @@
"metadata": {},
"outputs": [],
"source": [
"def get_predictions_from_file(predictor_name, predictions_file):\n",
"def get_predictions_from_file(predictor_name, predictions_file, ma_df):\n",
" preds_df = pd.read_csv(predictions_file,\n",
" parse_dates=['Date'],\n",
" encoding=\"ISO-8859-1\",\n",
" error_bad_lines=False)\n",
" preds_df[\"RegionName\"] = preds_df[\"RegionName\"].fillna(\"\")\n",
" preds_df[\"PredictorName\"] = predictor_name\n",
" preds_df[\"Prediction\"] = True\n",
" \n",
" # Prepend the actual daily new cases from before the start date (history for the moving average)\n",
" ma_df[\"PredictorName\"] = predictor_name\n",
" ma_df[\"Prediction\"] = False\n",
" preds_df = ma_df.append(preds_df, ignore_index=True)\n",
"\n",
" # Compute the 7 days moving average for PredictedDailyNewCases\n",
" preds_df['GeoID'] = preds_df['CountryName'] + '__' + preds_df['RegionName'].astype(str)\n",
" # Sort\n",
"# preds_df.sort_values(by=[\"CountryName\",\"RegionName\", \"Date\"], inplace=True)\n",
" preds_df.sort_values(by=[\"GeoID\",\"Date\"], inplace=True)\n",
" preds_df[\"PredictedDailyNewCases7DMA\"] = preds_df.groupby(\n",
" \"GeoID\")['PredictedDailyNewCases'].rolling(\n",
" WINDOW_SIZE, center=False).mean().reset_index(0, drop=True)\n",
"\n",
" # Put PredictorName first\n",
" preds_df = preds_df[[\"PredictorName\"] + [col for col in preds_df.columns if col != \"PredictorName\"]]\n",
" return preds_df"
Expand All @@ -311,7 +305,17 @@
"outputs": [],
"source": [
"test_predictor_name = \"Predictor #27\"\n",
"get_predictions_from_file(test_predictor_name, predictions[test_predictor_name]).head()"
"temp_df = get_predictions_from_file(test_predictor_name, predictions[test_predictor_name], ma_df.copy())\n",
"temp_df.head(12)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# (121 + 106 + 105 + 103 + 0 + 71 + 73.132138) / 7"
]
},
{
Expand All @@ -331,8 +335,8 @@
"source": [
"ranking_df = pd.DataFrame()\n",
"for predictor_name, predictions_file in predictions.items():\n",
" preds_df = get_predictions_from_file(predictor_name, predictions_file)\n",
" merged_df = actual_df.merge(preds_df, on=['CountryName', 'RegionName', 'Date'], how='left')\n",
" preds_df = get_predictions_from_file(predictor_name, predictions_file, ma_df)\n",
" merged_df = actual_df.merge(preds_df, on=['CountryName', 'RegionName', 'Date', 'GeoID'], how='left')\n",
" ranking_df = ranking_df.append(merged_df)"
]
},
Expand All @@ -342,7 +346,47 @@
"metadata": {},
"outputs": [],
"source": [
"ranking_df['Diff'] = (ranking_df[\"ActualDailyNewCases\"] - ranking_df[\"PredictedDailyNewCases\"]).abs()"
"ranking_df['DiffDaily'] = (ranking_df[\"ActualDailyNewCases\"] - ranking_df[\"PredictedDailyNewCases\"]).abs()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ranking_df['Diff7DMA'] = (ranking_df[\"ActualDailyNewCases7DMA\"] - ranking_df[\"PredictedDailyNewCases7DMA\"]).abs()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the rows dated on or after start_date (the prediction period)\n",
"ranking_df = ranking_df[ranking_df[\"Date\"] >= start_date]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sort by 7 days moving average diff\n",
"ranking_df.sort_values(by=[\"CountryName\",\"RegionName\",\"Date\",\"Diff7DMA\"], inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# # Set true dailycases before start_date\n",
"# ranking_df[\"Prediction\"] = False\n",
"# ranking_df[\"Prediction\"][ranking_df[\"Date\"] >= start_date] = True"
]
},
{
Expand All @@ -351,7 +395,7 @@
"metadata": {},
"outputs": [],
"source": [
"ranking_df.sort_values(by=[\"CountryName\",\"RegionName\",\"Date\",\"Diff\"], inplace=True)"
"# ranking_df.head((7+4)*3)"
]
},
{
Expand All @@ -360,7 +404,7 @@
"metadata": {},
"outputs": [],
"source": [
"ranking_df.head(12)"
"ranking_df.head(3*4)"
]
},
{
Expand Down Expand Up @@ -403,7 +447,7 @@
"metadata": {},
"outputs": [],
"source": [
"ranking_df.groupby('PredictorName').Diff.sum().sort_values()"
"ranking_df.groupby('PredictorName').Diff7DMA.sum().sort_values()"
]
},
{
Expand All @@ -419,7 +463,7 @@
"metadata": {},
"outputs": [],
"source": [
"countries_ranking_df = ranking_df.groupby([\"CountryName\", \"RegionName\", \"PredictorName\"])[[\"CountryName\", \"RegionName\", \"PredictorName\", \"Diff\"]].sum().sort_values(by=[\"CountryName\", \"RegionName\", \"Diff\"])\n"
"countries_ranking_df = ranking_df.groupby([\"CountryName\", \"RegionName\", \"PredictorName\"])[[\"CountryName\", \"RegionName\", \"PredictorName\", \"Diff7DMA\"]].sum().sort_values(by=[\"CountryName\", \"RegionName\", \"Diff7DMA\"])\n"
]
},
{
Expand Down Expand Up @@ -478,7 +522,7 @@
"metadata": {},
"outputs": [],
"source": [
"ranking_df[(ranking_df.CountryName == \"United States\") & (ranking_df.RegionName == \"\")].groupby([\"PredictorName\"]).Diff.sum().sort_values()"
"ranking_df[(ranking_df.CountryName == \"United States\") & (ranking_df.RegionName == \"\")].groupby([\"PredictorName\"]).Diff7DMA.sum().sort_values()"
]
},
{
Expand Down Expand Up @@ -519,7 +563,7 @@
"metadata": {},
"outputs": [],
"source": [
"cr_df[(cr_df.CountryName.isin(NORTH_AMERICA)) & (cr_df.RegionName == \"\")].groupby('PredictorName').Diff.sum().sort_values().reset_index()"
"cr_df[(cr_df.CountryName.isin(NORTH_AMERICA)) & (cr_df.RegionName == \"\")].groupby('PredictorName').Diff7DMA.sum().sort_values().reset_index()"
]
},
{
Expand Down

0 comments on commit 0617d98

Please sign in to comment.