From eb13dd6184121922be109275f0589cca78ad4696 Mon Sep 17 00:00:00 2001
From: valentijn7
Date: Mon, 25 Nov 2024 21:02:50 +0100
Subject: [PATCH] Merging of double events now higher up the chain

---
 .gitignore                    |   1 +
 GoogleFloodHub/src/GRRR.ipynb | 100 ++++++++++++++++++------------------
 2 files changed, 50 insertions(+), 51 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3e6be22..f985f9a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@ impact_data_Mali_tidied.csv
 missing_cercle_info.csv
 missing_cercle_info_after_mod.csv
 impact_events_per_admin_54.csv
+impact_events_per_admin_529.csv
 impact_events_per_admin_673.csv
 
 # ignore experimentation notebooks
diff --git a/GoogleFloodHub/src/GRRR.ipynb b/GoogleFloodHub/src/GRRR.ipynb
index 584079c..c63e9ab 100644
--- a/GoogleFloodHub/src/GRRR.ipynb
+++ b/GoogleFloodHub/src/GRRR.ipynb
@@ -2104,7 +2104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 125,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2219,6 +2219,38 @@
     "    return df\n",
     "\n",
     "\n",
+    "def merge_duplicate_events(d_events: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:\n",
+    "    \"\"\"\n",
+    "    Filter out double impact events by checking whether events have the\n",
+    "    same start date, and then merge them into one event with the start\n",
+    "    date of the first and the end date of the last such event\n",
+    "\n",
+    "    :param d_events: dict with admin unit codes as keys and event dfs as values\n",
+    "    :return: same dict, but with duplicate events merged\n",
+    "    \"\"\"\n",
+    "    d_events_merged = {}\n",
+    "\n",
+    "    for admin_unit, df_events in d_events.items():\n",
+    "        df_events = df_events.reset_index(drop = True)\n",
+    "        grouped = df_events.groupby('flood_start', as_index = False)\n",
+    "        # merge events with the same start date\n",
+    "        merged_events = grouped.agg({\n",
+    "            'flood_start': 'first',\n",
+    "            'flood_end': 'max'\n",
+    "        })\n",
+    "        # recalculate duration and reset identifiers/columns\n",
+    "        merged_events['duration'] = (merged_events['flood_end'] - \\\n",
+    "                                     merged_events['flood_start']).dt.days + 1\n",
+    "        merged_events = merged_events.sort_values('flood_start').reset_index(drop = True)\n",
+    "        merged_events['event'] = merged_events.index\n",
+    "        merged_events.set_index('event', inplace = True)\n",
+    "        merged_events = merged_events[['flood_start', 'flood_end', 'duration']]\n",
+    "        # add to result\n",
+    "        d_events_merged[admin_unit] = merged_events\n",
+    "\n",
+    "    return d_events_merged\n",
+    "\n",
+    "\n",
     "def process_impact_data_to_events(\n",
     "    df: pd.DataFrame, verbose: bool = False\n",
     "    ) -> Dict[str, pd.DataFrame]:\n",
@@ -2356,15 +2388,17 @@
     "    #    'Region', 'Commune', 'Quartier/Villages']]\n",
     "        dict_events[cercle] = df_events\n",
     "\n",
+    "    # merge duplicate events\n",
+    "    dict_events_merged = merge_duplicate_events(dict_events)\n",
     "    # export to csv and return\n",
-    "    export_dict_impact_events_to_csv(dict_events, verbose)\n",
+    "    export_dict_impact_events_to_csv(dict_events_merged, verbose)\n",
     "    \n",
-    "    return dict_events"
+    "    return dict_events_merged"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 122,
+   "execution_count": 126,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2373,7 +2407,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 41,
+   "execution_count": null,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2396,12 +2430,16 @@
     "\n",
     "\n",
     "def subset_events_on_unit_and_date(\n",
-    "    d_events: Dict[str, pd.DataFrame], d_units, earliest_date: str, latest_date: str\n",
+    "    d_events: Dict[str, pd.DataFrame], d_units: Dict[str, xr.Dataset],\n",
+    "    earliest_date: str, latest_date: str\n",
     "    ) -> Dict[str, pd.DataFrame]:\n",
     "    \"\"\"\n",
     "    Subset the events on the available administrative units and dates,\n",
-    "    while also returning a list of admin units with no impact data\n",
-    "\n",
+    "    while also returning a list of admin units with no impact data.\n",
+    "    This also includes checking, per year, whether impact data is\n",
+    "    available for an administrative unit; if not, that unit's events\n",
+    "    for that year are discarded from the flood event data.\n",
+    "    \n",
     "    :param d_events: dictionary with events\n",
     "    :param d_units: dictionary with available administrative units\n",
     "    :param earliest_date: earliest date\n",
@@ -2681,46 +2719,6 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "# filter out double impact events by checking whether events have the \n",
-    "# same start date, and then merge them to an event with the start date\n",
-    "# of the first, and end date of the last event with the same start date\n",
-    "def merge_duplicate_events(d_events: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:\n",
-    "    \"\"\"\n",
-    "    Merges duplicate impact events that have the same\n",
-    "    flood start date for each administrative unit\n",
-    "\n",
-    "    :param dict_events: dict with au codas keys and events dfs as values\n",
-    "    :return: same dict, but merged\n",
-    "    \"\"\"\n",
-    "    d_events_merged = {}\n",
-    "\n",
-    "    for admin_unit, df_events in d_events.items():\n",
-    "        df_events = df_events.reset_index(drop = True)\n",
-    "        grouped = df_events.groupby('flood_start', as_index = False)\n",
-    "        # merge events with the same start date\n",
-    "        merged_events = grouped.agg({\n",
-    "            'flood_start': 'first',\n",
-    "            'flood_end': 'max'\n",
-    "        })\n",
-    "        # recalculate duration and reset identifiers/columns\n",
-    "        merged_events['duration'] = (merged_events['flood_end'] - \\\n",
-    "                                     merged_events['flood_start']).dt.days + 1\n",
-    "        merged_events = merged_events.sort_values('flood_start').reset_index(drop = True)\n",
-    "        merged_events['event'] = merged_events.index\n",
-    "        merged_events.set_index('event', inplace = True)\n",
-    "        merged_events = merged_events[['flood_start', 'flood_end', 'duration']]\n",
-    "        # add to result\n",
-    "        d_events_merged[admin_unit] = merged_events\n",
-    "\n",
-    "    return d_events_merged"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 46,
-   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -2795,7 +2793,7 @@
     }
    ],
    "source": [
-    "dict_impact_events_final = merge_duplicate_events(dict_impact_events_subset)\n",
-    "export_dict_impact_events_to_csv(dict_impact_events_final)\n",
-    "print(dict_impact_events_final)"
+    "# duplicate events are now merged inside process_impact_data_to_events\n",
+    "export_dict_impact_events_to_csv(dict_impact_events_subset)\n",
+    "print(dict_impact_events_subset)"
    ]
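
Reviewer note (not part of the patch): below is a minimal, standalone sketch of the deduplication logic this commit moves up the chain into process_impact_data_to_events. It mirrors merge_duplicate_events from the diff above; the 'Bamako' key and the dates are invented toy data, and only pandas is assumed.

```python
from typing import Dict

import pandas as pd


def merge_duplicate_events(d_events: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
    """Per admin unit, collapse events sharing a flood_start into one event."""
    d_events_merged = {}
    for admin_unit, df_events in d_events.items():
        grouped = df_events.reset_index(drop=True).groupby('flood_start', as_index=False)
        # keep the shared start date and the latest end date of each group
        merged = grouped.agg({'flood_start': 'first', 'flood_end': 'max'})
        # recompute the inclusive duration in days
        merged['duration'] = (merged['flood_end'] - merged['flood_start']).dt.days + 1
        merged = merged.sort_values('flood_start').reset_index(drop=True)
        merged.index.name = 'event'
        d_events_merged[admin_unit] = merged[['flood_start', 'flood_end', 'duration']]
    return d_events_merged


# toy data: the two events starting 2024-08-01 should collapse into one
toy = {
    'Bamako': pd.DataFrame({
        'flood_start': pd.to_datetime(['2024-08-01', '2024-08-01', '2024-09-10']),
        'flood_end':   pd.to_datetime(['2024-08-03', '2024-08-05', '2024-09-12']),
    })
}
print(merge_duplicate_events(toy)['Bamako'])
# expected: 2024-08-01..2024-08-05 (duration 5) and 2024-09-10..2024-09-12 (duration 3)
```

Note the design choice this implies: grouping on flood_start with 'max' over flood_end only merges events that start on exactly the same day; overlapping events with different start dates are left untouched.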