Commit

valentijn7 committed Nov 22, 2024
2 parents eb3df09 + e2f9e5c commit 9418704
Showing 7 changed files with 81 additions and 59 deletions.
27 changes: 18 additions & 9 deletions GloFAS/GloFAS_analysis/performance_calculator.py
@@ -9,7 +9,7 @@
from GloFAS.GloFAS_prep.vectorCheck import checkVectorFormat
import GloFAS.GloFAS_prep.configuration as cfg
class PredictedToImpactPerformanceAnalyzer:
def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel, adminPath, startYear, endYear, years, PredictedEvents_gdf):
def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel, adminPath, startYear, endYear, years, PredictedEvents_gdf, comparisonType):
"""
Initialize the FloodPerformanceAnalyzer class with the required data.
@@ -19,7 +19,6 @@ def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel,
should make this a csv possibility
triggerProb (float): Probability threshold to classify flood triggers.
"""

self.triggerProb = triggerProb
self.adminLevel = adminLevel
self.gdf_shape = gpd.read_file(adminPath)
@@ -31,7 +30,8 @@ def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel,
self.years = years
self.impactData = impactData
self.PredictedEvents_gdf = PredictedEvents_gdf
if isinstance (self.impactData, (str, Path)):
self.comparisonType = comparisonType
if isinstance (self.impactData, (str)):
self.impact_gdf = self.openObservedImpact_gdf()
elif isinstance (self.impactData, gpd.GeoDataFrame):
self.impact_gdf = self.impactData
@@ -40,15 +40,18 @@ def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel,

def openObservedImpact_gdf(self):
# Load the data
df = pd.read_csv(self.impactDataPath)
if self.impactData.endswith('.csv'):
df = pd.read_csv(self.impactData)
else:
df = gpd.read_file(self.impactData)

# Convert 'End Date' and 'Start Date' to datetime
df['End Date'] = pd.to_datetime(df['End Date'], format='%d/%m/%Y', errors='coerce')
df['Start Date'] = pd.to_datetime(df['Start Date'], format='%d/%m/%Y', errors= 'coerce')

# Filter rows with 'End Date' between self.startYear (inclusive) and self.endYear (exclusive)
df_filtered = df[(df['End Date'].dt.year >= self.startYear) & (df['End Date'].dt.year < self.endYear)]

# Remove non-string entries from ADM columns
df_filtered = df_filtered[df_filtered[f'ADM{self.adminLevel}'].apply(lambda x: isinstance(x, str))]
self.gdf_shape = self.gdf_shape[self.gdf_shape[f'ADM{self.adminLevel}_FR'].apply(lambda x: isinstance(x, str))]
@@ -57,6 +60,7 @@ def openObservedImpact_gdf(self):
self.gdf_shape.rename(columns={f'ADM{cfg.adminLevel}_FR':f'ADM{cfg.adminLevel}'}, inplace=True)
self.gdf_shape[f'ADM{cfg.adminLevel}'] = self.gdf_shape[f'ADM{cfg.adminLevel}'].apply(lambda x: unidecode.unidecode(x).upper())
# Apply normalization to both DataFrames (converting to uppercase and removing special characters)

df_filtered[f'ADM{self.adminLevel}'] = df_filtered[f'ADM{self.adminLevel}'].apply(lambda x: unidecode.unidecode(x).upper())

# Merge the CSV data with the shapefile data
@@ -136,6 +140,7 @@ def _check_impact(self, PredictedEvents_gdf, commune, startdate):
(PredictedEvents_gdf['EndValidTime'] >= startdate) &
(PredictedEvents_gdf['Event']==1)
]
print (match)
return 1 if not match.empty else 0


@@ -187,24 +192,28 @@ def calculateCommunePerformance(self):
Calculate the performance scores for each commune and merge them back into the GeoDataFrame.
"""
# Group by 'Commune' and calculate performance scores for each group
print (self.impact_gdf.columns)
print (self.impact_gdf.head)

scores_by_commune = self.impact_gdf.groupby(f'ADM{self.adminLevel}').apply(
lambda x: self.calc_performance_scores(x['Impact'], x['Event'])
lambda x: self.calc_performance_scores(x[f'{self.comparisonType}'], x['Event'])
)
scores_byCommune_gdf = self.gdf_shape.merge(scores_by_commune, on=f'ADM{cfg.adminLevel}')
scores_byCommune_gdf.to_file (f"{self.DataDir}/glofas_to_hydrodata/scores_byCommuneRP{self.RPyr:.1f}_yr_leadtime{self.leadtime:.0f}.shp")
scores_byCommune_gdf.to_file (f"{self.DataDir}/{self.comparisonType}/scores_byCommuneRP{self.RPyr:.1f}_yr_leadtime{self.leadtime:.0f}.shp")
return scores_byCommune_gdf

if __name__=='__main__':
for RPyr in cfg.RPsyr:
hydro_impact_gdf = loop_over_stations (cfg.DNHstations , cfg.DataDir, RPyr, cfg.admPath)
hydro_impact_gdf = f'{cfg.DataDir}/Impact_from_hydro_RP_{RPyr}.gpkg'
#hydro_impact_gdf = loop_over_stations (cfg.DNHstations , cfg.DataDir, RPyr, cfg.admPath, cfg.adminLevel)
for leadtime in cfg.leadtimes:
floodProbability_path = cfg.DataDir/ f"floodedRP{RPyr}yr_leadtime{leadtime}_ADM{cfg.adminLevel}.gpkg"
floodProbability_gdf = checkVectorFormat (floodProbability_path)
#calculate the flood events
definer = FloodDefiner (cfg.adminLevel)
PredictedEvents_gdf = definer.EventMaker (floodProbability_gdf, cfg.actionLifetime, cfg.triggerProb)
#print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Données partagées - DNH Mali - 2019/Débit du Niger à Ansongo.csv"))
analyzer = PredictedToImpactPerformanceAnalyzer(cfg.DataDir, RPyr, leadtime, hydro_impact_gdf, cfg.triggerProb, cfg.adminLevel, cfg.admPath, cfg.startYear, cfg.endYear, cfg.years, PredictedEvents_gdf)
analyzer = PredictedToImpactPerformanceAnalyzer(cfg.DataDir, RPyr, leadtime, hydro_impact_gdf, cfg.triggerProb, cfg.adminLevel, cfg.admPath, cfg.startYear, cfg.endYear, cfg.years, PredictedEvents_gdf, 'Observation')
analyzer.matchImpact_and_Trigger()
analyzer.calculateCommunePerformance()

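The date-overlap test in _check_impact above returns 1 when at least one predicted event is still valid at the observed start date for the commune. A minimal sketch of that logic in plain pandas; only the EndValidTime and Event conditions appear in the hunk above, so the ADM2 and StartValidTime column names and the data are illustrative assumptions:

import pandas as pd

PredictedEvents_gdf = pd.DataFrame({
    'ADM2': ['BAMAKO', 'MOPTI'],                                    # hypothetical commune names
    'StartValidTime': pd.to_datetime(['2010-07-01', '2010-08-01']),
    'EndValidTime': pd.to_datetime(['2010-07-20', '2010-08-15']),
    'Event': [1, 1],
})
commune, startdate = 'BAMAKO', pd.Timestamp('2010-07-15')           # observed impact location and start date

match = PredictedEvents_gdf[
    (PredictedEvents_gdf['ADM2'] == commune) &
    (PredictedEvents_gdf['StartValidTime'] <= startdate) &
    (PredictedEvents_gdf['EndValidTime'] >= startdate) &
    (PredictedEvents_gdf['Event'] == 1)
]
print(1 if not match.empty else 0)                                   # 1: the observed impact counts as a hit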
7 changes: 0 additions & 7 deletions GloFAS/GloFAS_prep/accent_remover.py

This file was deleted.

14 changes: 14 additions & 0 deletions GloFAS/GloFAS_prep/text_formatter.py
@@ -0,0 +1,14 @@
import unicodedata
def remove_accents(input_str):
"""Remove accents from a given string."""
if isinstance(input_str, str): # Only process strings
nfkd_form = unicodedata.normalize('NFKD', input_str)
return "".join([c for c in nfkd_form if not unicodedata.combining(c)])
return input_str # Return as is if not a string

def capitalize(input_str):
"""Remove accents and capitalize the input string."""
if isinstance(input_str, str): # Only process strings
input_str = remove_accents(input_str) # Remove accents first
return input_str.upper() # Capitalize (convert to uppercase)
return input_str
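A quick usage sketch of the new helpers (the place names are hypothetical examples):

from GloFAS.GloFAS_prep.text_formatter import remove_accents, capitalize

print(remove_accents('Ségou'))    # 'Segou'  (accents stripped via NFKD decomposition)
print(capitalize('Tombouctou'))   # 'TOMBOUCTOU'  (accents removed, then uppercased)
print(capitalize(42))             # 42  (non-strings are returned unchanged)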
27 changes: 13 additions & 14 deletions comparison/HydroImpact.py
@@ -1,10 +1,10 @@

import GloFAS.GloFAS_prep.configuration as cfg
import pandas as pd
from GloFAS.GloFAS_prep.accent_remover import remove_accents
from GloFAS.GloFAS_prep.text_formatter import remove_accents, capitalize
import scipy.stats as stats
from comparison.pointMatching import attributePoints_to_Polygon

import geopandas as gpd
def parse_date_with_fallback(date_str, year):
try:
# Try to parse the date with the given year
@@ -64,7 +64,6 @@ def QRP_fit (hydro_df, RP):

# 2. Fit a Gumbel distribution to the annual maximum discharge values
loc, scale = stats.gumbel_r.fit(annual_max_discharge)
print (RP)
# 4. Calculate the discharge value corresponding to the return period
discharge_value = stats.gumbel_r.ppf(1 - 1/RP, loc, scale)
return discharge_value
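To illustrate the Gumbel step in QRP_fit, a minimal sketch with made-up annual maxima; only the gumbel_r.fit / gumbel_r.ppf calls mirror the code above:

import numpy as np
import scipy.stats as stats

annual_max_discharge = np.array([1200., 1350., 1100., 1600., 1450., 1300., 1700., 1250.])  # hypothetical annual maxima (m3/s)
loc, scale = stats.gumbel_r.fit(annual_max_discharge)       # fit a Gumbel (right-skewed) distribution
RP = 5.0                                                     # return period in years
discharge_value = stats.gumbel_r.ppf(1 - 1/RP, loc, scale)   # discharge exceeded on average once every RP years
print(discharge_value)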
@@ -142,9 +141,9 @@ def createEvent(trigger_df):

# Create a temporary dataframe for the current event
temp_event_df = pd.DataFrame({
'Event': [Event],
'StartDate': [StartDate],
'EndDate': [final_endtime],
'Observation': [Event],
'Start Date': [StartDate],
'End Date': [final_endtime],
})

# Append the event to the list of events
@@ -162,10 +161,10 @@ def createEvent(trigger_df):
else:
# Return an empty GeoDataFrame if no events were found
# Initialize an empty dataframe
events_df = pd.DataFrame(columns=['Event', 'Start Date', 'End Date'])
events_df = pd.DataFrame(columns=['Observation', 'Start Date', 'End Date'])
return events_df

def loop_over_stations(station_csv, DataDir, RP, admPath):
def loop_over_stations(station_csv, DataDir, RP, admPath, adminLevel):
RP = float(RP)
station_df = pd.read_csv (station_csv, header=0)
#print (station_df.columns)
@@ -189,11 +188,10 @@ def loop_over_stations(station_csv, DataDir, RP, admPath):

#generate the gdf to merge with where the points are attributed to the respective administrative units


all_events_df = pd.concat (all_events, ignore_index=True)
print (all_events_df.columns)

gdf_pointPolygon = attributePoints_to_Polygon (admPath, station_csv, 'StationName')
print (gdf_pointPolygon.columns)
gdf_pointPolygon.rename(columns={f'ADM{adminLevel}_FR':f'ADM{adminLevel}'}, inplace=True)
gdf_melt = gdf_pointPolygon.melt(
id_vars=gdf_pointPolygon.columns.difference(['StationName', 'StationName_0', 'StationName_1', 'StationName_2']),
value_vars=['StationName', 'StationName_0', 'StationName_1', 'StationName_2'],
@@ -204,12 +202,13 @@ def loop_over_stations(station_csv, DataDir, RP, admPath):

# Proceed with the merge
hydro_events_df = pd.merge(gdf_melt, all_events_df, left_on='StationName_Merged', right_on='StationName', how='inner')

hydro_events_gdf = gpd.GeoDataFrame(hydro_events_df, geometry='geometry')
hydro_events_df [f'ADM{adminLevel}'] = hydro_events_df [f'ADM{adminLevel}'].apply(capitalize)
hydro_events_gdf = gpd.GeoDataFrame(hydro_events_df, geometry='geometry')
hydro_events_gdf.to_file(f"{DataDir}/Impact_from_hydro_RP_{RP}.gpkg")
return hydro_events_gdf


if __name__=="__main__":
#print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Données partagées - DNH Mali - 2019/Débit du Niger à Ansongo.csv"))
event_gdf = loop_over_stations (cfg.DNHstations, cfg.DataDir, 5, cfg.admPath)
event_gdf = loop_over_stations (cfg.DNHstations, cfg.DataDir, 5, cfg.admPath, cfg.adminLevel)
print (event_gdf)
65 changes: 36 additions & 29 deletions comparison/pointMatching.py
@@ -6,7 +6,7 @@
from scipy.spatial import cKDTree
from GloFAS.GloFAS_prep.vectorCheck import checkVectorFormat
import GloFAS.GloFAS_prep.configuration as cfg
from GloFAS.GloFAS_prep.accent_remover import remove_accents
from GloFAS.GloFAS_prep.text_formatter import remove_accents

def findclosestpoint(point_x, point_y, target_gdf):
'''
@@ -32,8 +32,6 @@ def findclosestpoint(point_x, point_y, target_gdf):
return target_gdf.iloc[idx[0]] # Return row of the closest point




def matchPoints(
vector_original,
ID1,
@@ -120,16 +118,17 @@ def matchPoints(
return match_df

def attributePoints_to_Polygon(
vectorPolygon,
vectorPoint,
ID2,
crs='EPSG:4326',
StationDataDir=Path.cwd(),
filename='attributePoints_to_Polygon.csv'
):
vectorPolygon,
vectorPoint,
ID2,
crs='EPSG:4326',
border_tolerance=5000, # Buffer distance in meters
StationDataDir=Path.cwd(),
filename='attributePoints_to_Polygon.csv'
):
'''
Assigns points from vector_original to polygons in vector2, adding a new column in vector2 that contains the IDs of all points
from vector_original that fall within each polygon.
from vector_original that fall within each polygon or within a specified buffer distance.
Parameters:
vectorPolygon : str or GeoDataFrame
@@ -142,12 +141,14 @@ def attributePoints_to_Polygon(
Column name in vector2 identifying the polygons.
crs : str, optional
Coordinate reference system for all data. Defaults to 'EPSG:4326'.
border_tolerance : float, optional
Distance in meters to expand the polygons for including nearby points. Defaults to 5000 (5 km).
StationDataDir : str or Path, optional
Directory where the output CSV file will be saved. Default is the current working directory.
Writes:
CSV
A CSV file containing the polygon ID and the IDs of points within each polygon.
A CSV file containing the polygon ID and the IDs of points within or near each polygon.
Returns:
GeoDataFrame
@@ -162,25 +163,29 @@ def attributePoints_to_Polygon(
if points_gdf.crs != polygons_gdf.crs:
points_gdf = points_gdf.to_crs(polygons_gdf.crs)

# Apply a buffer to the polygons
expanded_polygons_gdf = polygons_gdf.copy()
expanded_polygons_gdf['geometry'] = expanded_polygons_gdf.geometry.buffer(border_tolerance)

# Initialize a new column in the polygons GeoDataFrame to store point IDs
polygons_gdf[f'{ID2}'] = None

# Loop through each polygon to find points within it
for idx, polygon_row in polygons_gdf.iterrows():
# Get the geometry of the current polygon
polygon_geom = polygon_row.geometry
for idx, polygon_row in expanded_polygons_gdf.iterrows():
# Get the geometry of the current buffered polygon
expanded_polygon_geom = polygon_row.geometry

# Find points that fall within this polygon
points_within = points_gdf[points_gdf.geometry.within(polygon_geom)]
# Find points that fall within this expanded polygon
points_within = points_gdf[points_gdf.geometry.within(expanded_polygon_geom)]

# Collect the IDs of these points
point_ids = points_within[ID2].tolist()
# Update the GeoDataFrame with the list of point IDs

nrstations_inpolygon = len (point_ids)
if nrstations_inpolygon>0:
# Update the original polygons GeoDataFrame with the list of point IDs
nrstations_inpolygon = len(point_ids)
if nrstations_inpolygon > 0:
polygons_gdf.at[idx, f'{ID2}'] = remove_accents(point_ids[0])
if nrstations_inpolygon>1:
if nrstations_inpolygon > 1:
for stationnr in range(nrstations_inpolygon):
polygons_gdf.at[idx, f'{ID2}_{stationnr}'] = remove_accents(point_ids[stationnr-1])

@@ -189,14 +194,16 @@ def attributePoints_to_Polygon(
polygons_gdf.drop(columns='geometry').to_csv(output_file, index=False)
return polygons_gdf


if __name__ == '__main__':
result = attributePoints_to_Polygon(
cfg.admPath,
cfg.DNHstations,
'StationName',
crs=cfg.crs,
StationDataDir=cfg.stationsDir,
filename='DNHstations_in_ADM2.csv'
)
result = attributePoints_to_Polygon(
cfg.admPath,
cfg.DNHstations,
'StationName',
crs=cfg.crs,
border_tolerance=5000, # 5 km tolerance around polygon borders
StationDataDir=cfg.stationsDir,
filename='DNHstations_in_ADM2.csv'
)

# matchPoints(cfg.GloFASstations, 'StationName', cfg.googlestations, 'gaugeId', crs=cfg.crs, StationDataDir=cfg.stationsDir)
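To make border_tolerance concrete: buffering a polygon by 5000 lets a station a few kilometres outside the boundary still be attributed to it. A small sketch with toy geometries in a metric CRS (EPSG:32630 is only an illustrative choice; buffering in a geographic CRS such as EPSG:4326 would be in degrees rather than metres):

import geopandas as gpd
from shapely.geometry import Point, Polygon

poly = gpd.GeoDataFrame({'ADM2': ['A']},
                        geometry=[Polygon([(0, 0), (10000, 0), (10000, 10000), (0, 10000)])],
                        crs='EPSG:32630')
station = gpd.GeoDataFrame({'StationName': ['S1']},
                           geometry=[Point(13000, 5000)],  # 3 km east of the polygon edge
                           crs='EPSG:32630')

inside_strict = station.geometry.within(poly.geometry.iloc[0]).iloc[0]                  # False: outside the border
inside_buffered = station.geometry.within(poly.geometry.buffer(5000).iloc[0]).iloc[0]   # True: within the 5 km tolerance
print(inside_strict, inside_buffered)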
Empty file.
File renamed without changes.
