something goes wrong in performance calculator for observation comparison: no string values in adm2
ERKuipers committed Nov 22, 2024
1 parent 2893fcd commit e2f9e5c
Showing 5 changed files with 19 additions and 11 deletions.
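For context, a minimal self-contained sketch (with made-up data) of the failure this commit guards against: a non-string entry such as NaN in the ADM2 column makes the unidecode normalization step raise, so the fix filters the column down to string entries before normalizing.

import pandas as pd
import unidecode

df = pd.DataFrame({'ADM2': ['Bamako', float('nan'), 'Ségou']})

# unidecode.unidecode(float('nan')) raises, so this normalization would fail:
# df['ADM2'].apply(lambda x: unidecode.unidecode(x).upper())

# The guard applied in openObservedImpact_gdf: keep string entries only.
df = df[df['ADM2'].apply(lambda x: isinstance(x, str))]
df['ADM2'] = df['ADM2'].apply(lambda x: unidecode.unidecode(x).upper())
print(df['ADM2'].tolist())  # ['BAMAKO', 'SEGOU']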
22 changes: 15 additions & 7 deletions GloFAS/GloFAS_analysis/performance_calculator.py
@@ -9,7 +9,7 @@
from GloFAS.GloFAS_prep.vectorCheck import checkVectorFormat
import GloFAS.GloFAS_prep.configuration as cfg
class PredictedToImpactPerformanceAnalyzer:
- def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel, adminPath, startYear, endYear, years, PredictedEvents_gdf):
+ def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel, adminPath, startYear, endYear, years, PredictedEvents_gdf, comparisonType):
"""
Initialize the FloodPerformanceAnalyzer class with the required data.
@@ -30,6 +30,7 @@ def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel,
self.years = years
self.impactData = impactData
self.PredictedEvents_gdf = PredictedEvents_gdf
+ self.comparisonType = comparisonType
if isinstance(self.impactData, str):
self.impact_gdf = self.openObservedImpact_gdf()
elif isinstance(self.impactData, gpd.GeoDataFrame):
@@ -39,15 +40,18 @@ def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel,

def openObservedImpact_gdf(self):
# Load the data
- df = pd.read_csv(self.impactDataPath)
+ if self.impactData.endswith('.csv'):
+ df = pd.read_csv(self.impactData)
+ else:
+ df = gpd.read_file(self.impactData)

# Convert 'End Date' and 'Start Date' to datetime
df['End Date'] = pd.to_datetime(df['End Date'], format='%d/%m/%Y', errors='coerce')
df['Start Date'] = pd.to_datetime(df['Start Date'], format='%d/%m/%Y', errors='coerce')

# Filter rows to the analysis window [startYear, endYear)
df_filtered = df[(df['End Date'].dt.year >= self.startYear) & (df['End Date'].dt.year < self.endYear)]

# Remove non-string entries from ADM columns
df_filtered = df_filtered[df_filtered[f'ADM{self.adminLevel}'].apply(lambda x: isinstance(x, str))]
self.gdf_shape = self.gdf_shape[self.gdf_shape[f'ADM{self.adminLevel}_FR'].apply(lambda x: isinstance(x, str))]
@@ -56,6 +60,7 @@ def openObservedImpact_gdf(self):
self.gdf_shape.rename(columns={f'ADM{cfg.adminLevel}_FR':f'ADM{cfg.adminLevel}'}, inplace=True)
self.gdf_shape[f'ADM{cfg.adminLevel}'] = self.gdf_shape[f'ADM{cfg.adminLevel}'].apply(lambda x: unidecode.unidecode(x).upper())
# Apply normalization to both DataFrames (converting to uppercase and removing special characters)

df_filtered[f'ADM{self.adminLevel}'] = df_filtered[f'ADM{self.adminLevel}'].apply(lambda x: unidecode.unidecode(x).upper())

# Merge the CSV data with the shapefile data
@@ -135,6 +140,7 @@ def _check_impact(self, PredictedEvents_gdf, commune, startdate):
(PredictedEvents_gdf['EndValidTime'] >= startdate) &
(PredictedEvents_gdf['Event']==1)
]
+ print(match)
return 1 if not match.empty else 0
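A self-contained sketch of the overlap test _check_impact performs; the column names follow the diff, while the commune and StartValidTime conditions are assumed from context because the top of the hunk is collapsed above.

import pandas as pd

events = pd.DataFrame({
    'ADM2': ['BAMAKO', 'BAMAKO'],
    'StartValidTime': pd.to_datetime(['2010-07-01', '2010-09-01']),
    'EndValidTime': pd.to_datetime(['2010-07-15', '2010-09-10']),
    'Event': [1, 0],
})
startdate = pd.Timestamp('2010-07-10')

# Does any active predicted event window cover the reported impact date?
match = events[
    (events['ADM2'] == 'BAMAKO') &
    (events['StartValidTime'] <= startdate) &
    (events['EndValidTime'] >= startdate) &
    (events['Event'] == 1)
]
print(1 if not match.empty else 0)  # 1: the July event covers the date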


@@ -188,24 +194,26 @@ def calculateCommunePerformance(self):
# Group by 'Commune' and calculate performance scores for each group
print(self.impact_gdf.columns)
print(self.impact_gdf.head())

scores_by_commune = self.impact_gdf.groupby(f'ADM{self.adminLevel}').apply(
- lambda x: self.calc_performance_scores(x['Impact'], x['Event'])
+ lambda x: self.calc_performance_scores(x[self.comparisonType], x['Event'])
)
scores_byCommune_gdf = self.gdf_shape.merge(scores_by_commune, on=f'ADM{cfg.adminLevel}')
scores_byCommune_gdf.to_file (f"{self.DataDir}/glofas_to_hydrodata/scores_byCommuneRP{self.RPyr:.1f}_yr_leadtime{self.leadtime:.0f}.shp")
scores_byCommune_gdf.to_file (f"{self.DataDir}/{comparisonType}/scores_byCommuneRP{self.RPyr:.1f}_yr_leadtime{self.leadtime:.0f}.shp")
return scores_byCommune_gdf

if __name__=='__main__':
for RPyr in cfg.RPsyr:
- hydro_impact_gdf = loop_over_stations (cfg.DNHstations , cfg.DataDir, RPyr, cfg.admPath)
+ hydro_impact_gdf = f'{cfg.DataDir}/Impact_from_hydro_RP_{RPyr}.gpkg'
+ #hydro_impact_gdf = loop_over_stations (cfg.DNHstations , cfg.DataDir, RPyr, cfg.admPath, cfg.adminLevel)
for leadtime in cfg.leadtimes:
floodProbability_path = cfg.DataDir / f"floodedRP{RPyr}yr_leadtime{leadtime}_ADM{cfg.adminLevel}.gpkg"
floodProbability_gdf = checkVectorFormat(floodProbability_path)
# Calculate the flood events
definer = FloodDefiner(cfg.adminLevel)
PredictedEvents_gdf = definer.EventMaker(floodProbability_gdf, cfg.actionLifetime, cfg.triggerProb)
#print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Données partagées - DNH Mali - 2019/Débit du Niger à Ansongo.csv"))
- analyzer = PredictedToImpactPerformanceAnalyzer(cfg.DataDir, RPyr, leadtime, hydro_impact_gdf, cfg.triggerProb, cfg.adminLevel, cfg.admPath, cfg.startYear, cfg.endYear, cfg.years, PredictedEvents_gdf)
+ analyzer = PredictedToImpactPerformanceAnalyzer(cfg.DataDir, RPyr, leadtime, hydro_impact_gdf, cfg.triggerProb, cfg.adminLevel, cfg.admPath, cfg.startYear, cfg.endYear, cfg.years, PredictedEvents_gdf, 'Observation')
analyzer.matchImpact_and_Trigger()
analyzer.calculateCommunePerformance()
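calc_performance_scores itself is not shown in this diff; the sketch below only illustrates the groupby/apply pattern used above, with an assumed hit/miss/false-alarm scorer standing in for the real one.

import pandas as pd

def calc_performance_scores(obs, pred):
    # Assumed stand-in: probability of detection and false-alarm ratio.
    hits = ((obs == 1) & (pred == 1)).sum()
    false_alarms = ((obs == 0) & (pred == 1)).sum()
    misses = ((obs == 1) & (pred == 0)).sum()
    pod = hits / (hits + misses) if (hits + misses) else float('nan')
    far = false_alarms / (hits + false_alarms) if (hits + false_alarms) else float('nan')
    return pd.Series({'pod': pod, 'far': far})

df = pd.DataFrame({'ADM2': ['A', 'A', 'B', 'B'],
                   'Observation': [1, 0, 1, 1],
                   'Event': [1, 1, 0, 1]})
scores = df.groupby('ADM2').apply(
    lambda x: calc_performance_scores(x['Observation'], x['Event']))
print(scores)  # one row of scores per commune, ready to merge on ADM2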

4 changes: 2 additions & 2 deletions comparison/HydroImpact.py
@@ -141,7 +141,7 @@ def createEvent(trigger_df):

# Create a temporary dataframe for the current event
temp_event_df = pd.DataFrame({
- 'Event': [Event],
+ 'Observation': [Event],
'Start Date': [StartDate],
'End Date': [final_endtime],
})
@@ -161,7 +161,7 @@ def createEvent(trigger_df):
else:
# Return an empty GeoDataFrame if no events were found
# Initialize an empty dataframe
- events_df = pd.DataFrame(columns=['Event', 'Start Date', 'End Date'])
+ events_df = pd.DataFrame(columns=['Observation', 'Start Date', 'End Date'])
return events_df
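A short illustration of why the empty frame keeps the renamed column: both branches of createEvent must return the same schema, so downstream code can select the 'Observation' column unconditionally. The data below are invented.

import pandas as pd

events_df = pd.DataFrame(columns=['Observation', 'Start Date', 'End Date'])
row = pd.DataFrame({'Observation': [1],
                    'Start Date': [pd.Timestamp('2010-07-01')],
                    'End Date': [pd.Timestamp('2010-07-15')]})
events_df = pd.concat([events_df, row], ignore_index=True)
print(events_df['Observation'].tolist())  # same access path either way: [1]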

def loop_over_stations(station_csv, DataDir, RP, admPath, adminLevel):
4 changes: 2 additions & 2 deletions comparison/pointMatching.py
@@ -141,7 +141,7 @@ def attributePoints_to_Polygon(
Column name in vector2 identifying the polygons.
crs : str, optional
Coordinate reference system for all data. Defaults to 'EPSG:4326'.
- buffer_distance : float, optional
+ border_tolerance : float, optional
Distance in meters to expand the polygons for including nearby points. Defaults to 5000 (5 km).
StationDataDir : str or Path, optional
Directory where the output CSV file will be saved. Default is the current working directory.
@@ -165,7 +165,7 @@

# Apply a buffer to the polygons
expanded_polygons_gdf = polygons_gdf.copy()
- expanded_polygons_gdf['geometry'] = expanded_polygons_gdf.geometry.buffer(buffer_distance)
+ expanded_polygons_gdf['geometry'] = expanded_polygons_gdf.geometry.buffer(border_tolerance)

# Initialize a new column in the polygons GeoDataFrame to store point IDs
polygons_gdf[f'{ID2}'] = None
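A minimal sketch of the border_tolerance idea with invented geometry: buffering the polygons lets a station just outside an admin boundary still be attributed to it. attributePoints_to_Polygon does more bookkeeping (ID columns, CSV export) than shown here.

import geopandas as gpd
from shapely.geometry import Point, Polygon

# A metric CRS, so buffer distances are in meters.
polygons = gpd.GeoDataFrame({'ADM2': ['A']},
                            geometry=[Polygon([(0, 0), (1000, 0), (1000, 1000), (0, 1000)])],
                            crs='EPSG:32630')
points = gpd.GeoDataFrame({'StationName': ['just_outside']},
                          geometry=[Point(1200, 500)], crs='EPSG:32630')

expanded = polygons.copy()
expanded['geometry'] = expanded.geometry.buffer(5000)  # border_tolerance
joined = gpd.sjoin(points, expanded, predicate='within')
print(joined[['StationName', 'ADM2']])  # the station is attributed to 'A'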
File renamed without changes.
