From 6b4ee9b29ef877004ecb0d5e1fb6dec644e27e97 Mon Sep 17 00:00:00 2001 From: ERKuipers Date: Fri, 22 Nov 2024 15:54:52 +0100 Subject: [PATCH 1/2] added coulance for border consideration and capitalize --- .../GloFAS_analysis/performance_calculator.py | 5 +- GloFAS/GloFAS_prep/accent_remover.py | 7 -- GloFAS/GloFAS_prep/text_formatter.py | 14 ++++ comparison/HydroImpact.py | 23 ++++--- .../journal_paper_list_of_figures_tables.md | 0 comparison/pointMatching.py | 65 ++++++++++--------- 6 files changed, 64 insertions(+), 50 deletions(-) delete mode 100644 GloFAS/GloFAS_prep/accent_remover.py create mode 100644 GloFAS/GloFAS_prep/text_formatter.py create mode 100644 comparison/journal_paper_list_of_figures_tables.md diff --git a/GloFAS/GloFAS_analysis/performance_calculator.py b/GloFAS/GloFAS_analysis/performance_calculator.py index bf04eae..77369f5 100644 --- a/GloFAS/GloFAS_analysis/performance_calculator.py +++ b/GloFAS/GloFAS_analysis/performance_calculator.py @@ -19,7 +19,6 @@ def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel, should make this a csv possiblity triggerProb (float): Probability threshold to classify flood triggers. """ - self.triggerProb = triggerProb self.adminLevel = adminLevel self.gdf_shape = gpd.read_file(adminPath) @@ -31,7 +30,7 @@ def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel, self.years = years self.impactData = impactData self.PredictedEvents_gdf = PredictedEvents_gdf - if isinstance (self.impactData, (str, Path)): + if isinstance (self.impactData, (str)): self.impact_gdf = self.openObservedImpact_gdf() elif isinstance (self.impactData, gpd.GeoDataFrame): self.impact_gdf = self.impactData @@ -187,6 +186,8 @@ def calculateCommunePerformance(self): Calculate the performance scores for each commune and merge them back into the GeoDataFrame. """ # Group by 'Commune' and calculate performance scores for each group + print (self.impact_gdf.columns) + print (self.impact_gdf.head) scores_by_commune = self.impact_gdf.groupby(f'ADM{self.adminLevel}').apply( lambda x: self.calc_performance_scores(x['Impact'], x['Event']) ) diff --git a/GloFAS/GloFAS_prep/accent_remover.py b/GloFAS/GloFAS_prep/accent_remover.py deleted file mode 100644 index a216f80..0000000 --- a/GloFAS/GloFAS_prep/accent_remover.py +++ /dev/null @@ -1,7 +0,0 @@ -import unicodedata -def remove_accents(input_str): - """Remove accents from a given string.""" - if isinstance(input_str, str): # Only process strings - nfkd_form = unicodedata.normalize('NFKD', input_str) - return "".join([c for c in nfkd_form if not unicodedata.combining(c)]) - return input_str # Return as is if not a string \ No newline at end of file diff --git a/GloFAS/GloFAS_prep/text_formatter.py b/GloFAS/GloFAS_prep/text_formatter.py new file mode 100644 index 0000000..2a44220 --- /dev/null +++ b/GloFAS/GloFAS_prep/text_formatter.py @@ -0,0 +1,14 @@ +import unicodedata +def remove_accents(input_str): + """Remove accents from a given string.""" + if isinstance(input_str, str): # Only process strings + nfkd_form = unicodedata.normalize('NFKD', input_str) + return "".join([c for c in nfkd_form if not unicodedata.combining(c)]) + return input_str # Return as is if not a string + +def capitalize(input_str): + """Remove accents and capitalize the input string.""" + if isinstance(input_str, str): # Only process strings + input_str = remove_accents(input_str) # Remove accents first + return input_str.upper() # Capitalize (convert to uppercase) + return input_str diff --git a/comparison/HydroImpact.py b/comparison/HydroImpact.py index ddfb6b7..1752b46 100644 --- a/comparison/HydroImpact.py +++ b/comparison/HydroImpact.py @@ -1,10 +1,10 @@ import GloFAS.GloFAS_prep.configuration as cfg import pandas as pd -from GloFAS.GloFAS_prep.accent_remover import remove_accents +from GloFAS.GloFAS_prep.text_formatter import remove_accents, capitalize import scipy.stats as stats from comparison.pointMatching import attributePoints_to_Polygon - +import geopandas as gpd def parse_date_with_fallback(date_str, year): try: # Try to parse the date with the given year @@ -64,7 +64,6 @@ def QRP_fit (hydro_df, RP): # 2. Fit a Gumbel distribution to the annual maximum discharge values loc, scale = stats.gumbel_r.fit(annual_max_discharge) - print (RP) # 4. Calculate the discharge value corresponding to the return period discharge_value = stats.gumbel_r.ppf(1 - 1/RP, loc, scale) return discharge_value @@ -143,8 +142,8 @@ def createEvent(trigger_df): # Create a temporary dataframe for the current event temp_event_df = pd.DataFrame({ 'Event': [Event], - 'StartDate': [StartDate], - 'EndDate': [final_endtime], + 'Start Date': [StartDate], + 'End Date': [final_endtime], }) # Append the event to the list of events @@ -165,7 +164,7 @@ def createEvent(trigger_df): events_df = pd.DataFrame(columns=['Event', 'Start Date', 'End Date']) return events_df -def loop_over_stations(station_csv, DataDir, RP, admPath): +def loop_over_stations(station_csv, DataDir, RP, admPath, adminLevel): RP = float(RP) station_df = pd.read_csv (station_csv, header=0) #print (station_df.columns) @@ -189,11 +188,10 @@ def loop_over_stations(station_csv, DataDir, RP, admPath): #generate the gdf to merge with where the points are attributed to the respective administrative units - all_events_df = pd.concat (all_events, ignore_index=True) - print (all_events_df.columns) + gdf_pointPolygon = attributePoints_to_Polygon (admPath, station_csv, 'StationName') - print (gdf_pointPolygon.columns) + gdf_pointPolygon.rename(columns={f'ADM{adminLevel}_FR':f'ADM{adminLevel}'}, inplace=True) gdf_melt = gdf_pointPolygon.melt( id_vars=gdf_pointPolygon.columns.difference(['StationName', 'StationName_0', 'StationName_1', 'StationName_2']), value_vars=['StationName', 'StationName_0', 'StationName_1', 'StationName_2'], @@ -204,12 +202,13 @@ def loop_over_stations(station_csv, DataDir, RP, admPath): # Proceed with the merge hydro_events_df = pd.merge(gdf_melt, all_events_df, left_on='StationName_Merged', right_on='StationName', how='inner') - - hydro_events_gdf = gpd.GeoDataFrame(hydro_events_df, geometry='geometry') + hydro_events_df [f'ADM{adminLevel}'] = hydro_events_df [f'ADM{adminLevel}'].apply(capitalize) + hydro_events_gdf = gpd.GeoDataFrame(hydro_events_df, geometry='geometry') + hydro_events_gdf.to_file(f"{DataDir}/Impact_from_hydro_RP_{RP}.gpkg") return hydro_events_gdf if __name__=="__main__": #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv")) - event_gdf = loop_over_stations (cfg.DNHstations, cfg.DataDir, 5, cfg.admPath) + event_gdf = loop_over_stations (cfg.DNHstations, cfg.DataDir, 5, cfg.admPath, cfg.adminLevel) print (event_gdf) \ No newline at end of file diff --git a/comparison/journal_paper_list_of_figures_tables.md b/comparison/journal_paper_list_of_figures_tables.md new file mode 100644 index 0000000..e69de29 diff --git a/comparison/pointMatching.py b/comparison/pointMatching.py index 115361f..607de6d 100644 --- a/comparison/pointMatching.py +++ b/comparison/pointMatching.py @@ -6,7 +6,7 @@ from scipy.spatial import cKDTree from GloFAS.GloFAS_prep.vectorCheck import checkVectorFormat import GloFAS.GloFAS_prep.configuration as cfg -from GloFAS.GloFAS_prep.accent_remover import remove_accents +from GloFAS.GloFAS_prep.text_formatter import remove_accents def findclosestpoint(point_x, point_y, target_gdf): ''' @@ -32,8 +32,6 @@ def findclosestpoint(point_x, point_y, target_gdf): return target_gdf.iloc[idx[0]] # Return row of the closest point - - def matchPoints( vector_original, ID1, @@ -120,16 +118,17 @@ def matchPoints( return match_df def attributePoints_to_Polygon( - vectorPolygon, - vectorPoint, - ID2, - crs='EPSG:4326', - StationDataDir=Path.cwd(), - filename='attributePoints_to_Polygon.csv' - ): + vectorPolygon, + vectorPoint, + ID2, + crs='EPSG:4326', + border_tolerance=5000, # Buffer distance in meters + StationDataDir=Path.cwd(), + filename='attributePoints_to_Polygon.csv' + ): ''' Assigns points from vector_original to polygons in vector2, adding a new column in vector2 that contains the IDs of all points - from vector_original that fall within each polygon. + from vector_original that fall within each polygon or within a specified buffer distance. Parameters: vectorPolygon : str or GeoDataFrame @@ -142,12 +141,14 @@ def attributePoints_to_Polygon( Column name in vector2 identifying the polygons. crs : str, optional Coordinate reference system for all data. Defaults to 'EPSG:4326'. + buffer_distance : float, optional + Distance in meters to expand the polygons for including nearby points. Defaults to 5000 (5 km). StationDataDir : str or Path, optional Directory where the output CSV file will be saved. Default is the current working directory. Writes: CSV - A CSV file containing the polygon ID and the IDs of points within each polygon. + A CSV file containing the polygon ID and the IDs of points within or near each polygon. Returns: GeoDataFrame @@ -162,25 +163,29 @@ def attributePoints_to_Polygon( if points_gdf.crs != polygons_gdf.crs: points_gdf = points_gdf.to_crs(polygons_gdf.crs) + # Apply a buffer to the polygons + expanded_polygons_gdf = polygons_gdf.copy() + expanded_polygons_gdf['geometry'] = expanded_polygons_gdf.geometry.buffer(buffer_distance) + # Initialize a new column in the polygons GeoDataFrame to store point IDs polygons_gdf[f'{ID2}'] = None # Loop through each polygon to find points within it - for idx, polygon_row in polygons_gdf.iterrows(): - # Get the geometry of the current polygon - polygon_geom = polygon_row.geometry + for idx, polygon_row in expanded_polygons_gdf.iterrows(): + # Get the geometry of the current buffered polygon + expanded_polygon_geom = polygon_row.geometry - # Find points that fall within this polygon - points_within = points_gdf[points_gdf.geometry.within(polygon_geom)] + # Find points that fall within this expanded polygon + points_within = points_gdf[points_gdf.geometry.within(expanded_polygon_geom)] # Collect the IDs of these points point_ids = points_within[ID2].tolist() - # Update the GeoDataFrame with the list of point IDs - nrstations_inpolygon = len (point_ids) - if nrstations_inpolygon>0: + # Update the original polygons GeoDataFrame with the list of point IDs + nrstations_inpolygon = len(point_ids) + if nrstations_inpolygon > 0: polygons_gdf.at[idx, f'{ID2}'] = remove_accents(point_ids[0]) - if nrstations_inpolygon>1: + if nrstations_inpolygon > 1: for stationnr in range(nrstations_inpolygon): polygons_gdf.at[idx, f'{ID2}_{stationnr}'] = remove_accents(point_ids[stationnr-1]) @@ -189,14 +194,16 @@ def attributePoints_to_Polygon( polygons_gdf.drop(columns='geometry').to_csv(output_file, index=False) return polygons_gdf + if __name__ == '__main__': - result = attributePoints_to_Polygon( - cfg.admPath, - cfg.DNHstations, - 'StationName', - crs=cfg.crs, - StationDataDir=cfg.stationsDir, - filename='DNHstations_in_ADM2.csv' - ) + result = attributePoints_to_Polygon( + cfg.admPath, + cfg.DNHstations, + 'StationName', + crs=cfg.crs, + border_tolerance=5000, # to tolerate the bolder + StationDataDir=cfg.stationsDir, + filename='DNHstations_in_ADM2.csv' + ) # matchPoints(cfg.GloFASstations, 'StationName', cfg.googlestations, 'gaugeId', crs=cfg.crs, StationDataDir=cfg.stationsDir) \ No newline at end of file From e2f9e5c9e2f94f7bfdd3fa100370a9da18be72aa Mon Sep 17 00:00:00 2001 From: ERKuipers Date: Fri, 22 Nov 2024 17:07:43 +0100 Subject: [PATCH 2/2] something goes wrong in performance calculator for observation comparison : no string values in adm2 --- .../GloFAS_analysis/performance_calculator.py | 22 +++++++++++++------ comparison/HydroImpact.py | 4 ++-- comparison/pointMatching.py | 4 ++-- .../journal_paper_list_of_figures_tables.md | 0 comparison/{ => visualization}/plot.py | 0 5 files changed, 19 insertions(+), 11 deletions(-) rename comparison/{ => visualization}/journal_paper_list_of_figures_tables.md (100%) rename comparison/{ => visualization}/plot.py (100%) diff --git a/GloFAS/GloFAS_analysis/performance_calculator.py b/GloFAS/GloFAS_analysis/performance_calculator.py index 77369f5..515601c 100644 --- a/GloFAS/GloFAS_analysis/performance_calculator.py +++ b/GloFAS/GloFAS_analysis/performance_calculator.py @@ -9,7 +9,7 @@ from GloFAS.GloFAS_prep.vectorCheck import checkVectorFormat import GloFAS.GloFAS_prep.configuration as cfg class PredictedToImpactPerformanceAnalyzer: - def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel, adminPath, startYear, endYear, years, PredictedEvents_gdf): + def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel, adminPath, startYear, endYear, years, PredictedEvents_gdf, comparisonType): """ Initialize the FloodPerformanceAnalyzer class with the required data. @@ -30,6 +30,7 @@ def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel, self.years = years self.impactData = impactData self.PredictedEvents_gdf = PredictedEvents_gdf + self.comparisonType = comparisonType if isinstance (self.impactData, (str)): self.impact_gdf = self.openObservedImpact_gdf() elif isinstance (self.impactData, gpd.GeoDataFrame): @@ -39,7 +40,10 @@ def __init__(self, DataDir, RPyr, leadtime, impactData, triggerProb, adminLevel, def openObservedImpact_gdf(self): # Load the data - df = pd.read_csv(self.impactDataPath) + if self.impactData.endswith('.csv'): + df = pd.read_csv(self.impactData) + else: + df = gpd.read_file(self.impactData) # Convert 'End Date' and 'Start Date' to datetime df['End Date'] = pd.to_datetime(df['End Date'], format='%d/%m/%Y', errors='coerce') @@ -47,7 +51,7 @@ def openObservedImpact_gdf(self): # Filter rows between 2004 and 2022 () df_filtered = df[(df['End Date'].dt.year >= self.startYear) & (df['End Date'].dt.year < self.endYear)] - + # Remove non-string entries from ADM columns df_filtered = df_filtered[df_filtered[f'ADM{self.adminLevel}'].apply(lambda x: isinstance(x, str))] self.gdf_shape = self.gdf_shape[self.gdf_shape[f'ADM{self.adminLevel}_FR'].apply(lambda x: isinstance(x, str))] @@ -56,6 +60,7 @@ def openObservedImpact_gdf(self): self.gdf_shape.rename(columns={f'ADM{cfg.adminLevel}_FR':f'ADM{cfg.adminLevel}'}, inplace=True) self.gdf_shape[f'ADM{cfg.adminLevel}'] = self.gdf_shape[f'ADM{cfg.adminLevel}'].apply(lambda x: unidecode.unidecode(x).upper()) # Apply normalization to both DataFrames (converting to uppercase and removing special characters) + df_filtered[f'ADM{self.adminLevel}'] = df_filtered[f'ADM{self.adminLevel}'].apply(lambda x: unidecode.unidecode(x).upper()) # Merge the CSV data with the shapefile data @@ -135,6 +140,7 @@ def _check_impact(self, PredictedEvents_gdf, commune, startdate): (PredictedEvents_gdf['EndValidTime'] >= startdate) & (PredictedEvents_gdf['Event']==1) ] + print (match) return 1 if not match.empty else 0 @@ -188,16 +194,18 @@ def calculateCommunePerformance(self): # Group by 'Commune' and calculate performance scores for each group print (self.impact_gdf.columns) print (self.impact_gdf.head) + scores_by_commune = self.impact_gdf.groupby(f'ADM{self.adminLevel}').apply( - lambda x: self.calc_performance_scores(x['Impact'], x['Event']) + lambda x: self.calc_performance_scores(x[f'{self.comparisonType}'], x['Event']) ) scores_byCommune_gdf = self.gdf_shape.merge(scores_by_commune, on=f'ADM{cfg.adminLevel}') - scores_byCommune_gdf.to_file (f"{self.DataDir}/glofas_to_hydrodata/scores_byCommuneRP{self.RPyr:.1f}_yr_leadtime{self.leadtime:.0f}.shp") + scores_byCommune_gdf.to_file (f"{self.DataDir}/{comparisonType}/scores_byCommuneRP{self.RPyr:.1f}_yr_leadtime{self.leadtime:.0f}.shp") return scores_byCommune_gdf if __name__=='__main__': for RPyr in cfg.RPsyr: - hydro_impact_gdf = loop_over_stations (cfg.DNHstations , cfg.DataDir, RPyr, cfg.admPath) + hydro_impact_gdf = f'{cfg.DataDir}/Impact_from_hydro_RP_{RPyr}.gpkg' + #hydro_impact_gdf = loop_over_stations (cfg.DNHstations , cfg.DataDir, RPyr, cfg.admPath, cfg.adminLevel) for leadtime in cfg.leadtimes: floodProbability_path = cfg.DataDir/ f"floodedRP{RPyr}yr_leadtime{leadtime}_ADM{cfg.adminLevel}.gpkg" floodProbability_gdf = checkVectorFormat (floodProbability_path) @@ -205,7 +213,7 @@ def calculateCommunePerformance(self): definer = FloodDefiner (cfg.adminLevel) PredictedEvents_gdf = definer.EventMaker (floodProbability_gdf, cfg.actionLifetime, cfg.triggerProb) #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv")) - analyzer = PredictedToImpactPerformanceAnalyzer(cfg.DataDir, RPyr, leadtime, hydro_impact_gdf, cfg.triggerProb, cfg.adminLevel, cfg.admPath, cfg.startYear, cfg.endYear, cfg.years, PredictedEvents_gdf) + analyzer = PredictedToImpactPerformanceAnalyzer(cfg.DataDir, RPyr, leadtime, hydro_impact_gdf, cfg.triggerProb, cfg.adminLevel, cfg.admPath, cfg.startYear, cfg.endYear, cfg.years, PredictedEvents_gdf, 'Observation') analyzer.matchImpact_and_Trigger() analyzer.calculateCommunePerformance() diff --git a/comparison/HydroImpact.py b/comparison/HydroImpact.py index 1752b46..d854002 100644 --- a/comparison/HydroImpact.py +++ b/comparison/HydroImpact.py @@ -141,7 +141,7 @@ def createEvent(trigger_df): # Create a temporary dataframe for the current event temp_event_df = pd.DataFrame({ - 'Event': [Event], + 'Observation': [Event], 'Start Date': [StartDate], 'End Date': [final_endtime], }) @@ -161,7 +161,7 @@ def createEvent(trigger_df): else: # Return an empty GeoDataFrame if no events were found # Initialize an empty dataframe - events_df = pd.DataFrame(columns=['Event', 'Start Date', 'End Date']) + events_df = pd.DataFrame(columns=['Observation', 'Start Date', 'End Date']) return events_df def loop_over_stations(station_csv, DataDir, RP, admPath, adminLevel): diff --git a/comparison/pointMatching.py b/comparison/pointMatching.py index 607de6d..988dbea 100644 --- a/comparison/pointMatching.py +++ b/comparison/pointMatching.py @@ -141,7 +141,7 @@ def attributePoints_to_Polygon( Column name in vector2 identifying the polygons. crs : str, optional Coordinate reference system for all data. Defaults to 'EPSG:4326'. - buffer_distance : float, optional + border_tolerance : float, optional Distance in meters to expand the polygons for including nearby points. Defaults to 5000 (5 km). StationDataDir : str or Path, optional Directory where the output CSV file will be saved. Default is the current working directory. @@ -165,7 +165,7 @@ def attributePoints_to_Polygon( # Apply a buffer to the polygons expanded_polygons_gdf = polygons_gdf.copy() - expanded_polygons_gdf['geometry'] = expanded_polygons_gdf.geometry.buffer(buffer_distance) + expanded_polygons_gdf['geometry'] = expanded_polygons_gdf.geometry.buffer(border_tolerance) # Initialize a new column in the polygons GeoDataFrame to store point IDs polygons_gdf[f'{ID2}'] = None diff --git a/comparison/journal_paper_list_of_figures_tables.md b/comparison/visualization/journal_paper_list_of_figures_tables.md similarity index 100% rename from comparison/journal_paper_list_of_figures_tables.md rename to comparison/visualization/journal_paper_list_of_figures_tables.md diff --git a/comparison/plot.py b/comparison/visualization/plot.py similarity index 100% rename from comparison/plot.py rename to comparison/visualization/plot.py