Commit

valentijn7 committed Dec 16, 2024
2 parents 5953904 + b78e552 commit 060a80b
Showing 3 changed files with 357 additions and 103 deletions.
3 changes: 2 additions & 1 deletion GloFAS/GloFAS_prep/configuration.py
@@ -23,7 +23,8 @@
settlements_tif = DataDir / "GlobalHumanSettlement/GHS_BUILT_S_E2030_GLOBE_R2023A_54009_100_V1_0.tif"

crs = 'EPSG:4326'
RPsyr = [1.5, 2.0, 5.0, 10.0] # return period threshold in years
percentiles = [95, 98, 99]
leadtimes = [72, 96, 120, 144, 168] # also add 24 hours
startYear = 2004 # must be 2016 when Google data is used, since none is available before then
endYear = 2023 # up to, but not including, 00:00 on 1 January of this year
160 changes: 111 additions & 49 deletions comparison/observation/HydroImpact.py
Expand Up @@ -4,6 +4,7 @@
from GloFAS.GloFAS_prep.text_formatter import remove_accents, capitalize
import scipy.stats as stats
from comparison.pointMatching import attributePoints_to_Polygon
from comparison.observation.thresholds import Q_Gumbel_fit_percentile, Q_Gumbel_fit_RP
import geopandas as gpd
import numpy as np
from pyextremes import EVA
@@ -15,6 +16,7 @@ def parse_date_with_fallback(date_str, year):
date = pd.to_datetime(f"{year} {date_str}", format="%Y %d/%m %H:%M")
return date
except ValueError:
# If the date is invalid (e.g., Feb 29 in a non-leap year), return None
#print(f"Invalid date skipped: {date_str} for year {year}")
return None
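# Illustration (not part of this commit): Feb 29 only parses in leap years,
# so the fallback silently drops it otherwise. Example values, not repository output:
#   parse_date_with_fallback("29/02 06:00", 2023)  -> None (2023 is not a leap year)
#   parse_date_with_fallback("29/02 06:00", 2024)  -> Timestamp('2024-02-29 06:00:00')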
@@ -31,38 +33,16 @@ def create_datetime(row):
return parse_date_with_fallback(row['Date'], row['Year'])

hydro_df_long['ParsedDate'] = hydro_df_long.apply(create_datetime, axis=1)

# Drop rows where ParsedDate is None (invalid dates)
hydro_df_long = hydro_df_long.dropna(subset=["ParsedDate"])

# Drop the now redundant 'Year' and 'Date' columns if not needed
hydro_df_long = hydro_df_long.drop(columns=["Year", "Date"])

# Rename ParsedDate back to Date
hydro_df_long.rename(columns={"ParsedDate": "Date"}, inplace=True)

# Display the reshaped DataFrame
return hydro_df_long

def z_RP_station(HydroStations_RP_file, StationName, RP):
'''Return the water-level (z) threshold for the given return period at a station.'''
HydroStations_RP_df = pd.read_csv(HydroStations_RP_file)

# Remove accents from StationName
StationName = remove_accents(StationName)

# Handle NaN values and ensure all are strings before applying the function
HydroStations_RP_df['StationName'] = HydroStations_RP_df['StationName'].fillna("").astype(str).apply(remove_accents)

# Filter or calculate QRP based on RP and StationName
z_RP = HydroStations_RP_df[HydroStations_RP_df['StationName'] == StationName][f'{RP}'].values

return z_RP



def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, value_col='Value', date_col='Date'):
"""
Adds a 'Trigger' column to the hydro_df DataFrame indicating whether `value_col` exceeds the discharge threshold for the given return period or percentile.
@@ -76,32 +56,55 @@
Returns:
- pd.DataFrame: Copy of the input DataFrame with an additional 'Trigger' column.
"""
# Assuming hydro_df is already loaded
if date_col in hydro_df.columns:
# Convert the date_col to datetime
hydro_df[date_col] = pd.to_datetime(hydro_df[date_col], format="%Y-%m-%d %H:%M", errors='coerce')

# Check for any invalid dates
if hydro_df[date_col].isnull().any():
print("Warning: Some dates in 'date_col' could not be parsed and are set to NaT.")

# Set the index to the datetime column
hydro_df.set_index(date_col, inplace=True)
#print(f"Index successfully set to datetime using '{date_col}':\n{hydro_df.index}")
else:
print(f"Error: Column '{date_col}' does not exist in the DataFrame, hopefully teh date is the index")

# Ensure "Value" column exists in the DataFrame
if "Value" not in hydro_df.columns:
raise ValueError("The input DataFrame must contain a 'Value' column.")
if value_col not in hydro_df.columns:
raise ValueError("The input DataFrame must contain a 'Value' column, please specify the name correctly")
#QRP = QRP_station(HydroStations_RP_file, StationName, RP)
Q_station = hydro_df["Value"]

if distributionType =='Gumbel':
QRP= QRP_fit (hydro_df, probability)
else: # assuming above 20 is percentile, RP is percentile instead
print ('applying a GEV, assuming your RP is in percentiles')
QRP = Q_GEV_fit (hydro_df, RP)
Q_station = hydro_df[value_col] # this is where it goes wrong!!!
if type_of_extremity == 'RP':
Q_prob= Q_Gumbel_fit_RP (hydro_df, probability, value_col=value_col) # results in infinite
elif type_of_extremity == 'percentile': # assuming above 20 is percentile, RP is percentile instead
Q_prob = Q_Gumbel_fit_percentile (hydro_df, percentile, value_col)
else:
print ('no such type of extremity')
#print (f"for {StationName} : return period Q= {QRP}")
if not isinstance(Q_prob, (int, float)):
raise ValueError(f"Expected Q_prob to be a scalar (int or float), but got {type(Q_prob)}.")

# Copy the DataFrame and add the 'trigger' column
hydro_trigger_df = hydro_df.copy()
hydro_trigger_df['Trigger'] = (hydro_trigger_df[value_col] > Q_prob).astype(int)
return hydro_trigger_df
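# Note (not part of this commit): the threshold helpers Q_Gumbel_fit_RP and
# Q_Gumbel_fit_percentile are imported from comparison.observation.thresholds,
# which is not shown in this diff. A minimal sketch of what a Gumbel-based
# threshold could look like (names and details are assumptions, not the
# repository's implementation), using scipy.stats.gumbel_r:
#
#   def Q_Gumbel_fit_RP_sketch(hydro_df, RP, value_col='Value'):
#       # fit a Gumbel distribution to the annual maxima (requires a datetime index)
#       annual_max = hydro_df[value_col].groupby(hydro_df.index.year).max().dropna()
#       loc, scale = stats.gumbel_r.fit(annual_max)
#       # discharge with an annual exceedance probability of 1/RP
#       return float(stats.gumbel_r.ppf(1 - 1 / RP, loc=loc, scale=scale))
#
#   def Q_Gumbel_fit_percentile_sketch(hydro_df, percentile, value_col='Value'):
#       # percentile of the fitted distribution over the full series
#       loc, scale = stats.gumbel_r.fit(hydro_df[value_col].dropna())
#       return float(stats.gumbel_r.ppf(percentile / 100.0, loc=loc, scale=scale))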

def createEvent(trigger_df, date_col='Date'):
# Sort the dataframe chronologically (by the date column, or by the index if the dates are the index),
# so that consecutive triggered rows can be grouped into events
if date_col in trigger_df.columns:
trigger_df = trigger_df.sort_values(by=[date_col]).reset_index(drop=True)
else:
try:
trigger_df = trigger_df.sort_index()
except Exception:
raise ValueError(f"'{date_col}' is not a column of trigger_df and the index is not sortable")



# Prepare commune info with geometry for event creation
event_data = []
@@ -112,18 +115,29 @@
r = 0
while r < len(trigger_df):
row = trigger_df.iloc[r]

if processed_rows[r]:
# If the row is already processed, continue to the next row
r += 1
continue

# Checking if the current row and next row are both part of an event (Trigger == 1)
# and the trigger happens in the same administrative unit
elif row['Trigger'] == 1 and r + 1 < len(trigger_df) and trigger_df.iloc[r + 1]['Trigger'] == 1:
Event = 1
StartDate = row[date_col] if date_col in trigger_df.columns else trigger_df.index[r]

# Continue until the end of the event where 'Trigger' is no longer 1
while r < len(trigger_df) and trigger_df.iloc[r]['Trigger'] == 1:
# Check whether the date is stored in a column or in the index
if date_col in trigger_df.columns:
possible_endtime = trigger_df.iloc[r][date_col]
else: # assume the dates are the index
try:
possible_endtime = trigger_df.index[r]
except Exception:
raise ValueError(f"'{date_col}' is not a column and the index does not hold valid dates")

processed_rows[r] = True # Mark row as processed
row = trigger_df.iloc[r]
r += 1
@@ -158,8 +172,8 @@ def createEvent(trigger_df):
events_df = pd.DataFrame(columns=['Observation', 'Start Date', 'End Date'])
return events_df
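# Illustrative usage (not part of this commit); the station name and values are
# made up, and the Gumbel fit on such a short series is for demonstration only:
#
#   toy_df = pd.DataFrame({
#       'Date': pd.date_range('2020-08-01', periods=6, freq='D').strftime("%Y-%m-%d %H:%M"),
#       'Value': [100.0, 450.0, 480.0, 470.0, 120.0, 110.0],
#   })
#   trigger_df = stampHydroTrigger(toy_df, 'SomeStation', 'RP', 5.0)
#   events_df = createEvent(trigger_df)
#   # events_df holds columns like 'Observation', 'Start Date', 'End Date', with one
#   # event spanning the consecutive days on which 'Value' exceeds the threshold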

def loop_over_stations_obs(station_csv, DataDir, type_of_extremity, probability, value_col='Value'):
probability = float(probability)
station_df = pd.read_csv (station_csv, header=0)
#print (station_df.columns)
hydrodir = rf"{DataDir}/DNHMali_2019/Q_stations"
@@ -169,20 +183,54 @@
BasinName= stationrow['Basin']
stationPath = rf"{hydrodir}/{BasinName}_{StationName}.csv"
try:
print(f'calculating {StationName}, in {BasinName}')
hydro_df = transform_hydro(stationPath)
except Exception:
# print (f'no discharge measures found for station {StationName} in {BasinName}')
continue

trigger_df = stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, value_col)
event_df = createEvent (trigger_df)
event_df ['StationName'] = StationName
all_events.append (event_df)

# concatenate events from all stations; the result is later merged with the admin units the station points fall in
all_events_df = pd.concat (all_events, ignore_index=True)
all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_RP_{RP}yr.csv")
if type_of_extremity =='RP':
all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_RP_{probability}yr.csv")
elif type_of_extremity =='percentile':
all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_percentile{probability}.csv")
return all_events_df

def loop_over_stations_pred(station_csv, stationsDir, probability, type_of_extremity, value_col, leadtime):
probability = float(probability)
station_df = pd.read_csv (station_csv, header=0)
all_events = []
for _, stationrow in station_df.iterrows():
StationName = remove_accents(stationrow['StationName'])
BasinName= stationrow['Basin']

glofas_csv = rf"{stationsDir}/GloFAS_Q/timeseries/discharge_timeseries_{StationName}_{leadtime}.csv"
try:
glofas_df = pd.read_csv(glofas_csv,
header=0,
index_col=0, # setting ValidTime as index column
parse_dates=True)
except Exception:
print (f'no discharge series for station {StationName}')
continue

trigger_df = stampHydroTrigger (glofas_df, StationName, type_of_extremity, probability, value_col)
event_df = createEvent (trigger_df)
event_df ['StationName'] = StationName
all_events.append (event_df)

# concatenate events from all stations; the result is later merged with the admin units the station points fall in
all_events_df = pd.concat (all_events, ignore_index=True)
if type_of_extremity =='RP':
all_events_df.to_csv (f"{stationsDir}/GloFAS_Q/GloFASstation_flood_events_RP{probability}yr_leadtime{leadtime}.csv")
elif type_of_extremity =='percentile':
all_events_df.to_csv (f"{stationsDir}/GloFAS_Q/GloFASstation_flood_events_percentile{probability}_leadtime{leadtime}.csv")
return all_events_df

def events_per_adm (DataDir, admPath, adminLevel, station_csv, StationDataDir, all_events_df, model, RP):
@@ -208,7 +256,21 @@


if __name__=="__main__":

station_df = pd.read_csv (cfg.DNHstations, header=0)
#print (station_df.columns)
date_col = 'ValidTime'
value_col = 'percentile_40.0' # the 40th ensemble percentile, i.e. a discharge that 60% of the ensemble members exceed
all_events = []
for leadtime in cfg.leadtimes:
for RP in cfg.RPsyr:
loop_over_stations_obs (cfg.DNHstations, cfg.DataDir, 'RP', RP, value_col='Value')
#loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, RP, 'RP', value_col, leadtime)
for percentile in cfg.percentiles:
loop_over_stations_obs (cfg.DNHstations, cfg.DataDir, 'percentile', percentile, value_col='Value')
#loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, percentile, 'percentile', value_col, leadtime)

# # running this script for the GloFAS
# #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv"))
# event_df = loop_over_stations (cfg.DNHstations, cfg.DataDir, 1)
# event_gdf = events_per_adm (cfg.DataDir, cfg.admPath, cfg.adminLevel, cfg.DNHstations, cfg.stationsDir, event_df, 'Observation', 1)