From 6adb870b4a6fe490afe70015b94a89da9852028a Mon Sep 17 00:00:00 2001 From: ERKuipers Date: Fri, 13 Dec 2024 16:09:21 +0100 Subject: [PATCH 1/3] new thresholds: need to be adjusted, i actually really dont have time to calculate the thresholds --- comparison/observation/HydroImpact.py | 4 +- comparison/observation/thresholds.py | 78 ++++++++++++++++++--------- 2 files changed, 55 insertions(+), 27 deletions(-) diff --git a/comparison/observation/HydroImpact.py b/comparison/observation/HydroImpact.py index 4d94091..d08876a 100644 --- a/comparison/observation/HydroImpact.py +++ b/comparison/observation/HydroImpact.py @@ -59,9 +59,7 @@ def z_RP_station(HydroStations_RP_file, StationName, RP): z_RP = HydroStations_RP_df[HydroStations_RP_df['StationName'] == StationName][f'{RP}'].values return z_RP - - - + def stampHydroTrigger(hydro_df, StationName, temporality_of_extremity, probability, distributionType): """ Adds a 'trigger' column to the hydro_df DataFrame indicating whether the 'Value' exceeds the QRP threshold. diff --git a/comparison/observation/thresholds.py b/comparison/observation/thresholds.py index bd0df10..ccff81c 100644 --- a/comparison/observation/thresholds.py +++ b/comparison/observation/thresholds.py @@ -6,7 +6,7 @@ def csv_to_cleaned_series (csv): df = pd.read_csv( - Q_Bamako_csv, + csv, index_col=0, # setting ValidTime as index column parse_dates=True, # parsing dates in ValidTime ) @@ -15,16 +15,16 @@ def csv_to_cleaned_series (csv): series = df['percentile_40.0'] series = series.loc[series.index < pd.to_datetime('2023-01-01')] series = ( - series - .sort_index(ascending=True) - .astype(float) - .dropna() - ) + series + .sort_index(ascending=True) + .astype(float) + .dropna() + ) return series def Q_Gumbel_fit_RP (hydro_df, RP): # Extract the annual maximum discharge values - hydro_df['Year'] = hydro_df['Date'].dt.year + hydro_df['Year'] = hydro_df['ValidTime'].dt.year annual_max_discharge = hydro_df.groupby('Year')['Value'].max() # Fit a Gumbel distribution to the annual maximum discharge values @@ -33,13 +33,40 @@ def Q_Gumbel_fit_RP (hydro_df, RP): discharge_value = stats.gumbel_r.ppf(1 - 1/RP, loc, scale) return discharge_value -def Q_Gumbel_fit_percentile (hydro_df, percentile): - # now for - return discharge_value +def Q_Gumbel_fit_percentile(hydro_df, percentile): + # Extract the annual maximum discharge values + hydro_df['Year'] = hydro_df['ValidTime'].dt.year + annual_max_discharge = hydro_df.groupby('Year')['Value'].max() -def Q_GEV_fit_RP (hydro_df, RP): + # Fit a Gumbel distribution to the annual maximum discharge values + loc, scale = stats.gumbel_r.fit(annual_max_discharge) + + # Convert the percentile to a probability (e.g., 95th percentile = 0.95) + probability = percentile / 100.0 + probplot(annual_max_discharge, dist="genextreme", sparams=(shape, loc, scale), plot=plt) + plt.show() + # Calculate the discharge value corresponding to the percentile + discharge_value = stats.gumbel_r.ppf(probability, loc, scale) return discharge_value +def Q_GEV_fit_RP (series): + model = EVA(series) + model.get_extremes( + method='BM', # Block Maxima method + block_size="365D", # One year per block + ) + + #model.plot_extremes() + gev_fit = model.fit_model() + model.plot_diagnostic(alpha=0.95) + summary = model.get_summary( + return_period=[1, 1.5, 2, 5, 10, 25, 50, 100, 250, 500, 1000], + alpha=0.95, # confidence interval + n_samples=1000, + ) + + return summary + def Q_GEV_fit_percentile (hydro_df, percentile): """ Fits a GEV distribution to the daily discharge values and 
calculates the discharge @@ -53,31 +80,30 @@ def Q_GEV_fit_percentile (hydro_df, percentile): Returns: float: The discharge value corresponding to the given percentile. """ - # Ensure 'Date' column is a datetime object - hydro_df['Date'] = pd.to_datetime(hydro_df['Date']) - - # Extract daily discharge values - daily_discharge = hydro_df['Value'] - # Remove non-finite values - daily_discharge_cleaned = daily_discharge[np.isfinite(daily_discharge)] - + hydro_df['Year'] = hydro_df['ValidTime'].dt.year + annual_max_discharge = hydro_df.groupby('Year')['Value'].max() # Check if there are still issues - if daily_discharge_cleaned.empty: + if series.empty: raise ValueError("All data was non-finite after cleaning. Please check your dataset.") # Fit a GEV distribution - shape, loc, scale = genextreme.fit(daily_discharge_cleaned) - - + shape, loc, scale = genextreme.fit(series) # Calculate the discharge value corresponding to the specified percentile discharge_value = genextreme.ppf(percentile / 100, shape, loc=loc, scale=scale) - return discharge_value if __name__ == '__main__': station = 'Bamako' leadtime = 168 Q_Bamako_csv = f"{cfg.stationsDir}/GloFAS_Q/timeseries/discharge_timeseries_{station}_{leadtime}.csv" + df = pd.read_csv( + Q_Bamako_csv, + index_col=0, # setting ValidTime as index column + parse_dates=True, # parsing dates in ValidTime + ) + + df['percentile_40.0'] = df['percentile_40.0'].interpolate(method='time') + series = csv_to_cleaned_series(Q_Bamako_csv) model = EVA(series) model.get_extremes( @@ -93,5 +119,9 @@ def Q_GEV_fit_percentile (hydro_df, percentile): alpha=0.95, # confidence interval n_samples=1000, ) + print (summary) + percentile95_GEV = Q_GEV_fit_percentile(df, 95) + percentile95_Gumbel = Q_Gumbel_fit_percentile (df, 95) + print (percentile95) From 33600f6d383fbbaea389ddc138af0d764ec8dcf8 Mon Sep 17 00:00:00 2001 From: ERKuipers Date: Mon, 16 Dec 2024 12:40:17 +0100 Subject: [PATCH 2/3] hydroimpact for glfoas --- GloFAS/GloFAS_prep/configuration.py | 3 +- comparison/observation/HydroImpact.py | 104 +++++---- comparison/observation/thresholds.py | 307 +++++++++++++++++++------- 3 files changed, 296 insertions(+), 118 deletions(-) diff --git a/GloFAS/GloFAS_prep/configuration.py b/GloFAS/GloFAS_prep/configuration.py index 4b68f95..b28ebc3 100644 --- a/GloFAS/GloFAS_prep/configuration.py +++ b/GloFAS/GloFAS_prep/configuration.py @@ -23,7 +23,8 @@ settlements_tif = DataDir / "GlobalHumanSettlement/GHS_BUILT_S_E2030_GLOBE_R2023A_54009_100_V1_0.tif" crs = f'EPSG:4326' -RPsyr = [1.5, 2.0, 5.0, 10.0] # return period threshold in years +RPsyr = [1.5, 2.0, 5.0, 10.0] # return period threshold in years +percentiles = [95, 98, 99] leadtimes = [72, 96, 120,144, 168] # add also 24hours startYear = 2004 # could be 2004 but needs to be 2016 since there is no google data available before endYear = 2023 # 00:00 1st of january of that year, so up to but not including diff --git a/comparison/observation/HydroImpact.py b/comparison/observation/HydroImpact.py index d08876a..3758b34 100644 --- a/comparison/observation/HydroImpact.py +++ b/comparison/observation/HydroImpact.py @@ -4,6 +4,7 @@ from GloFAS.GloFAS_prep.text_formatter import remove_accents, capitalize import scipy.stats as stats from comparison.pointMatching import attributePoints_to_Polygon +from comparison.observation.thresholds import Q_Gumbel_fit_percentile, Q_Gumbel_fit_RP import geopandas as gpd import numpy as np from pyextremes import EVA @@ -31,36 +32,16 @@ def create_datetime(row): return 
parse_date_with_fallback(row['Date'], row['Year']) hydro_df_long['ParsedDate'] = hydro_df_long.apply(create_datetime, axis=1) - # Drop rows where ParsedDate is None (invalid dates) hydro_df_long = hydro_df_long.dropna(subset=["ParsedDate"]) - # Drop the now redundant 'Year' and 'Date' columns if not needed hydro_df_long = hydro_df_long.drop(columns=["Year", "Date"]) - # Rename ParsedDate back to Date hydro_df_long.rename(columns={"ParsedDate": "Date"}, inplace=True) - # Display the reshaped DataFrame return hydro_df_long - -def z_RP_station(HydroStations_RP_file, StationName, RP): - '''waterlevel - ''' - HydroStations_RP_df = pd.read_csv(HydroStations_RP_file) - - # Remove accents from StationName - StationName = remove_accents(StationName) - # Handle NaN values and ensure all are strings before applying the function - HydroStations_RP_df['StationName'] = HydroStations_RP_df['StationName'].fillna("").astype(str).apply(remove_accents) - - # Filter or calculate QRP based on RP and StationName - z_RP = HydroStations_RP_df[HydroStations_RP_df['StationName'] == StationName][f'{RP}'].values - - return z_RP - -def stampHydroTrigger(hydro_df, StationName, temporality_of_extremity, probability, distributionType): +def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, value_col='Value'): """ Adds a 'trigger' column to the hydro_df DataFrame indicating whether the 'Value' exceeds the QRP threshold. @@ -75,31 +56,32 @@ def stampHydroTrigger(hydro_df, StationName, temporality_of_extremity, probabili - pd.DataFrame: Copy of the input DataFrame with an additional 'trigger' column. """ # Ensure "Value" column exists in the DataFrame - if "Value" not in hydro_df.columns: - raise ValueError("The input DataFrame must contain a 'Value' column.") + if value_col not in hydro_df.columns: + raise ValueError("The input DataFrame must contain a 'Value' column, please specify the name correctly") #QRP = QRP_station(HydroStations_RP_file, StationName, RP) - Q_station = hydro_df["Value"] + Q_station = hydro_df[value_col] - if distributionType =='Gumbel': - QRP= QRP_fit (hydro_df, probability) - else: # assuming above 20 is percentile, RP is percentile instead - print ('applying a GEV, assuming your RP is in percentiles') - QRP = Q_GEV_fit (hydro_df, RP) + if type_of_extremity == 'RP': + Q_prob= Q_Gumbel_fit_RP (hydro_df, probability) + elif type_of_extremity == 'percentile': # assuming above 20 is percentile, RP is percentile instead + Q_prob = Q_GEV_fit (hydro_df, probability) + else: + print ('no such type of extremity') #print (f"for {StationName} : return period Q= {QRP}") - if not isinstance(QRP, (int, float)): - raise ValueError(f"Expected QRP to be a scalar (int or float), but got {type(QRP)}.") + if not isinstance(Q_prob, (int, float)): + raise ValueError(f"Expected QRP to be a scalar (int or float), but got {type(Q_prob)}.") # Calculate the QRP threshold # Copy the DataFrame and add the 'trigger' column hydro_trigger_df = hydro_df.copy() - hydro_trigger_df['Trigger'] = (hydro_trigger_df['Value'] > QRP).astype(int) + hydro_trigger_df['Trigger'] = (hydro_trigger_df[value_col] > Q_prob).astype(int) return hydro_trigger_df -def createEvent(trigger_df): +def createEvent(trigger_df, date_col='Date'): # Sort the dataframe by 'ValidTime', then by administrative unit: so that for each administrative unit, they are # first sort on valid time - trigger_df = trigger_df.sort_values(by=[f'Date']).reset_index(drop=True) + trigger_df = trigger_df.sort_values(by=[f'{date_col}']).reset_index(drop=True) # 
Prepare commune info with geometry for event creation event_data = [] @@ -118,10 +100,10 @@ def createEvent(trigger_df): # Checking if the current row and next row are both part of an event (Trigger == 1), and the trigger happens in the same adiministrative unit elif row['Trigger'] == 1 and r + 1 < len(trigger_df) and trigger_df.iloc[r + 1]['Trigger'] == 1: Event = 1 - StartDate = row['Date'] + StartDate = row[f'{date_col}'] # Continue until the end of the event where 'Trigger' is no longer 1 while r < len(trigger_df) and trigger_df.iloc[r]['Trigger'] == 1: - possible_endtime = trigger_df.iloc[r]['Date'] + possible_endtime = trigger_df.iloc[r][f'{date_col}'] processed_rows[r] = True # Mark row as processed row = trigger_df.iloc[r] r += 1 @@ -156,8 +138,8 @@ def createEvent(trigger_df): events_df = pd.DataFrame(columns=['Observation', 'Start Date', 'End Date']) return events_df -def loop_over_stations(station_csv, DataDir, RP): - RP = float(RP) +def loop_over_stations(station_csv, DataDir, type_of_extremity, probability, value_col='Value'): + probability = float(probability) station_df = pd.read_csv (station_csv, header=0) #print (station_df.columns) hydrodir = rf"{DataDir}/DNHMali_2019/Q_stations" @@ -173,7 +155,31 @@ def loop_over_stations(station_csv, DataDir, RP): # print (f'no discharge measures found for station {StationName} in {BasinName}') continue - trigger_df = stampHydroTrigger (hydro_df, RP, StationName) + trigger_df = stampHydroTrigger (hydro_df, StationName, type_of_extremity, probability, value_col) + event_df = createEvent (trigger_df) + event_df ['StationName'] = StationName + all_events.append (event_df) + + #generate the gdf to merge with where the points are attributed to the respective administrative units + all_events_df = pd.concat (all_events, ignore_index=True) + all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_RP_{RP}yr.csv") + return all_events_df + +def loop_over_stations_pred(station_csv, stationsDir, probability, type_of_extremity, value_col, leadtime): + probability = float(probability) + station_df = pd.read_csv (station_csv, header=0) + all_events = [] + for _, stationrow in station_df.iterrows(): + StationName = stationrow['StationName'] + BasinName= stationrow['Basin'] + "C:\Users\els-2\MaliGloFAS\data\stations\GloFAS_Q\timeseries\discharge_timeseries_Guelelinkoro_72.csv" + + + glofas_csv = rf"{stationsDir}/GloFAS_Q/timeseries/discharge_timeseries_{StationName}_{leadtime}.csv" + glofas_df = pd.read_csv(glofas_csv, header=0) + + + trigger_df = stampHydroTrigger (glofas_df, StationName, type_of_extremity, probability, value_col) event_df = createEvent (trigger_df) event_df ['StationName'] = StationName all_events.append (event_df) @@ -206,7 +212,19 @@ def events_per_adm (DataDir, admPath, adminLevel, station_csv, StationDataDir, a if __name__=="__main__": - # running this script for the GloFAS - #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv")) - event_df = loop_over_stations (cfg.DNHstations, cfg.DataDir, 1) - event_gdf = events_per_adm (cfg.DataDir, cfg.admPath, cfg.adminLevel, cfg.DNHstations, cfg.stationsDir, event_df, 'Observation', 1) \ No newline at end of file + + station_df = pd.read_csv (cfg.DNHstations, header=0) + #print (station_df.columns) + date_col = 'ValidTime' + value_col = 'percentile_40.0' # we're interested in 60% probability of being exceeded amongst the ensemble members + all_events = [] + for leadtime in 
cfg.leadtimes: + for RP in cfg.RPsyr: + loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, RP, 'RP', value_col, leadtime) + for percentile in cfg.percentiles: + loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, percentile, 'percentile', value_col, leadtime) + + # # running this script for the GloFAS + # #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv")) + # event_df = loop_over_stations (cfg.DNHstations, cfg.DataDir, 1) + # event_gdf = events_per_adm (cfg.DataDir, cfg.admPath, cfg.adminLevel, cfg.DNHstations, cfg.stationsDir, event_df, 'Observation', 1) \ No newline at end of file diff --git a/comparison/observation/thresholds.py b/comparison/observation/thresholds.py index ccff81c..abab528 100644 --- a/comparison/observation/thresholds.py +++ b/comparison/observation/thresholds.py @@ -2,15 +2,61 @@ import GloFAS.GloFAS_prep.configuration as cfg import pyextremes as pe from pyextremes import EVA -from scipy.stats import genextreme +from scipy.stats import genextreme, gumbel_r +import matplotlib.pyplot as plt +from scipy.stats import probplot +import matplotlib.pyplot as plt +import numpy as np -def csv_to_cleaned_series (csv): - df = pd.read_csv( - csv, - index_col=0, # setting ValidTime as index column - parse_dates=True, # parsing dates in ValidTime +def calculate_max_discharge(hydro_df: pd.DataFrame, + value_col: str = 'Value', + timescale: str = 'Year', + incomplete_lastyear: (str,int) = 2023 # not considering the maximum of incomplete years, as these are not representative of the season + ) -> pd.Series: + """ + Calculates the maximum discharge values for a given timescale. + + Parameters: + hydro_df (pd.DataFrame): DataFrame with discharge data. The index must be datetime or convertible to datetime. + value_col (str): The column name of the discharge values. + timescale (str): 'Year' for annual maxima, 'Day' for daily maxima. + + Returns: + pd.Series: Maximum discharge values with the corresponding timescale as the index. + """ + if not isinstance(hydro_df.index, pd.DatetimeIndex): + hydro_df.index = pd.to_datetime(hydro_df.index) + + hydro_df = hydro_df.copy() + + hydro_df = hydro_df.loc[hydro_df.index < pd.to_datetime(f'{incomplete_lastyear}-01-01')] + hydro_df = ( + hydro_df + .sort_index(ascending=True) + .astype(float) + .dropna() ) + if timescale == 'Year': + hydro_df['Year'] = hydro_df.index.year + return hydro_df.groupby('Year')[value_col].max() + elif timescale == 'Day': + hydro_df['Date'] = hydro_df.index.date + return hydro_df.groupby('Date')[value_col].max() + else: + raise ValueError("Invalid timescale. Use 'Year' or 'Day'.") +def csv_to_cleaned_series(csv: str) -> pd.Series: + """ + Reads a CSV file and prepares a cleaned pandas Series for analysis. + + :param csv: Path to the CSV file containing discharge data. + :return: A cleaned pandas Series of discharge values. 
+ """ + df = pd.read_csv( + csv, + index_col=0, # setting ValidTime as index column + parse_dates=True, # parsing dates in ValidTime + ) df['percentile_40.0'] = df['percentile_40.0'].interpolate(method='time') series = df['percentile_40.0'] series = series.loc[series.index < pd.to_datetime('2023-01-01')] @@ -19,109 +65,222 @@ def csv_to_cleaned_series (csv): .sort_index(ascending=True) .astype(float) .dropna() - ) + ) return series -def Q_Gumbel_fit_RP (hydro_df, RP): - # Extract the annual maximum discharge values - hydro_df['Year'] = hydro_df['ValidTime'].dt.year - annual_max_discharge = hydro_df.groupby('Year')['Value'].max() +def Q_GEV_fit(hydro_df: pd.DataFrame, value_col: str = 'Value', timescale='Year') -> tuple: + """ + Fits a GEV distribution to the annual maximum discharge values and calculates the discharge + corresponding to a given percentile. + + Parameters: + hydro_df (pd.DataFrame): DataFrame with discharge data. Index must be datetime. + percentile (float): Percentile to calculate (0-100). + value_col (str): The column name for discharge values. + + Returns: + shape, loc, scale: tuple of corresponding fit + """ + # Calculate annual maximum discharges + daily_max_discharge = calculate_max_discharge(hydro_df, value_col, timescale=timescale) - # Fit a Gumbel distribution to the annual maximum discharge values - loc, scale = stats.gumbel_r.fit(annual_max_discharge) - # Calculate the discharge value corresponding to the return period - discharge_value = stats.gumbel_r.ppf(1 - 1/RP, loc, scale) + # Fit a GEV distribution + shape_GEV, loc_GEV, scale_GEV = genextreme.fit(daily_max_discharge) + return shape_GEV, loc_GEV, scale_GEV + +def Q_GEV_fit_percentile (hydro_df: pd.DataFrame, percentile: float, value_col: str = 'Value') -> float: + shape,loc,scale = Q_GEV_fit (hydro_df, value_col, timescale='Day') + # Calculate the discharge value corresponding to the percentile + discharge_value = genextreme.ppf(percentile / 100, shape, loc=loc, scale=scale) return discharge_value -def Q_Gumbel_fit_percentile(hydro_df, percentile): - # Extract the annual maximum discharge values - hydro_df['Year'] = hydro_df['ValidTime'].dt.year - annual_max_discharge = hydro_df.groupby('Year')['Value'].max() - # Fit a Gumbel distribution to the annual maximum discharge values - loc, scale = stats.gumbel_r.fit(annual_max_discharge) +def Q_Gumbel_fit_percentile(hydro_df: pd.DataFrame, percentile: float, value_col: str = 'Value') -> float: + """ + Fits a Gumbel distribution to the annual maximum discharge values and calculates the discharge + corresponding to a given percentile. - # Convert the percentile to a probability (e.g., 95th percentile = 0.95) - probability = percentile / 100.0 - probplot(annual_max_discharge, dist="genextreme", sparams=(shape, loc, scale), plot=plt) - plt.show() + Parameters: + hydro_df (pd.DataFrame): DataFrame with discharge data. Index must be datetime. + percentile (float): Percentile to calculate (0-100). + value_col (str): The column name for discharge values. + + Returns: + float: The discharge value corresponding to the given percentile. 
+ """ + # Calculate annual maximum discharges + daily_max_discharge = calculate_max_discharge(hydro_df, value_col, timescale='Day') + + # Fit a Gumbel distribution + loc, scale = gumbel_r.fit(daily_max_discharge) + # Calculate the discharge value corresponding to the percentile - discharge_value = stats.gumbel_r.ppf(probability, loc, scale) + discharge_value = gumbel_r.ppf(percentile / 100, loc, scale) return discharge_value -def Q_GEV_fit_RP (series): - model = EVA(series) - model.get_extremes( - method='BM', # Block Maxima method - block_size="365D", # One year per block - ) +def Q_Gumbel_fit (hydro_df: pd.DataFrame, + value_col: str = 'Value', + timescale: str = 'Year'): + max_discharges = calculate_max_discharge(hydro_df, value_col, timescale=timescale) + loc_gumbel, scale_gumbel = gumbel_r.fit(max_discharges) + return loc_gumbel, scale_gumbel - #model.plot_extremes() +def Q_Gumbel_fit_RP(hydro_df: pd.DataFrame, + RP: float, + value_col: str = 'Value') -> float: + """ + Fits a Gumbel distribution to annual maximum discharge values and calculates + the discharge corresponding to a given return period. + + :param hydro_df: A DataFrame containing discharge data with date and value columns. + :param RP: The return period in years. + :param date_col: Name of the column containing datetime information (default: 'ValidTime'). + :param value_col: Name of the column containing discharge values (default: 'Value'). + :return: The discharge value corresponding to the return period. + """ + loc, scale = Q_Gumbel_fit (hydro_df, value_col, timescale='Year') + return gumbel_r.ppf(1 - 1 / RP, loc, scale) + + +def Q_GEV_fit_RP(hydro_df, value_col, RP): + """ + Fits a GEV distribution to a time series of discharge values and calculates + the discharge values corresponding to specified return periods. + + :param series: A pandas Series containing discharge values. + :param return_periods: A list of return periods in years. + :return: A dictionary with return periods as keys and corresponding discharge values. + """ + + shape,loc, scale = Q_GEV_fit(hydro_df, value_col, timescale='Year') + discharge_value = genextreme.ppf(1-1/RP, shape, loc=loc, scale=scale) + return discharge_value + + return discharge_value + +def Q_GEV_fit_RP_EVA(series: pd.Series, return_periods: list = [2, 5, 10, 25, 50, 100]) -> dict: + """ + Fits a GEV distribution to a time series of discharge values and calculates + the discharge values corresponding to specified return periods. + + :param series: A pandas Series containing discharge values. + :param return_periods: A list of return periods in years. + :return: A dictionary with return periods as keys and corresponding discharge values. + """ + model = EVA(series) + model.get_extremes(method='BM', block_size="365D") gev_fit = model.fit_model() model.plot_diagnostic(alpha=0.95) summary = model.get_summary( - return_period=[1, 1.5, 2, 5, 10, 25, 50, 100, 250, 500, 1000], - alpha=0.95, # confidence interval + return_period=return_periods, + alpha=0.95, # confidence interval n_samples=1000, - ) - + ) return summary -def Q_GEV_fit_percentile (hydro_df, percentile): +def plot_diagnostics(series, shape_gev, loc_gev, scale_gev, loc_gumbel, scale_gumbel, rp_values): """ - Fits a GEV distribution to the daily discharge values and calculates the discharge - corresponding to a given percentile. + Generate diagnostic and comparison plots for GEV and Gumbel fits. Parameters: - hydro_df (pd.DataFrame): A dataframe with at least 'Date' and 'Value' columns. 
- 'Date' should be a datetime object and 'Value' is the discharge value. - percentile (float): Percentile for which to compute the discharge value (between 0 and 100). - - Returns: - float: The discharge value corresponding to the given percentile. + series (pd.Series): Time series of discharge values. + shape_gev, loc_gev, scale_gev: GEV parameters. + loc_gumbel, scale_gumbel: Gumbel parameters. + rp_values (list): List of return periods. """ - hydro_df['Year'] = hydro_df['ValidTime'].dt.year - annual_max_discharge = hydro_df.groupby('Year')['Value'].max() - # Check if there are still issues - if series.empty: - raise ValueError("All data was non-finite after cleaning. Please check your dataset.") + # Sort data for plotting purposes + sorted_data = np.sort(series) + ecdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data) + + # Theoretical values + gev_theoretical = genextreme.ppf(ecdf, shape_gev, loc=loc_gev, scale=scale_gev) + gumbel_theoretical = gumbel_r.ppf(ecdf, loc=loc_gumbel, scale=scale_gumbel) + + # Plot diagnostics + fig, axes = plt.subplots(2, 2, figsize=(10, 10)) + + # Q-Q plot + axes[0, 0].scatter(gev_theoretical, sorted_data, label='GEV', color='b') + axes[0, 0].scatter(gumbel_theoretical, sorted_data, label='Gumbel', color='r', alpha=0.6) + axes[0, 0].plot([min(sorted_data), max(sorted_data)], [min(sorted_data), max(sorted_data)], '--k') + axes[0, 0].set_title("Q-Q Plot") + axes[0, 0].set_xlabel("Theoretical") + axes[0, 0].set_ylabel("Observed") + axes[0, 0].legend() + + # P-P plot + axes[0, 1].scatter(ecdf, genextreme.cdf(sorted_data, shape_gev, loc=loc_gev, scale=scale_gev), label='GEV', color='b') + axes[0, 1].scatter(ecdf, gumbel_r.cdf(sorted_data, loc=loc_gumbel, scale=scale_gumbel), label='Gumbel', color='r', alpha=0.6) + axes[0, 1].plot([0, 1], [0, 1], '--k') + axes[0, 1].set_title("P-P Plot") + axes[0, 1].set_xlabel("Empirical CDF") + axes[0, 1].set_ylabel("Theoretical CDF") + axes[0, 1].legend() + + # Return Period Plot + gev_return = genextreme.ppf(1 - 1 / np.array(rp_values), shape_gev, loc=loc_gev, scale=scale_gev) + gumbel_return = gumbel_r.ppf(1 - 1 / np.array(rp_values), loc=loc_gumbel, scale=scale_gumbel) + axes[1, 0].scatter(rp_values, series.max(), color='k', label="Observed Data") + axes[1, 0].plot(rp_values, gev_return, '-b', label="GEV Fit") + axes[1, 0].plot(rp_values, gumbel_return, '-r', label="Gumbel Fit") + axes[1, 0].set_xscale('log') + axes[1, 0].set_title("Return Period Plot") + axes[1, 0].set_xlabel("Return Period (years)") + axes[1, 0].set_ylabel("Discharge") + axes[1, 0].legend() + + # Probability Density Function + x = np.linspace(min(series), max(series), 1000) + gev_pdf = genextreme.pdf(x, shape_gev, loc=loc_gev, scale=scale_gev) + gumbel_pdf = gumbel_r.pdf(x, loc=loc_gumbel, scale=scale_gumbel) + axes[1, 1].hist(series, bins=20, density=True, alpha=0.4, color='g', edgecolor='k') + axes[1, 1].plot(x, gev_pdf, '-b', label="GEV PDF") + axes[1, 1].plot(x, gumbel_pdf, '-r', label="Gumbel PDF") + axes[1, 1].set_title("Probability Density Function") + axes[1, 1].set_xlabel("Discharge") + axes[1, 1].set_ylabel("Density") + axes[1, 1].legend() + + plt.tight_layout() + plt.show() - # Fit a GEV distribution - shape, loc, scale = genextreme.fit(series) - # Calculate the discharge value corresponding to the specified percentile - discharge_value = genextreme.ppf(percentile / 100, shape, loc=loc, scale=scale) - return discharge_value -if __name__ == '__main__': +if __name__ == '__main__': station = 'Bamako' - leadtime = 168 + leadtime = 168 + 
return_periods = [1, 1.5, 2, 5, 10, 25, 50, 100] Q_Bamako_csv = f"{cfg.stationsDir}/GloFAS_Q/timeseries/discharge_timeseries_{station}_{leadtime}.csv" + df = pd.read_csv( Q_Bamako_csv, - index_col=0, # setting ValidTime as index column - parse_dates=True, # parsing dates in ValidTime - ) + index_col=0, # setting ValidTime as index column + parse_dates=True, # parsing dates in ValidTime + ) df['percentile_40.0'] = df['percentile_40.0'].interpolate(method='time') series = csv_to_cleaned_series(Q_Bamako_csv) - model = EVA(series) - model.get_extremes( - method='BM', # Block Maxima method - block_size="365D", # One year per block - ) - #model.plot_extremes() + # GEV modeling + model = EVA(series) + model.get_extremes(method='BM', block_size="365D") gev_fit = model.fit_model() model.plot_diagnostic(alpha=0.95) summary = model.get_summary( - return_period=[1, 1.5, 2, 5, 10, 25, 50, 100, 250, 500, 1000], - alpha=0.95, # confidence interval + return_period=return_periods, + alpha=0.95, # confidence interval n_samples=1000, - ) + ) + print(summary) + + # Percentile calculations + percentile95_GEV = Q_GEV_fit_percentile(df, 95, value_col='percentile_40.0') + percentile95_Gumbel = Q_Gumbel_fit_percentile(df, 95,value_col='percentile_40.0') + - print (summary) - percentile95_GEV = Q_GEV_fit_percentile(df, 95) - percentile95_Gumbel = Q_Gumbel_fit_percentile (df, 95) - print (percentile95) + print(f"95th Percentile (GEV): {percentile95_GEV}") + print(f"95th Percentile (Gumbel): {percentile95_Gumbel}") + shape_gev, loc_gev, scale_gev = Q_GEV_fit (df, value_col='percentile_40.0', timescale='Year') + loc_gumbel, scale_gumbel = Q_Gumbel_fit(df, value_col='percentile_40.0') + plot_diagnostics(series, shape_gev, loc_gev, scale_gev, loc_gumbel, scale_gumbel, return_periods) \ No newline at end of file From b78e5521b23b05221a4fcac6b8ff198c25410366 Mon Sep 17 00:00:00 2001 From: ERKuipers Date: Mon, 16 Dec 2024 18:26:50 +0100 Subject: [PATCH 3/3] update for observational data --- comparison/observation/HydroImpact.py | 86 ++++++++++++++++++++------- comparison/observation/thresholds.py | 12 ++-- 2 files changed, 73 insertions(+), 25 deletions(-) diff --git a/comparison/observation/HydroImpact.py b/comparison/observation/HydroImpact.py index 3758b34..ef8a14e 100644 --- a/comparison/observation/HydroImpact.py +++ b/comparison/observation/HydroImpact.py @@ -16,6 +16,7 @@ def parse_date_with_fallback(date_str, year): date = pd.to_datetime(f"{year} {date_str}", format="%Y %d/%m %H:%M") return date except ValueError: + # If the date is invalid (e.g., Feb 29 in non-leap years), return None #print(f"Invalid date skipped: {date_str} for year {year}") return None @@ -41,7 +42,7 @@ def create_datetime(row): # Display the reshaped DataFrame return hydro_df_long -def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, value_col='Value'): +def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, value_col='Value', date_col='Date'): """ Adds a 'trigger' column to the hydro_df DataFrame indicating whether the 'Value' exceeds the QRP threshold. @@ -55,16 +56,30 @@ def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, val Returns: - pd.DataFrame: Copy of the input DataFrame with an additional 'trigger' column. 
""" + # Assuming hydro_df is already loaded + if date_col in hydro_df.columns: + # Convert the date_col to datetime + hydro_df[date_col] = pd.to_datetime(hydro_df[date_col], format="%Y-%m-%d %H:%M", errors='coerce') + + # Check for any invalid dates + if hydro_df[date_col].isnull().any(): + print("Warning: Some dates in 'date_col' could not be parsed and are set to NaT.") + + # Set the index to the datetime column + hydro_df.set_index(date_col, inplace=True) + #print(f"Index successfully set to datetime using '{date_col}':\n{hydro_df.index}") + else: + print(f"Error: Column '{date_col}' does not exist in the DataFrame, hopefully teh date is the index") + # Ensure "Value" column exists in the DataFrame if value_col not in hydro_df.columns: raise ValueError("The input DataFrame must contain a 'Value' column, please specify the name correctly") #QRP = QRP_station(HydroStations_RP_file, StationName, RP) - Q_station = hydro_df[value_col] - + Q_station = hydro_df[value_col] # this is where it goes wrong!!! if type_of_extremity == 'RP': - Q_prob= Q_Gumbel_fit_RP (hydro_df, probability) + Q_prob= Q_Gumbel_fit_RP (hydro_df, probability, value_col=value_col) # results in infinite elif type_of_extremity == 'percentile': # assuming above 20 is percentile, RP is percentile instead - Q_prob = Q_GEV_fit (hydro_df, probability) + Q_prob = Q_Gumbel_fit_percentile (hydro_df, percentile, value_col) else: print ('no such type of extremity') #print (f"for {StationName} : return period Q= {QRP}") @@ -81,7 +96,15 @@ def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, val def createEvent(trigger_df, date_col='Date'): # Sort the dataframe by 'ValidTime', then by administrative unit: so that for each administrative unit, they are # first sort on valid time - trigger_df = trigger_df.sort_values(by=[f'{date_col}']).reset_index(drop=True) + if date_col in trigger_df.columns: + trigger_df = trigger_df.sort_values(by=[f'{date_col}']).reset_index(drop=True) + else: + try: + trigger_df = trigger_df.sort_index() + except: + raise ValueError (f'{date_col} not in trigger_df index and no set index either') + + # Prepare commune info with geometry for event creation event_data = [] @@ -92,18 +115,29 @@ def createEvent(trigger_df, date_col='Date'): r = 0 while r < len(trigger_df): row = trigger_df.iloc[r] + if processed_rows[r]: # If the row is already processed, continue to the next row r += 1 continue - # Checking if the current row and next row are both part of an event (Trigger == 1), and the trigger happens in the same adiministrative unit + # Checking if the current row and next row are both part of an event (Trigger == 1) + # and the trigger happens in the same administrative unit elif row['Trigger'] == 1 and r + 1 < len(trigger_df) and trigger_df.iloc[r + 1]['Trigger'] == 1: Event = 1 - StartDate = row[f'{date_col}'] + StartDate = row[f'{date_col}'] if date_col in trigger_df.columns else trigger_df.index[r] + # Continue until the end of the event where 'Trigger' is no longer 1 while r < len(trigger_df) and trigger_df.iloc[r]['Trigger'] == 1: - possible_endtime = trigger_df.iloc[r][f'{date_col}'] + # Check if the date column is part of the index or a column + if date_col in trigger_df.columns: + possible_endtime = trigger_df.iloc[r][f'{date_col}'] + else: # assuming it is the index + try: + possible_endtime = trigger_df.index[r] + except: + raise ValueError(f'{date_col} not in columns and not a valid index either') + processed_rows[r] = True # Mark row as processed row = trigger_df.iloc[r] r += 1 
@@ -138,7 +172,7 @@ def createEvent(trigger_df, date_col='Date'): events_df = pd.DataFrame(columns=['Observation', 'Start Date', 'End Date']) return events_df -def loop_over_stations(station_csv, DataDir, type_of_extremity, probability, value_col='Value'): +def loop_over_stations_obs(station_csv, DataDir, type_of_extremity, probability, value_col='Value'): probability = float(probability) station_df = pd.read_csv (station_csv, header=0) #print (station_df.columns) @@ -149,8 +183,8 @@ def loop_over_stations(station_csv, DataDir, type_of_extremity, probability, val BasinName= stationrow['Basin'] stationPath = rf"{hydrodir}/{BasinName}_{StationName}.csv" try: - hydro_df = transform_hydro (stationPath) print (f'calculating {StationName}, in {BasinName}') + hydro_df = transform_hydro (stationPath) # this actually looks good except: # print (f'no discharge measures found for station {StationName} in {BasinName}') continue @@ -162,7 +196,10 @@ def loop_over_stations(station_csv, DataDir, type_of_extremity, probability, val #generate the gdf to merge with where the points are attributed to the respective administrative units all_events_df = pd.concat (all_events, ignore_index=True) - all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_RP_{RP}yr.csv") + if type_of_extremity =='RP': + all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_RP_{probability}yr.csv") + elif type_of_extremity =='percentile': + all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_percentile{probability}.csv") return all_events_df def loop_over_stations_pred(station_csv, stationsDir, probability, type_of_extremity, value_col, leadtime): @@ -170,14 +207,18 @@ def loop_over_stations_pred(station_csv, stationsDir, probability, type_of_extre station_df = pd.read_csv (station_csv, header=0) all_events = [] for _, stationrow in station_df.iterrows(): - StationName = stationrow['StationName'] + StationName = remove_accents(stationrow['StationName']) BasinName= stationrow['Basin'] - "C:\Users\els-2\MaliGloFAS\data\stations\GloFAS_Q\timeseries\discharge_timeseries_Guelelinkoro_72.csv" - glofas_csv = rf"{stationsDir}/GloFAS_Q/timeseries/discharge_timeseries_{StationName}_{leadtime}.csv" - glofas_df = pd.read_csv(glofas_csv, header=0) - + try: + glofas_df = pd.read_csv(glofas_csv, + header=0, + index_col=0, # setting ValidTime as index column + parse_dates=True) + except: + print (f'no discharge series for station {StationName}') + continue trigger_df = stampHydroTrigger (glofas_df, StationName, type_of_extremity, probability, value_col) event_df = createEvent (trigger_df) @@ -186,7 +227,10 @@ def loop_over_stations_pred(station_csv, stationsDir, probability, type_of_extre #generate the gdf to merge with where the points are attributed to the respective administrative units all_events_df = pd.concat (all_events, ignore_index=True) - all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_RP_{RP}yr.csv") + if type_of_extremity =='RP': + all_events_df.to_csv (f"{stationsDir}/GloFAS_Q/GloFASstation_flood_events_RP{probability}yr_leadtime{leadtime}.csv") + elif type_of_extremity =='percentile': + all_events_df.to_csv (f"{stationsDir}/GloFAS_Q/GloFASstation_flood_events_percentile{probability}_leadtime{leadtime}.csv") return all_events_df def events_per_adm (DataDir, admPath, adminLevel, station_csv, StationDataDir, all_events_df, model, RP): @@ -220,9 +264,11 @@ def events_per_adm (DataDir, admPath, adminLevel, station_csv, 
StationDataDir, a all_events = [] for leadtime in cfg.leadtimes: for RP in cfg.RPsyr: - loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, RP, 'RP', value_col, leadtime) + loop_over_stations_obs (cfg.DNHstations, cfg.DataDir, 'RP', RP, value_col='Value') + #loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, RP, 'RP', value_col, leadtime) for percentile in cfg.percentiles: - loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, percentile, 'percentile', value_col, leadtime) + loop_over_stations_obs (cfg.DNHstations, cfg.DataDir, 'percentile', percentile, value_col='Value') + #loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, percentile, 'percentile', value_col, leadtime) # # running this script for the GloFAS # #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv")) diff --git a/comparison/observation/thresholds.py b/comparison/observation/thresholds.py index abab528..6d6762c 100644 --- a/comparison/observation/thresholds.py +++ b/comparison/observation/thresholds.py @@ -11,8 +11,8 @@ def calculate_max_discharge(hydro_df: pd.DataFrame, value_col: str = 'Value', timescale: str = 'Year', - incomplete_lastyear: (str,int) = 2023 # not considering the maximum of incomplete years, as these are not representative of the season - ) -> pd.Series: + incomplete_lastyear: (str,int) = 2023, # not considering the maximum of incomplete years, as these are not representative of the season + date_col: str ='Date') -> pd.Series: """ Calculates the maximum discharge values for a given timescale. @@ -25,6 +25,7 @@ def calculate_max_discharge(hydro_df: pd.DataFrame, pd.Series: Maximum discharge values with the corresponding timescale as the index. """ if not isinstance(hydro_df.index, pd.DatetimeIndex): + hydro_df [date_col] = hydro_df.index hydro_df.index = pd.to_datetime(hydro_df.index) hydro_df = hydro_df.copy() @@ -38,13 +39,14 @@ def calculate_max_discharge(hydro_df: pd.DataFrame, ) if timescale == 'Year': hydro_df['Year'] = hydro_df.index.year - return hydro_df.groupby('Year')[value_col].max() + + maximum = hydro_df.groupby('Year')[value_col].max() elif timescale == 'Day': hydro_df['Date'] = hydro_df.index.date - return hydro_df.groupby('Date')[value_col].max() + maximum = hydro_df.groupby('Date')[value_col].max() else: raise ValueError("Invalid timescale. Use 'Year' or 'Day'.") - + return maximum def csv_to_cleaned_series(csv: str) -> pd.Series: """ Reads a CSV file and prepares a cleaned pandas Series for analysis.
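
A minimal, self-contained sketch of the threshold logic these patches converge on: a Gumbel distribution fitted to annual maxima for return-period thresholds (as in Q_Gumbel_fit_RP), a GEV fitted to daily maxima for percentile thresholds (as in Q_GEV_fit_percentile), and trigger stamping where discharge exceeds the threshold (as in stampHydroTrigger). The synthetic series, seed, and parameter values below are illustrative assumptions, not project data; only the column and index conventions ('Value', a datetime index) follow the patched code.

    import numpy as np
    import pandas as pd
    from scipy.stats import gumbel_r, genextreme

    # Synthetic daily discharge series (illustrative only).
    rng = np.random.default_rng(42)
    dates = pd.date_range("2004-01-01", "2022-12-31", freq="D")
    hydro_df = pd.DataFrame(
        {"Value": rng.gumbel(loc=800, scale=150, size=len(dates))},
        index=dates,
    )

    # Return-period threshold: Gumbel fit on annual maxima (cf. Q_Gumbel_fit_RP).
    annual_max = hydro_df.groupby(hydro_df.index.year)["Value"].max()
    loc_g, scale_g = gumbel_r.fit(annual_max)
    RP = 5.0
    q_rp = gumbel_r.ppf(1 - 1 / RP, loc_g, scale_g)  # discharge exceeded on average once every RP years

    # Percentile threshold: GEV fit on daily maxima (cf. Q_GEV_fit_percentile).
    # With one value per day this is simply the cleaned series, mirroring
    # timescale='Day' in calculate_max_discharge.
    daily_max = hydro_df.groupby(hydro_df.index.date)["Value"].max()
    shape, loc_gev, scale_gev = genextreme.fit(daily_max)
    q_p95 = genextreme.ppf(95 / 100, shape, loc=loc_gev, scale=scale_gev)

    # Trigger stamping (cf. stampHydroTrigger): 1 where discharge exceeds the threshold.
    trigger_df = hydro_df.copy()
    trigger_df["Trigger"] = (trigger_df["Value"] > q_rp).astype(int)

    print(f"RP{RP:g} threshold: {q_rp:.1f} | 95th-percentile threshold: {q_p95:.1f} | "
          f"days above RP threshold: {int(trigger_df['Trigger'].sum())}")

Annual maxima are the natural block for return-period estimates (one exceedance expected every RP blocks), while daily maxima give a denser sample for percentile-style thresholds, which is why calculate_max_discharge in these patches supports both the 'Year' and 'Day' timescales.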