From df5fb5bb2231e245b17ee1c028aae64c80bd830e Mon Sep 17 00:00:00 2001 From: ERKuipers Date: Fri, 13 Dec 2024 13:56:07 +0100 Subject: [PATCH] thresholds --- GloFAS/GloFAS_prep/Q_timeseries_station.py | 6 +- GloFAS/GloFAS_prep/station_to_glofas.py | 2 +- comparison/observation/HydroImpact.py | 56 ++----------- comparison/observation/thresholds.py | 97 ++++++++++++++++++++++ comparison/pointMatching.py | 2 +- 5 files changed, 110 insertions(+), 53 deletions(-) create mode 100644 comparison/observation/thresholds.py diff --git a/GloFAS/GloFAS_prep/Q_timeseries_station.py b/GloFAS/GloFAS_prep/Q_timeseries_station.py index a8a64b0..d2e20f0 100644 --- a/GloFAS/GloFAS_prep/Q_timeseries_station.py +++ b/GloFAS/GloFAS_prep/Q_timeseries_station.py @@ -13,6 +13,7 @@ from GloFAS.GloFAS_data_extractor.threshold_opener import openThreshold from GloFAS.GloFAS_data_extractor.unzip_open import openGloFAS, unzipGloFAS import GloFAS.GloFAS_prep.configuration as cfg + class GloFAS_timeseries: def __init__(self, DataDir, @@ -99,8 +100,11 @@ def Q_over_time(self): station_results_df = pd.DataFrame(station_results) station_results_df.to_csv(f'{self.DataDir}/stations/timeseries/discharge_timeseries_{stationname}_{self.leadtime}.csv', index=False) print(f"Finished station {stationname}") + + + if __name__ =='__main__': for leadtime in cfg.leadtimes: timeseries = GloFAS_timeseries(DataDir=cfg.DataDir, @@ -113,4 +117,4 @@ def Q_over_time(self): end_date=None, IDhead='Station names', percentile=(100-(cfg.triggerProb*100))) # percentile is 1 - the probability of exceedence. 
At 40 percent percentile, there is a 60 percent probability of this value being exceeded - timeseries.Q_over_time() \ No newline at end of file + Q_timeseries.Q_over_time() diff --git a/GloFAS/GloFAS_prep/station_to_glofas.py b/GloFAS/GloFAS_prep/station_to_glofas.py index 7090e2d..3257e9f 100644 --- a/GloFAS/GloFAS_prep/station_to_glofas.py +++ b/GloFAS/GloFAS_prep/station_to_glofas.py @@ -22,7 +22,7 @@ glofas_x, glofas_y, area_diff = find_corresponding_point_within_box( station_lon=row['Lon'], station_lat=row['Lat'], - ups_area_point_m=row['Catchment area (km2)'] * 1e6, # Convert from km² to m² + ups_area_point_m=row['Catchment area (km2)']*1e6, # Convert from km² to m² stationName=row['Station names'], glofas_ups_area_xr=glofas_ups_area_da, radius_m=10000 diff --git a/comparison/observation/HydroImpact.py b/comparison/observation/HydroImpact.py index 656ded1..4d94091 100644 --- a/comparison/observation/HydroImpact.py +++ b/comparison/observation/HydroImpact.py @@ -6,6 +6,7 @@ from comparison.pointMatching import attributePoints_to_Polygon import geopandas as gpd import numpy as np +from pyextremes import EVA from scipy.stats import genextreme def parse_date_with_fallback(date_str, year): @@ -59,54 +60,9 @@ def z_RP_station(HydroStations_RP_file, StationName, RP): return z_RP -def QRP_fit (hydro_df, RP): - # Extract the annual maximum discharge values - hydro_df['Year'] = hydro_df['Date'].dt.year - annual_max_discharge = hydro_df.groupby('Year')['Value'].max() - # Fit a Gumbel distribution to the annual maximum discharge values - loc, scale = stats.gumbel_r.fit(annual_max_discharge) - # Calculate the discharge value corresponding to the return period - discharge_value = stats.gumbel_r.ppf(1 - 1/RP, loc, scale) - return discharge_value - -def Q_GEV_fit(hydro_df, percentile): - """ - Fits a GEV distribution to the daily discharge values and calculates the discharge - corresponding to a given percentile. 
- - Parameters: - hydro_df (pd.DataFrame): A dataframe with at least 'Date' and 'Value' columns. - 'Date' should be a datetime object and 'Value' is the discharge value. - percentile (float): Percentile for which to compute the discharge value (between 0 and 100). - - Returns: - float: The discharge value corresponding to the given percentile. - """ - # Ensure 'Date' column is a datetime object - hydro_df['Date'] = pd.to_datetime(hydro_df['Date']) - - # Extract daily discharge values - daily_discharge = hydro_df['Value'] - # Remove non-finite values - daily_discharge_cleaned = daily_discharge[np.isfinite(daily_discharge)] - - # Check if there are still issues - if daily_discharge_cleaned.empty: - raise ValueError("All data was non-finite after cleaning. Please check your dataset.") - - # Fit a GEV distribution - shape, loc, scale = genextreme.fit(daily_discharge_cleaned) - - - # Calculate the discharge value corresponding to the specified percentile - discharge_value = genextreme.ppf(percentile / 100, shape, loc=loc, scale=scale) - - return discharge_value - - -def stampHydroTrigger(hydro_df, RP, StationName): +def stampHydroTrigger(hydro_df, StationName, temporality_of_extremity, probability, distributionType): """ Adds a 'trigger' column to the hydro_df DataFrame indicating whether the 'Value' exceeds the QRP threshold. 
@@ -126,8 +82,8 @@ def stampHydroTrigger(hydro_df, RP, StationName): #QRP = QRP_station(HydroStations_RP_file, StationName, RP) Q_station = hydro_df["Value"] - if RP < 21: - QRP= QRP_fit (hydro_df, RP) + if distributionType =='Gumbel': + QRP= QRP_fit (hydro_df, probability) else: # assuming above 20 is percentile, RP is percentile instead print ('applying a GEV, assuming your RP is in percentiles') QRP = Q_GEV_fit (hydro_df, RP) @@ -206,7 +162,7 @@ def loop_over_stations(station_csv, DataDir, RP): RP = float(RP) station_df = pd.read_csv (station_csv, header=0) #print (station_df.columns) - hydrodir = rf"{DataDir}/DNHMali_2019\Q_stations" + hydrodir = rf"{DataDir}/DNHMali_2019/Q_stations" all_events = [] for _, stationrow in station_df.iterrows(): StationName = stationrow['StationName'] @@ -252,7 +208,7 @@ def events_per_adm (DataDir, admPath, adminLevel, station_csv, StationDataDir, a if __name__=="__main__": - + # running this script for the GloFAS #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv")) event_df = loop_over_stations (cfg.DNHstations, cfg.DataDir, 1) event_gdf = events_per_adm (cfg.DataDir, cfg.admPath, cfg.adminLevel, cfg.DNHstations, cfg.stationsDir, event_df, 'Observation', 1) \ No newline at end of file diff --git a/comparison/observation/thresholds.py b/comparison/observation/thresholds.py new file mode 100644 index 0000000..bd0df10 --- /dev/null +++ b/comparison/observation/thresholds.py @@ -0,0 +1,97 @@ +import pandas as pd +import GloFAS.GloFAS_prep.configuration as cfg +import pyextremes as pe +from pyextremes import EVA +from scipy.stats import genextreme + +def csv_to_cleaned_series (csv): + df = pd.read_csv( + Q_Bamako_csv, + index_col=0, # setting ValidTime as index column + parse_dates=True, # parsing dates in ValidTime + ) + + df['percentile_40.0'] = df['percentile_40.0'].interpolate(method='time') + series = df['percentile_40.0'] + series = 
series.loc[series.index < pd.to_datetime('2023-01-01')] + series = ( + series + .sort_index(ascending=True) + .astype(float) + .dropna() + ) + return series +def Q_Gumbel_fit_RP (hydro_df, RP): + + # Extract the annual maximum discharge values + hydro_df['Year'] = hydro_df['Date'].dt.year + annual_max_discharge = hydro_df.groupby('Year')['Value'].max() + + # Fit a Gumbel distribution to the annual maximum discharge values + loc, scale = stats.gumbel_r.fit(annual_max_discharge) + # Calculate the discharge value corresponding to the return period + discharge_value = stats.gumbel_r.ppf(1 - 1/RP, loc, scale) + return discharge_value + +def Q_Gumbel_fit_percentile (hydro_df, percentile): + # now for + return discharge_value + +def Q_GEV_fit_RP (hydro_df, RP): + return discharge_value + +def Q_GEV_fit_percentile (hydro_df, percentile): + """ + Fits a GEV distribution to the daily discharge values and calculates the discharge + corresponding to a given percentile. + + Parameters: + hydro_df (pd.DataFrame): A dataframe with at least 'Date' and 'Value' columns. + 'Date' should be a datetime object and 'Value' is the discharge value. + percentile (float): Percentile for which to compute the discharge value (between 0 and 100). + + Returns: + float: The discharge value corresponding to the given percentile. + """ + # Ensure 'Date' column is a datetime object + hydro_df['Date'] = pd.to_datetime(hydro_df['Date']) + + # Extract daily discharge values + daily_discharge = hydro_df['Value'] + # Remove non-finite values + daily_discharge_cleaned = daily_discharge[np.isfinite(daily_discharge)] + + # Check if there are still issues + if daily_discharge_cleaned.empty: + raise ValueError("All data was non-finite after cleaning. 
Please check your dataset.") + + # Fit a GEV distribution + shape, loc, scale = genextreme.fit(daily_discharge_cleaned) + + + # Calculate the discharge value corresponding to the specified percentile + discharge_value = genextreme.ppf(percentile / 100, shape, loc=loc, scale=scale) + + return discharge_value + +if __name__ == '__main__': + station = 'Bamako' + leadtime = 168 + Q_Bamako_csv = f"{cfg.stationsDir}/GloFAS_Q/timeseries/discharge_timeseries_{station}_{leadtime}.csv" + series = csv_to_cleaned_series(Q_Bamako_csv) + model = EVA(series) + model.get_extremes( + method='BM', # Block Maxima method + block_size="365D", # One year per block + ) + + #model.plot_extremes() + gev_fit = model.fit_model() + model.plot_diagnostic(alpha=0.95) + summary = model.get_summary( + return_period=[1, 1.5, 2, 5, 10, 25, 50, 100, 250, 500, 1000], + alpha=0.95, # confidence interval + n_samples=1000, + ) + print (summary) + diff --git a/comparison/pointMatching.py b/comparison/pointMatching.py index a986243..4a428cb 100644 --- a/comparison/pointMatching.py +++ b/comparison/pointMatching.py @@ -170,8 +170,8 @@ def find_corresponding_point_within_box(station_lon, station_lat, ups_area_point plt.xlabel("Longitude") plt.ylabel("Latitude") plt.legend(loc="upper right", fontsize=10, frameon=True) - plt.show() plt.savefig(f'{cfg.DataDir}/GloFAS_station_for_DNH_{stationName}.png') + plt.show() plt.close() return model_point_lon, model_point_lat, best_area_diff