From df5fb5bb2231e245b17ee1c028aae64c80bd830e Mon Sep 17 00:00:00 2001 From: ERKuipers Date: Fri, 13 Dec 2024 13:56:07 +0100 Subject: [PATCH] thresholds --- GloFAS/GloFAS_prep/Q_timeseries_station.py | 6 +- GloFAS/GloFAS_prep/station_to_glofas.py | 2 +- comparison/observation/HydroImpact.py | 56 ++----------- comparison/observation/thresholds.py | 97 ++++++++++++++++++++++ comparison/pointMatching.py | 2 +- 5 files changed, 110 insertions(+), 53 deletions(-) create mode 100644 comparison/observation/thresholds.py diff --git a/GloFAS/GloFAS_prep/Q_timeseries_station.py b/GloFAS/GloFAS_prep/Q_timeseries_station.py index a8a64b0..d2e20f0 100644 --- a/GloFAS/GloFAS_prep/Q_timeseries_station.py +++ b/GloFAS/GloFAS_prep/Q_timeseries_station.py @@ -13,6 +13,7 @@ from GloFAS.GloFAS_data_extractor.threshold_opener import openThreshold from GloFAS.GloFAS_data_extractor.unzip_open import openGloFAS, unzipGloFAS import GloFAS.GloFAS_prep.configuration as cfg + class GloFAS_timeseries: def __init__(self, DataDir, @@ -99,8 +100,11 @@ def Q_over_time(self): station_results_df = pd.DataFrame(station_results) station_results_df.to_csv(f'{self.DataDir}/stations/timeseries/discharge_timeseries_{stationname}_{self.leadtime}.csv', index=False) print(f"Finished station {stationname}") + + + if __name__ =='__main__': for leadtime in cfg.leadtimes: timeseries = GloFAS_timeseries(DataDir=cfg.DataDir, @@ -113,4 +117,4 @@ def Q_over_time(self): end_date=None, IDhead='Station names', percentile=(100-(cfg.triggerProb*100))) # percentile is 1 - the probability of exceedence. 
At 40 percent percentile, there is a 60 percent probability of this value being exceeded - timeseries.Q_over_time() \ No newline at end of file + Q_timeseries.Q_over_time() diff --git a/GloFAS/GloFAS_prep/station_to_glofas.py b/GloFAS/GloFAS_prep/station_to_glofas.py index 7090e2d..3257e9f 100644 --- a/GloFAS/GloFAS_prep/station_to_glofas.py +++ b/GloFAS/GloFAS_prep/station_to_glofas.py @@ -22,7 +22,7 @@ glofas_x, glofas_y, area_diff = find_corresponding_point_within_box( station_lon=row['Lon'], station_lat=row['Lat'], - ups_area_point_m=row['Catchment area (km2)'] * 1e6, # Convert from km² to m² + ups_area_point_m=row['Catchment area (km2)']*1e6, # Convert from km² to m² stationName=row['Station names'], glofas_ups_area_xr=glofas_ups_area_da, radius_m=10000 diff --git a/comparison/observation/HydroImpact.py b/comparison/observation/HydroImpact.py index 656ded1..4d94091 100644 --- a/comparison/observation/HydroImpact.py +++ b/comparison/observation/HydroImpact.py @@ -6,6 +6,7 @@ from comparison.pointMatching import attributePoints_to_Polygon import geopandas as gpd import numpy as np +from pyextremes import EVA from scipy.stats import genextreme def parse_date_with_fallback(date_str, year): @@ -59,54 +60,9 @@ def z_RP_station(HydroStations_RP_file, StationName, RP): return z_RP -def QRP_fit (hydro_df, RP): - # Extract the annual maximum discharge values - hydro_df['Year'] = hydro_df['Date'].dt.year - annual_max_discharge = hydro_df.groupby('Year')['Value'].max() - # Fit a Gumbel distribution to the annual maximum discharge values - loc, scale = stats.gumbel_r.fit(annual_max_discharge) - # Calculate the discharge value corresponding to the return period - discharge_value = stats.gumbel_r.ppf(1 - 1/RP, loc, scale) - return discharge_value - -def Q_GEV_fit(hydro_df, percentile): - """ - Fits a GEV distribution to the daily discharge values and calculates the discharge - corresponding to a given percentile. 
- - Parameters: - hydro_df (pd.DataFrame): A dataframe with at least 'Date' and 'Value' columns. - 'Date' should be a datetime object and 'Value' is the discharge value. - percentile (float): Percentile for which to compute the discharge value (between 0 and 100). - - Returns: - float: The discharge value corresponding to the given percentile. - """ - # Ensure 'Date' column is a datetime object - hydro_df['Date'] = pd.to_datetime(hydro_df['Date']) - - # Extract daily discharge values - daily_discharge = hydro_df['Value'] - # Remove non-finite values - daily_discharge_cleaned = daily_discharge[np.isfinite(daily_discharge)] - - # Check if there are still issues - if daily_discharge_cleaned.empty: - raise ValueError("All data was non-finite after cleaning. Please check your dataset.") - - # Fit a GEV distribution - shape, loc, scale = genextreme.fit(daily_discharge_cleaned) - - - # Calculate the discharge value corresponding to the specified percentile - discharge_value = genextreme.ppf(percentile / 100, shape, loc=loc, scale=scale) - - return discharge_value - - -def stampHydroTrigger(hydro_df, RP, StationName): +def stampHydroTrigger(hydro_df, StationName, temporality_of_extremity, probability, distributionType): """ Adds a 'trigger' column to the hydro_df DataFrame indicating whether the 'Value' exceeds the QRP threshold. 
@@ -126,8 +82,8 @@ def stampHydroTrigger(hydro_df, RP, StationName): #QRP = QRP_station(HydroStations_RP_file, StationName, RP) Q_station = hydro_df["Value"] - if RP < 21: - QRP= QRP_fit (hydro_df, RP) + if distributionType =='Gumbel': + QRP= QRP_fit (hydro_df, probability) else: # assuming above 20 is percentile, RP is percentile instead print ('applying a GEV, assuming your RP is in percentiles') QRP = Q_GEV_fit (hydro_df, RP) @@ -206,7 +162,7 @@ def loop_over_stations(station_csv, DataDir, RP): RP = float(RP) station_df = pd.read_csv (station_csv, header=0) #print (station_df.columns) - hydrodir = rf"{DataDir}/DNHMali_2019\Q_stations" + hydrodir = rf"{DataDir}/DNHMali_2019/Q_stations" all_events = [] for _, stationrow in station_df.iterrows(): StationName = stationrow['StationName'] @@ -252,7 +208,7 @@ def events_per_adm (DataDir, admPath, adminLevel, station_csv, StationDataDir, a if __name__=="__main__": - + # running this script for the GloFAS #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv")) event_df = loop_over_stations (cfg.DNHstations, cfg.DataDir, 1) event_gdf = events_per_adm (cfg.DataDir, cfg.admPath, cfg.adminLevel, cfg.DNHstations, cfg.stationsDir, event_df, 'Observation', 1) \ No newline at end of file diff --git a/comparison/observation/thresholds.py b/comparison/observation/thresholds.py new file mode 100644 index 0000000..bd0df10 --- /dev/null +++ b/comparison/observation/thresholds.py @@ -0,0 +1,97 @@ +import pandas as pd +import GloFAS.GloFAS_prep.configuration as cfg +import pyextremes as pe +from pyextremes import EVA +from scipy.stats import genextreme + +def csv_to_cleaned_series (csv): + df = pd.read_csv( + Q_Bamako_csv, + index_col=0, # setting ValidTime as index column + parse_dates=True, # parsing dates in ValidTime + ) + + df['percentile_40.0'] = df['percentile_40.0'].interpolate(method='time') + series = df['percentile_40.0'] + series = 
series.loc[series.index < pd.to_datetime('2023-01-01')] + series = ( + series + .sort_index(ascending=True) + .astype(float) + .dropna() + ) + return series +def Q_Gumbel_fit_RP (hydro_df, RP): + + # Extract the annual maximum discharge values + hydro_df['Year'] = hydro_df['Date'].dt.year + annual_max_discharge = hydro_df.groupby('Year')['Value'].max() + + # Fit a Gumbel distribution to the annual maximum discharge values + loc, scale = stats.gumbel_r.fit(annual_max_discharge) + # Calculate the discharge value corresponding to the return period + discharge_value = stats.gumbel_r.ppf(1 - 1/RP, loc, scale) + return discharge_value + +def Q_Gumbel_fit_percentile (hydro_df, percentile): + # now for + return discharge_value + +def Q_GEV_fit_RP (hydro_df, RP): + return discharge_value + +def Q_GEV_fit_percentile (hydro_df, percentile): + """ + Fits a GEV distribution to the daily discharge values and calculates the discharge + corresponding to a given percentile. + + Parameters: + hydro_df (pd.DataFrame): A dataframe with at least 'Date' and 'Value' columns. + 'Date' should be a datetime object and 'Value' is the discharge value. + percentile (float): Percentile for which to compute the discharge value (between 0 and 100). + + Returns: + float: The discharge value corresponding to the given percentile. + """ + # Ensure 'Date' column is a datetime object + hydro_df['Date'] = pd.to_datetime(hydro_df['Date']) + + # Extract daily discharge values + daily_discharge = hydro_df['Value'] + # Remove non-finite values + daily_discharge_cleaned = daily_discharge[np.isfinite(daily_discharge)] + + # Check if there are still issues + if daily_discharge_cleaned.empty: + raise ValueError("All data was non-finite after cleaning. 
Please check your dataset.") + + # Fit a GEV distribution + shape, loc, scale = genextreme.fit(daily_discharge_cleaned) + + + # Calculate the discharge value corresponding to the specified percentile + discharge_value = genextreme.ppf(percentile / 100, shape, loc=loc, scale=scale) + + return discharge_value + +if __name__ == '__main__': + station = 'Bamako' + leadtime = 168 + Q_Bamako_csv = f"{cfg.stationsDir}/GloFAS_Q/timeseries/discharge_timeseries_{station}_{leadtime}.csv" + series = csv_to_cleaned_series(Q_Bamako_csv) + model = EVA(series) + model.get_extremes( + method='BM', # Block Maxima method + block_size="365D", # One year per block + ) + + #model.plot_extremes() + gev_fit = model.fit_model() + model.plot_diagnostic(alpha=0.95) + summary = model.get_summary( + return_period=[1, 1.5, 2, 5, 10, 25, 50, 100, 250, 500, 1000], + alpha=0.95, # confidence interval + n_samples=1000, + ) + print (summary) + diff --git a/comparison/pointMatching.py b/comparison/pointMatching.py index a986243..4a428cb 100644 --- a/comparison/pointMatching.py +++ b/comparison/pointMatching.py @@ -170,8 +170,8 @@ def find_corresponding_point_within_box(station_lon, station_lat, ups_area_point plt.xlabel("Longitude") plt.ylabel("Latitude") plt.legend(loc="upper right", fontsize=10, frameon=True) - plt.show() plt.savefig(f'{cfg.DataDir}/GloFAS_station_for_DNH_{stationName}.png') + plt.show() plt.close() return model_point_lon, model_point_lat, best_area_diff