From 6adb870b4a6fe490afe70015b94a89da9852028a Mon Sep 17 00:00:00 2001 From: ERKuipers Date: Fri, 13 Dec 2024 16:09:21 +0100 Subject: [PATCH 1/3] new thresholds: need to be adjusted, i actually really dont have time to calculate the thresholds --- comparison/observation/HydroImpact.py | 4 +- comparison/observation/thresholds.py | 78 ++++++++++++++++++--------- 2 files changed, 55 insertions(+), 27 deletions(-) diff --git a/comparison/observation/HydroImpact.py b/comparison/observation/HydroImpact.py index 4d94091..d08876a 100644 --- a/comparison/observation/HydroImpact.py +++ b/comparison/observation/HydroImpact.py @@ -59,9 +59,7 @@ def z_RP_station(HydroStations_RP_file, StationName, RP): z_RP = HydroStations_RP_df[HydroStations_RP_df['StationName'] == StationName][f'{RP}'].values return z_RP - - - + def stampHydroTrigger(hydro_df, StationName, temporality_of_extremity, probability, distributionType): """ Adds a 'trigger' column to the hydro_df DataFrame indicating whether the 'Value' exceeds the QRP threshold. diff --git a/comparison/observation/thresholds.py b/comparison/observation/thresholds.py index bd0df10..ccff81c 100644 --- a/comparison/observation/thresholds.py +++ b/comparison/observation/thresholds.py @@ -6,7 +6,7 @@ def csv_to_cleaned_series (csv): df = pd.read_csv( - Q_Bamako_csv, + csv, index_col=0, # setting ValidTime as index column parse_dates=True, # parsing dates in ValidTime ) @@ -15,16 +15,16 @@ def csv_to_cleaned_series (csv): series = df['percentile_40.0'] series = series.loc[series.index < pd.to_datetime('2023-01-01')] series = ( - series - .sort_index(ascending=True) - .astype(float) - .dropna() - ) + series + .sort_index(ascending=True) + .astype(float) + .dropna() + ) return series def Q_Gumbel_fit_RP (hydro_df, RP): # Extract the annual maximum discharge values - hydro_df['Year'] = hydro_df['Date'].dt.year + hydro_df['Year'] = hydro_df['ValidTime'].dt.year annual_max_discharge = hydro_df.groupby('Year')['Value'].max() # Fit a Gumbel distribution to the annual maximum discharge values @@ -33,13 +33,40 @@ def Q_Gumbel_fit_RP (hydro_df, RP): discharge_value = stats.gumbel_r.ppf(1 - 1/RP, loc, scale) return discharge_value -def Q_Gumbel_fit_percentile (hydro_df, percentile): - # now for - return discharge_value +def Q_Gumbel_fit_percentile(hydro_df, percentile): + # Extract the annual maximum discharge values + hydro_df['Year'] = hydro_df['ValidTime'].dt.year + annual_max_discharge = hydro_df.groupby('Year')['Value'].max() -def Q_GEV_fit_RP (hydro_df, RP): + # Fit a Gumbel distribution to the annual maximum discharge values + loc, scale = stats.gumbel_r.fit(annual_max_discharge) + + # Convert the percentile to a probability (e.g., 95th percentile = 0.95) + probability = percentile / 100.0 + probplot(annual_max_discharge, dist="genextreme", sparams=(shape, loc, scale), plot=plt) + plt.show() + # Calculate the discharge value corresponding to the percentile + discharge_value = stats.gumbel_r.ppf(probability, loc, scale) return discharge_value +def Q_GEV_fit_RP (series): + model = EVA(series) + model.get_extremes( + method='BM', # Block Maxima method + block_size="365D", # One year per block + ) + + #model.plot_extremes() + gev_fit = model.fit_model() + model.plot_diagnostic(alpha=0.95) + summary = model.get_summary( + return_period=[1, 1.5, 2, 5, 10, 25, 50, 100, 250, 500, 1000], + alpha=0.95, # confidence interval + n_samples=1000, + ) + + return summary + def Q_GEV_fit_percentile (hydro_df, percentile): """ Fits a GEV distribution to the daily discharge values and 
calculates the discharge @@ -53,31 +80,30 @@ def Q_GEV_fit_percentile (hydro_df, percentile): Returns: float: The discharge value corresponding to the given percentile. """ - # Ensure 'Date' column is a datetime object - hydro_df['Date'] = pd.to_datetime(hydro_df['Date']) - - # Extract daily discharge values - daily_discharge = hydro_df['Value'] - # Remove non-finite values - daily_discharge_cleaned = daily_discharge[np.isfinite(daily_discharge)] - + hydro_df['Year'] = hydro_df['ValidTime'].dt.year + annual_max_discharge = hydro_df.groupby('Year')['Value'].max() # Check if there are still issues - if daily_discharge_cleaned.empty: + if series.empty: raise ValueError("All data was non-finite after cleaning. Please check your dataset.") # Fit a GEV distribution - shape, loc, scale = genextreme.fit(daily_discharge_cleaned) - - + shape, loc, scale = genextreme.fit(series) # Calculate the discharge value corresponding to the specified percentile discharge_value = genextreme.ppf(percentile / 100, shape, loc=loc, scale=scale) - return discharge_value if __name__ == '__main__': station = 'Bamako' leadtime = 168 Q_Bamako_csv = f"{cfg.stationsDir}/GloFAS_Q/timeseries/discharge_timeseries_{station}_{leadtime}.csv" + df = pd.read_csv( + Q_Bamako_csv, + index_col=0, # setting ValidTime as index column + parse_dates=True, # parsing dates in ValidTime + ) + + df['percentile_40.0'] = df['percentile_40.0'].interpolate(method='time') + series = csv_to_cleaned_series(Q_Bamako_csv) model = EVA(series) model.get_extremes( @@ -93,5 +119,9 @@ def Q_GEV_fit_percentile (hydro_df, percentile): alpha=0.95, # confidence interval n_samples=1000, ) + print (summary) + percentile95_GEV = Q_GEV_fit_percentile(df, 95) + percentile95_Gumbel = Q_Gumbel_fit_percentile (df, 95) + print (percentile95) From 33600f6d383fbbaea389ddc138af0d764ec8dcf8 Mon Sep 17 00:00:00 2001 From: ERKuipers Date: Mon, 16 Dec 2024 12:40:17 +0100 Subject: [PATCH 2/3] hydroimpact for glfoas --- GloFAS/GloFAS_prep/configuration.py | 3 +- comparison/observation/HydroImpact.py | 104 +++++---- comparison/observation/thresholds.py | 307 +++++++++++++++++++------- 3 files changed, 296 insertions(+), 118 deletions(-) diff --git a/GloFAS/GloFAS_prep/configuration.py b/GloFAS/GloFAS_prep/configuration.py index 4b68f95..b28ebc3 100644 --- a/GloFAS/GloFAS_prep/configuration.py +++ b/GloFAS/GloFAS_prep/configuration.py @@ -23,7 +23,8 @@ settlements_tif = DataDir / "GlobalHumanSettlement/GHS_BUILT_S_E2030_GLOBE_R2023A_54009_100_V1_0.tif" crs = f'EPSG:4326' -RPsyr = [1.5, 2.0, 5.0, 10.0] # return period threshold in years +RPsyr = [1.5, 2.0, 5.0, 10.0] # return period threshold in years +percentiles = [95, 98, 99] leadtimes = [72, 96, 120,144, 168] # add also 24hours startYear = 2004 # could be 2004 but needs to be 2016 since there is no google data available before endYear = 2023 # 00:00 1st of january of that year, so up to but not including diff --git a/comparison/observation/HydroImpact.py b/comparison/observation/HydroImpact.py index d08876a..3758b34 100644 --- a/comparison/observation/HydroImpact.py +++ b/comparison/observation/HydroImpact.py @@ -4,6 +4,7 @@ from GloFAS.GloFAS_prep.text_formatter import remove_accents, capitalize import scipy.stats as stats from comparison.pointMatching import attributePoints_to_Polygon +from comparison.observation.thresholds import Q_Gumbel_fit_percentile, Q_Gumbel_fit_RP import geopandas as gpd import numpy as np from pyextremes import EVA @@ -31,36 +32,16 @@ def create_datetime(row): return 
parse_date_with_fallback(row['Date'], row['Year']) hydro_df_long['ParsedDate'] = hydro_df_long.apply(create_datetime, axis=1) - # Drop rows where ParsedDate is None (invalid dates) hydro_df_long = hydro_df_long.dropna(subset=["ParsedDate"]) - # Drop the now redundant 'Year' and 'Date' columns if not needed hydro_df_long = hydro_df_long.drop(columns=["Year", "Date"]) - # Rename ParsedDate back to Date hydro_df_long.rename(columns={"ParsedDate": "Date"}, inplace=True) - # Display the reshaped DataFrame return hydro_df_long - -def z_RP_station(HydroStations_RP_file, StationName, RP): - '''waterlevel - ''' - HydroStations_RP_df = pd.read_csv(HydroStations_RP_file) - - # Remove accents from StationName - StationName = remove_accents(StationName) - # Handle NaN values and ensure all are strings before applying the function - HydroStations_RP_df['StationName'] = HydroStations_RP_df['StationName'].fillna("").astype(str).apply(remove_accents) - - # Filter or calculate QRP based on RP and StationName - z_RP = HydroStations_RP_df[HydroStations_RP_df['StationName'] == StationName][f'{RP}'].values - - return z_RP - -def stampHydroTrigger(hydro_df, StationName, temporality_of_extremity, probability, distributionType): +def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, value_col='Value'): """ Adds a 'trigger' column to the hydro_df DataFrame indicating whether the 'Value' exceeds the QRP threshold. @@ -75,31 +56,32 @@ def stampHydroTrigger(hydro_df, StationName, temporality_of_extremity, probabili - pd.DataFrame: Copy of the input DataFrame with an additional 'trigger' column. """ # Ensure "Value" column exists in the DataFrame - if "Value" not in hydro_df.columns: - raise ValueError("The input DataFrame must contain a 'Value' column.") + if value_col not in hydro_df.columns: + raise ValueError("The input DataFrame must contain a 'Value' column, please specify the name correctly") #QRP = QRP_station(HydroStations_RP_file, StationName, RP) - Q_station = hydro_df["Value"] + Q_station = hydro_df[value_col] - if distributionType =='Gumbel': - QRP= QRP_fit (hydro_df, probability) - else: # assuming above 20 is percentile, RP is percentile instead - print ('applying a GEV, assuming your RP is in percentiles') - QRP = Q_GEV_fit (hydro_df, RP) + if type_of_extremity == 'RP': + Q_prob= Q_Gumbel_fit_RP (hydro_df, probability) + elif type_of_extremity == 'percentile': # assuming above 20 is percentile, RP is percentile instead + Q_prob = Q_GEV_fit (hydro_df, probability) + else: + print ('no such type of extremity') #print (f"for {StationName} : return period Q= {QRP}") - if not isinstance(QRP, (int, float)): - raise ValueError(f"Expected QRP to be a scalar (int or float), but got {type(QRP)}.") + if not isinstance(Q_prob, (int, float)): + raise ValueError(f"Expected QRP to be a scalar (int or float), but got {type(Q_prob)}.") # Calculate the QRP threshold # Copy the DataFrame and add the 'trigger' column hydro_trigger_df = hydro_df.copy() - hydro_trigger_df['Trigger'] = (hydro_trigger_df['Value'] > QRP).astype(int) + hydro_trigger_df['Trigger'] = (hydro_trigger_df[value_col] > Q_prob).astype(int) return hydro_trigger_df -def createEvent(trigger_df): +def createEvent(trigger_df, date_col='Date'): # Sort the dataframe by 'ValidTime', then by administrative unit: so that for each administrative unit, they are # first sort on valid time - trigger_df = trigger_df.sort_values(by=[f'Date']).reset_index(drop=True) + trigger_df = trigger_df.sort_values(by=[f'{date_col}']).reset_index(drop=True) # 
Prepare commune info with geometry for event creation event_data = [] @@ -118,10 +100,10 @@ def createEvent(trigger_df): # Checking if the current row and next row are both part of an event (Trigger == 1), and the trigger happens in the same adiministrative unit elif row['Trigger'] == 1 and r + 1 < len(trigger_df) and trigger_df.iloc[r + 1]['Trigger'] == 1: Event = 1 - StartDate = row['Date'] + StartDate = row[f'{date_col}'] # Continue until the end of the event where 'Trigger' is no longer 1 while r < len(trigger_df) and trigger_df.iloc[r]['Trigger'] == 1: - possible_endtime = trigger_df.iloc[r]['Date'] + possible_endtime = trigger_df.iloc[r][f'{date_col}'] processed_rows[r] = True # Mark row as processed row = trigger_df.iloc[r] r += 1 @@ -156,8 +138,8 @@ def createEvent(trigger_df): events_df = pd.DataFrame(columns=['Observation', 'Start Date', 'End Date']) return events_df -def loop_over_stations(station_csv, DataDir, RP): - RP = float(RP) +def loop_over_stations(station_csv, DataDir, type_of_extremity, probability, value_col='Value'): + probability = float(probability) station_df = pd.read_csv (station_csv, header=0) #print (station_df.columns) hydrodir = rf"{DataDir}/DNHMali_2019/Q_stations" @@ -173,7 +155,31 @@ def loop_over_stations(station_csv, DataDir, RP): # print (f'no discharge measures found for station {StationName} in {BasinName}') continue - trigger_df = stampHydroTrigger (hydro_df, RP, StationName) + trigger_df = stampHydroTrigger (hydro_df, StationName, type_of_extremity, probability, value_col) + event_df = createEvent (trigger_df) + event_df ['StationName'] = StationName + all_events.append (event_df) + + #generate the gdf to merge with where the points are attributed to the respective administrative units + all_events_df = pd.concat (all_events, ignore_index=True) + all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_RP_{RP}yr.csv") + return all_events_df + +def loop_over_stations_pred(station_csv, stationsDir, probability, type_of_extremity, value_col, leadtime): + probability = float(probability) + station_df = pd.read_csv (station_csv, header=0) + all_events = [] + for _, stationrow in station_df.iterrows(): + StationName = stationrow['StationName'] + BasinName= stationrow['Basin'] + "C:\Users\els-2\MaliGloFAS\data\stations\GloFAS_Q\timeseries\discharge_timeseries_Guelelinkoro_72.csv" + + + glofas_csv = rf"{stationsDir}/GloFAS_Q/timeseries/discharge_timeseries_{StationName}_{leadtime}.csv" + glofas_df = pd.read_csv(glofas_csv, header=0) + + + trigger_df = stampHydroTrigger (glofas_df, StationName, type_of_extremity, probability, value_col) event_df = createEvent (trigger_df) event_df ['StationName'] = StationName all_events.append (event_df) @@ -206,7 +212,19 @@ def events_per_adm (DataDir, admPath, adminLevel, station_csv, StationDataDir, a if __name__=="__main__": - # running this script for the GloFAS - #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv")) - event_df = loop_over_stations (cfg.DNHstations, cfg.DataDir, 1) - event_gdf = events_per_adm (cfg.DataDir, cfg.admPath, cfg.adminLevel, cfg.DNHstations, cfg.stationsDir, event_df, 'Observation', 1) \ No newline at end of file + + station_df = pd.read_csv (cfg.DNHstations, header=0) + #print (station_df.columns) + date_col = 'ValidTime' + value_col = 'percentile_40.0' # we're interested in 60% probability of being exceeded amongst the ensemble members + all_events = [] + for leadtime in 
cfg.leadtimes: + for RP in cfg.RPsyr: + loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, RP, 'RP', value_col, leadtime) + for percentile in cfg.percentiles: + loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, percentile, 'percentile', value_col, leadtime) + + # # running this script for the GloFAS + # #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv")) + # event_df = loop_over_stations (cfg.DNHstations, cfg.DataDir, 1) + # event_gdf = events_per_adm (cfg.DataDir, cfg.admPath, cfg.adminLevel, cfg.DNHstations, cfg.stationsDir, event_df, 'Observation', 1) \ No newline at end of file diff --git a/comparison/observation/thresholds.py b/comparison/observation/thresholds.py index ccff81c..abab528 100644 --- a/comparison/observation/thresholds.py +++ b/comparison/observation/thresholds.py @@ -2,15 +2,61 @@ import GloFAS.GloFAS_prep.configuration as cfg import pyextremes as pe from pyextremes import EVA -from scipy.stats import genextreme +from scipy.stats import genextreme, gumbel_r +import matplotlib.pyplot as plt +from scipy.stats import probplot +import matplotlib.pyplot as plt +import numpy as np -def csv_to_cleaned_series (csv): - df = pd.read_csv( - csv, - index_col=0, # setting ValidTime as index column - parse_dates=True, # parsing dates in ValidTime +def calculate_max_discharge(hydro_df: pd.DataFrame, + value_col: str = 'Value', + timescale: str = 'Year', + incomplete_lastyear: (str,int) = 2023 # not considering the maximum of incomplete years, as these are not representative of the season + ) -> pd.Series: + """ + Calculates the maximum discharge values for a given timescale. + + Parameters: + hydro_df (pd.DataFrame): DataFrame with discharge data. The index must be datetime or convertible to datetime. + value_col (str): The column name of the discharge values. + timescale (str): 'Year' for annual maxima, 'Day' for daily maxima. + + Returns: + pd.Series: Maximum discharge values with the corresponding timescale as the index. + """ + if not isinstance(hydro_df.index, pd.DatetimeIndex): + hydro_df.index = pd.to_datetime(hydro_df.index) + + hydro_df = hydro_df.copy() + + hydro_df = hydro_df.loc[hydro_df.index < pd.to_datetime(f'{incomplete_lastyear}-01-01')] + hydro_df = ( + hydro_df + .sort_index(ascending=True) + .astype(float) + .dropna() ) + if timescale == 'Year': + hydro_df['Year'] = hydro_df.index.year + return hydro_df.groupby('Year')[value_col].max() + elif timescale == 'Day': + hydro_df['Date'] = hydro_df.index.date + return hydro_df.groupby('Date')[value_col].max() + else: + raise ValueError("Invalid timescale. Use 'Year' or 'Day'.") +def csv_to_cleaned_series(csv: str) -> pd.Series: + """ + Reads a CSV file and prepares a cleaned pandas Series for analysis. + + :param csv: Path to the CSV file containing discharge data. + :return: A cleaned pandas Series of discharge values. 
+ """ + df = pd.read_csv( + csv, + index_col=0, # setting ValidTime as index column + parse_dates=True, # parsing dates in ValidTime + ) df['percentile_40.0'] = df['percentile_40.0'].interpolate(method='time') series = df['percentile_40.0'] series = series.loc[series.index < pd.to_datetime('2023-01-01')] @@ -19,109 +65,222 @@ def csv_to_cleaned_series (csv): .sort_index(ascending=True) .astype(float) .dropna() - ) + ) return series -def Q_Gumbel_fit_RP (hydro_df, RP): - # Extract the annual maximum discharge values - hydro_df['Year'] = hydro_df['ValidTime'].dt.year - annual_max_discharge = hydro_df.groupby('Year')['Value'].max() +def Q_GEV_fit(hydro_df: pd.DataFrame, value_col: str = 'Value', timescale='Year') -> tuple: + """ + Fits a GEV distribution to the annual maximum discharge values and calculates the discharge + corresponding to a given percentile. + + Parameters: + hydro_df (pd.DataFrame): DataFrame with discharge data. Index must be datetime. + percentile (float): Percentile to calculate (0-100). + value_col (str): The column name for discharge values. + + Returns: + shape, loc, scale: tuple of corresponding fit + """ + # Calculate annual maximum discharges + daily_max_discharge = calculate_max_discharge(hydro_df, value_col, timescale=timescale) - # Fit a Gumbel distribution to the annual maximum discharge values - loc, scale = stats.gumbel_r.fit(annual_max_discharge) - # Calculate the discharge value corresponding to the return period - discharge_value = stats.gumbel_r.ppf(1 - 1/RP, loc, scale) + # Fit a GEV distribution + shape_GEV, loc_GEV, scale_GEV = genextreme.fit(daily_max_discharge) + return shape_GEV, loc_GEV, scale_GEV + +def Q_GEV_fit_percentile (hydro_df: pd.DataFrame, percentile: float, value_col: str = 'Value') -> float: + shape,loc,scale = Q_GEV_fit (hydro_df, value_col, timescale='Day') + # Calculate the discharge value corresponding to the percentile + discharge_value = genextreme.ppf(percentile / 100, shape, loc=loc, scale=scale) return discharge_value -def Q_Gumbel_fit_percentile(hydro_df, percentile): - # Extract the annual maximum discharge values - hydro_df['Year'] = hydro_df['ValidTime'].dt.year - annual_max_discharge = hydro_df.groupby('Year')['Value'].max() - # Fit a Gumbel distribution to the annual maximum discharge values - loc, scale = stats.gumbel_r.fit(annual_max_discharge) +def Q_Gumbel_fit_percentile(hydro_df: pd.DataFrame, percentile: float, value_col: str = 'Value') -> float: + """ + Fits a Gumbel distribution to the annual maximum discharge values and calculates the discharge + corresponding to a given percentile. - # Convert the percentile to a probability (e.g., 95th percentile = 0.95) - probability = percentile / 100.0 - probplot(annual_max_discharge, dist="genextreme", sparams=(shape, loc, scale), plot=plt) - plt.show() + Parameters: + hydro_df (pd.DataFrame): DataFrame with discharge data. Index must be datetime. + percentile (float): Percentile to calculate (0-100). + value_col (str): The column name for discharge values. + + Returns: + float: The discharge value corresponding to the given percentile. 
+ """ + # Calculate annual maximum discharges + daily_max_discharge = calculate_max_discharge(hydro_df, value_col, timescale='Day') + + # Fit a Gumbel distribution + loc, scale = gumbel_r.fit(daily_max_discharge) + # Calculate the discharge value corresponding to the percentile - discharge_value = stats.gumbel_r.ppf(probability, loc, scale) + discharge_value = gumbel_r.ppf(percentile / 100, loc, scale) return discharge_value -def Q_GEV_fit_RP (series): - model = EVA(series) - model.get_extremes( - method='BM', # Block Maxima method - block_size="365D", # One year per block - ) +def Q_Gumbel_fit (hydro_df: pd.DataFrame, + value_col: str = 'Value', + timescale: str = 'Year'): + max_discharges = calculate_max_discharge(hydro_df, value_col, timescale=timescale) + loc_gumbel, scale_gumbel = gumbel_r.fit(max_discharges) + return loc_gumbel, scale_gumbel - #model.plot_extremes() +def Q_Gumbel_fit_RP(hydro_df: pd.DataFrame, + RP: float, + value_col: str = 'Value') -> float: + """ + Fits a Gumbel distribution to annual maximum discharge values and calculates + the discharge corresponding to a given return period. + + :param hydro_df: A DataFrame containing discharge data with date and value columns. + :param RP: The return period in years. + :param date_col: Name of the column containing datetime information (default: 'ValidTime'). + :param value_col: Name of the column containing discharge values (default: 'Value'). + :return: The discharge value corresponding to the return period. + """ + loc, scale = Q_Gumbel_fit (hydro_df, value_col, timescale='Year') + return gumbel_r.ppf(1 - 1 / RP, loc, scale) + + +def Q_GEV_fit_RP(hydro_df, value_col, RP): + """ + Fits a GEV distribution to a time series of discharge values and calculates + the discharge values corresponding to specified return periods. + + :param series: A pandas Series containing discharge values. + :param return_periods: A list of return periods in years. + :return: A dictionary with return periods as keys and corresponding discharge values. + """ + + shape,loc, scale = Q_GEV_fit(hydro_df, value_col, timescale='Year') + discharge_value = genextreme.ppf(1-1/RP, shape, loc=loc, scale=scale) + return discharge_value + + return discharge_value + +def Q_GEV_fit_RP_EVA(series: pd.Series, return_periods: list = [2, 5, 10, 25, 50, 100]) -> dict: + """ + Fits a GEV distribution to a time series of discharge values and calculates + the discharge values corresponding to specified return periods. + + :param series: A pandas Series containing discharge values. + :param return_periods: A list of return periods in years. + :return: A dictionary with return periods as keys and corresponding discharge values. + """ + model = EVA(series) + model.get_extremes(method='BM', block_size="365D") gev_fit = model.fit_model() model.plot_diagnostic(alpha=0.95) summary = model.get_summary( - return_period=[1, 1.5, 2, 5, 10, 25, 50, 100, 250, 500, 1000], - alpha=0.95, # confidence interval + return_period=return_periods, + alpha=0.95, # confidence interval n_samples=1000, - ) - + ) return summary -def Q_GEV_fit_percentile (hydro_df, percentile): +def plot_diagnostics(series, shape_gev, loc_gev, scale_gev, loc_gumbel, scale_gumbel, rp_values): """ - Fits a GEV distribution to the daily discharge values and calculates the discharge - corresponding to a given percentile. + Generate diagnostic and comparison plots for GEV and Gumbel fits. Parameters: - hydro_df (pd.DataFrame): A dataframe with at least 'Date' and 'Value' columns. 
- 'Date' should be a datetime object and 'Value' is the discharge value. - percentile (float): Percentile for which to compute the discharge value (between 0 and 100). - - Returns: - float: The discharge value corresponding to the given percentile. + series (pd.Series): Time series of discharge values. + shape_gev, loc_gev, scale_gev: GEV parameters. + loc_gumbel, scale_gumbel: Gumbel parameters. + rp_values (list): List of return periods. """ - hydro_df['Year'] = hydro_df['ValidTime'].dt.year - annual_max_discharge = hydro_df.groupby('Year')['Value'].max() - # Check if there are still issues - if series.empty: - raise ValueError("All data was non-finite after cleaning. Please check your dataset.") + # Sort data for plotting purposes + sorted_data = np.sort(series) + ecdf = np.arange(1, len(sorted_data) + 1) / len(sorted_data) + + # Theoretical values + gev_theoretical = genextreme.ppf(ecdf, shape_gev, loc=loc_gev, scale=scale_gev) + gumbel_theoretical = gumbel_r.ppf(ecdf, loc=loc_gumbel, scale=scale_gumbel) + + # Plot diagnostics + fig, axes = plt.subplots(2, 2, figsize=(10, 10)) + + # Q-Q plot + axes[0, 0].scatter(gev_theoretical, sorted_data, label='GEV', color='b') + axes[0, 0].scatter(gumbel_theoretical, sorted_data, label='Gumbel', color='r', alpha=0.6) + axes[0, 0].plot([min(sorted_data), max(sorted_data)], [min(sorted_data), max(sorted_data)], '--k') + axes[0, 0].set_title("Q-Q Plot") + axes[0, 0].set_xlabel("Theoretical") + axes[0, 0].set_ylabel("Observed") + axes[0, 0].legend() + + # P-P plot + axes[0, 1].scatter(ecdf, genextreme.cdf(sorted_data, shape_gev, loc=loc_gev, scale=scale_gev), label='GEV', color='b') + axes[0, 1].scatter(ecdf, gumbel_r.cdf(sorted_data, loc=loc_gumbel, scale=scale_gumbel), label='Gumbel', color='r', alpha=0.6) + axes[0, 1].plot([0, 1], [0, 1], '--k') + axes[0, 1].set_title("P-P Plot") + axes[0, 1].set_xlabel("Empirical CDF") + axes[0, 1].set_ylabel("Theoretical CDF") + axes[0, 1].legend() + + # Return Period Plot + gev_return = genextreme.ppf(1 - 1 / np.array(rp_values), shape_gev, loc=loc_gev, scale=scale_gev) + gumbel_return = gumbel_r.ppf(1 - 1 / np.array(rp_values), loc=loc_gumbel, scale=scale_gumbel) + axes[1, 0].scatter(rp_values, series.max(), color='k', label="Observed Data") + axes[1, 0].plot(rp_values, gev_return, '-b', label="GEV Fit") + axes[1, 0].plot(rp_values, gumbel_return, '-r', label="Gumbel Fit") + axes[1, 0].set_xscale('log') + axes[1, 0].set_title("Return Period Plot") + axes[1, 0].set_xlabel("Return Period (years)") + axes[1, 0].set_ylabel("Discharge") + axes[1, 0].legend() + + # Probability Density Function + x = np.linspace(min(series), max(series), 1000) + gev_pdf = genextreme.pdf(x, shape_gev, loc=loc_gev, scale=scale_gev) + gumbel_pdf = gumbel_r.pdf(x, loc=loc_gumbel, scale=scale_gumbel) + axes[1, 1].hist(series, bins=20, density=True, alpha=0.4, color='g', edgecolor='k') + axes[1, 1].plot(x, gev_pdf, '-b', label="GEV PDF") + axes[1, 1].plot(x, gumbel_pdf, '-r', label="Gumbel PDF") + axes[1, 1].set_title("Probability Density Function") + axes[1, 1].set_xlabel("Discharge") + axes[1, 1].set_ylabel("Density") + axes[1, 1].legend() + + plt.tight_layout() + plt.show() - # Fit a GEV distribution - shape, loc, scale = genextreme.fit(series) - # Calculate the discharge value corresponding to the specified percentile - discharge_value = genextreme.ppf(percentile / 100, shape, loc=loc, scale=scale) - return discharge_value -if __name__ == '__main__': +if __name__ == '__main__': station = 'Bamako' - leadtime = 168 + leadtime = 168 + 
return_periods = [1, 1.5, 2, 5, 10, 25, 50, 100] Q_Bamako_csv = f"{cfg.stationsDir}/GloFAS_Q/timeseries/discharge_timeseries_{station}_{leadtime}.csv" + df = pd.read_csv( Q_Bamako_csv, - index_col=0, # setting ValidTime as index column - parse_dates=True, # parsing dates in ValidTime - ) + index_col=0, # setting ValidTime as index column + parse_dates=True, # parsing dates in ValidTime + ) df['percentile_40.0'] = df['percentile_40.0'].interpolate(method='time') series = csv_to_cleaned_series(Q_Bamako_csv) - model = EVA(series) - model.get_extremes( - method='BM', # Block Maxima method - block_size="365D", # One year per block - ) - #model.plot_extremes() + # GEV modeling + model = EVA(series) + model.get_extremes(method='BM', block_size="365D") gev_fit = model.fit_model() model.plot_diagnostic(alpha=0.95) summary = model.get_summary( - return_period=[1, 1.5, 2, 5, 10, 25, 50, 100, 250, 500, 1000], - alpha=0.95, # confidence interval + return_period=return_periods, + alpha=0.95, # confidence interval n_samples=1000, - ) + ) + print(summary) + + # Percentile calculations + percentile95_GEV = Q_GEV_fit_percentile(df, 95, value_col='percentile_40.0') + percentile95_Gumbel = Q_Gumbel_fit_percentile(df, 95,value_col='percentile_40.0') + - print (summary) - percentile95_GEV = Q_GEV_fit_percentile(df, 95) - percentile95_Gumbel = Q_Gumbel_fit_percentile (df, 95) - print (percentile95) + print(f"95th Percentile (GEV): {percentile95_GEV}") + print(f"95th Percentile (Gumbel): {percentile95_Gumbel}") + shape_gev, loc_gev, scale_gev = Q_GEV_fit (df, value_col='percentile_40.0', timescale='Year') + loc_gumbel, scale_gumbel = Q_Gumbel_fit(df, value_col='percentile_40.0') + plot_diagnostics(series, shape_gev, loc_gev, scale_gev, loc_gumbel, scale_gumbel, return_periods) \ No newline at end of file From b78e5521b23b05221a4fcac6b8ff198c25410366 Mon Sep 17 00:00:00 2001 From: ERKuipers Date: Mon, 16 Dec 2024 18:26:50 +0100 Subject: [PATCH 3/3] update for observational data --- comparison/observation/HydroImpact.py | 86 ++++++++++++++++++++------- comparison/observation/thresholds.py | 12 ++-- 2 files changed, 73 insertions(+), 25 deletions(-) diff --git a/comparison/observation/HydroImpact.py b/comparison/observation/HydroImpact.py index 3758b34..ef8a14e 100644 --- a/comparison/observation/HydroImpact.py +++ b/comparison/observation/HydroImpact.py @@ -16,6 +16,7 @@ def parse_date_with_fallback(date_str, year): date = pd.to_datetime(f"{year} {date_str}", format="%Y %d/%m %H:%M") return date except ValueError: + # If the date is invalid (e.g., Feb 29 in non-leap years), return None #print(f"Invalid date skipped: {date_str} for year {year}") return None @@ -41,7 +42,7 @@ def create_datetime(row): # Display the reshaped DataFrame return hydro_df_long -def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, value_col='Value'): +def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, value_col='Value', date_col='Date'): """ Adds a 'trigger' column to the hydro_df DataFrame indicating whether the 'Value' exceeds the QRP threshold. @@ -55,16 +56,30 @@ def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, val Returns: - pd.DataFrame: Copy of the input DataFrame with an additional 'trigger' column. 
""" + # Assuming hydro_df is already loaded + if date_col in hydro_df.columns: + # Convert the date_col to datetime + hydro_df[date_col] = pd.to_datetime(hydro_df[date_col], format="%Y-%m-%d %H:%M", errors='coerce') + + # Check for any invalid dates + if hydro_df[date_col].isnull().any(): + print("Warning: Some dates in 'date_col' could not be parsed and are set to NaT.") + + # Set the index to the datetime column + hydro_df.set_index(date_col, inplace=True) + #print(f"Index successfully set to datetime using '{date_col}':\n{hydro_df.index}") + else: + print(f"Error: Column '{date_col}' does not exist in the DataFrame, hopefully teh date is the index") + # Ensure "Value" column exists in the DataFrame if value_col not in hydro_df.columns: raise ValueError("The input DataFrame must contain a 'Value' column, please specify the name correctly") #QRP = QRP_station(HydroStations_RP_file, StationName, RP) - Q_station = hydro_df[value_col] - + Q_station = hydro_df[value_col] # this is where it goes wrong!!! if type_of_extremity == 'RP': - Q_prob= Q_Gumbel_fit_RP (hydro_df, probability) + Q_prob= Q_Gumbel_fit_RP (hydro_df, probability, value_col=value_col) # results in infinite elif type_of_extremity == 'percentile': # assuming above 20 is percentile, RP is percentile instead - Q_prob = Q_GEV_fit (hydro_df, probability) + Q_prob = Q_Gumbel_fit_percentile (hydro_df, percentile, value_col) else: print ('no such type of extremity') #print (f"for {StationName} : return period Q= {QRP}") @@ -81,7 +96,15 @@ def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, val def createEvent(trigger_df, date_col='Date'): # Sort the dataframe by 'ValidTime', then by administrative unit: so that for each administrative unit, they are # first sort on valid time - trigger_df = trigger_df.sort_values(by=[f'{date_col}']).reset_index(drop=True) + if date_col in trigger_df.columns: + trigger_df = trigger_df.sort_values(by=[f'{date_col}']).reset_index(drop=True) + else: + try: + trigger_df = trigger_df.sort_index() + except: + raise ValueError (f'{date_col} not in trigger_df index and no set index either') + + # Prepare commune info with geometry for event creation event_data = [] @@ -92,18 +115,29 @@ def createEvent(trigger_df, date_col='Date'): r = 0 while r < len(trigger_df): row = trigger_df.iloc[r] + if processed_rows[r]: # If the row is already processed, continue to the next row r += 1 continue - # Checking if the current row and next row are both part of an event (Trigger == 1), and the trigger happens in the same adiministrative unit + # Checking if the current row and next row are both part of an event (Trigger == 1) + # and the trigger happens in the same administrative unit elif row['Trigger'] == 1 and r + 1 < len(trigger_df) and trigger_df.iloc[r + 1]['Trigger'] == 1: Event = 1 - StartDate = row[f'{date_col}'] + StartDate = row[f'{date_col}'] if date_col in trigger_df.columns else trigger_df.index[r] + # Continue until the end of the event where 'Trigger' is no longer 1 while r < len(trigger_df) and trigger_df.iloc[r]['Trigger'] == 1: - possible_endtime = trigger_df.iloc[r][f'{date_col}'] + # Check if the date column is part of the index or a column + if date_col in trigger_df.columns: + possible_endtime = trigger_df.iloc[r][f'{date_col}'] + else: # assuming it is the index + try: + possible_endtime = trigger_df.index[r] + except: + raise ValueError(f'{date_col} not in columns and not a valid index either') + processed_rows[r] = True # Mark row as processed row = trigger_df.iloc[r] r += 1 
@@ -138,7 +172,7 @@ def createEvent(trigger_df, date_col='Date'): events_df = pd.DataFrame(columns=['Observation', 'Start Date', 'End Date']) return events_df -def loop_over_stations(station_csv, DataDir, type_of_extremity, probability, value_col='Value'): +def loop_over_stations_obs(station_csv, DataDir, type_of_extremity, probability, value_col='Value'): probability = float(probability) station_df = pd.read_csv (station_csv, header=0) #print (station_df.columns) @@ -149,8 +183,8 @@ def loop_over_stations(station_csv, DataDir, type_of_extremity, probability, val BasinName= stationrow['Basin'] stationPath = rf"{hydrodir}/{BasinName}_{StationName}.csv" try: - hydro_df = transform_hydro (stationPath) print (f'calculating {StationName}, in {BasinName}') + hydro_df = transform_hydro (stationPath) # this actually looks good except: # print (f'no discharge measures found for station {StationName} in {BasinName}') continue @@ -162,7 +196,10 @@ def loop_over_stations(station_csv, DataDir, type_of_extremity, probability, val #generate the gdf to merge with where the points are attributed to the respective administrative units all_events_df = pd.concat (all_events, ignore_index=True) - all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_RP_{RP}yr.csv") + if type_of_extremity =='RP': + all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_RP_{probability}yr.csv") + elif type_of_extremity =='percentile': + all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_percentile{probability}.csv") return all_events_df def loop_over_stations_pred(station_csv, stationsDir, probability, type_of_extremity, value_col, leadtime): @@ -170,14 +207,18 @@ def loop_over_stations_pred(station_csv, stationsDir, probability, type_of_extre station_df = pd.read_csv (station_csv, header=0) all_events = [] for _, stationrow in station_df.iterrows(): - StationName = stationrow['StationName'] + StationName = remove_accents(stationrow['StationName']) BasinName= stationrow['Basin'] - "C:\Users\els-2\MaliGloFAS\data\stations\GloFAS_Q\timeseries\discharge_timeseries_Guelelinkoro_72.csv" - glofas_csv = rf"{stationsDir}/GloFAS_Q/timeseries/discharge_timeseries_{StationName}_{leadtime}.csv" - glofas_df = pd.read_csv(glofas_csv, header=0) - + try: + glofas_df = pd.read_csv(glofas_csv, + header=0, + index_col=0, # setting ValidTime as index column + parse_dates=True) + except: + print (f'no discharge series for station {StationName}') + continue trigger_df = stampHydroTrigger (glofas_df, StationName, type_of_extremity, probability, value_col) event_df = createEvent (trigger_df) @@ -186,7 +227,10 @@ def loop_over_stations_pred(station_csv, stationsDir, probability, type_of_extre #generate the gdf to merge with where the points are attributed to the respective administrative units all_events_df = pd.concat (all_events, ignore_index=True) - all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_RP_{RP}yr.csv") + if type_of_extremity =='RP': + all_events_df.to_csv (f"{stationsDir}/GloFAS_Q/GloFASstation_flood_events_RP{probability}yr_leadtime{leadtime}.csv") + elif type_of_extremity =='percentile': + all_events_df.to_csv (f"{stationsDir}/GloFAS_Q/GloFASstation_flood_events_percentile{probability}_leadtime{leadtime}.csv") return all_events_df def events_per_adm (DataDir, admPath, adminLevel, station_csv, StationDataDir, all_events_df, model, RP): @@ -220,9 +264,11 @@ def events_per_adm (DataDir, admPath, adminLevel, station_csv, 
StationDataDir, a all_events = [] for leadtime in cfg.leadtimes: for RP in cfg.RPsyr: - loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, RP, 'RP', value_col, leadtime) + loop_over_stations_obs (cfg.DNHstations, cfg.DataDir, 'RP', RP, value_col='Value') + #loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, RP, 'RP', value_col, leadtime) for percentile in cfg.percentiles: - loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, percentile, 'percentile', value_col, leadtime) + loop_over_stations_obs (cfg.DNHstations, cfg.DataDir, 'percentile', percentile, value_col='Value') + #loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, percentile, 'percentile', value_col, leadtime) # # running this script for the GloFAS # #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv")) diff --git a/comparison/observation/thresholds.py b/comparison/observation/thresholds.py index abab528..6d6762c 100644 --- a/comparison/observation/thresholds.py +++ b/comparison/observation/thresholds.py @@ -11,8 +11,8 @@ def calculate_max_discharge(hydro_df: pd.DataFrame, value_col: str = 'Value', timescale: str = 'Year', - incomplete_lastyear: (str,int) = 2023 # not considering the maximum of incomplete years, as these are not representative of the season - ) -> pd.Series: + incomplete_lastyear: (str,int) = 2023, # not considering the maximum of incomplete years, as these are not representative of the season + date_col: str ='Date') -> pd.Series: """ Calculates the maximum discharge values for a given timescale. @@ -25,6 +25,7 @@ def calculate_max_discharge(hydro_df: pd.DataFrame, pd.Series: Maximum discharge values with the corresponding timescale as the index. """ if not isinstance(hydro_df.index, pd.DatetimeIndex): + hydro_df [date_col] = hydro_df.index hydro_df.index = pd.to_datetime(hydro_df.index) hydro_df = hydro_df.copy() @@ -38,13 +39,14 @@ def calculate_max_discharge(hydro_df: pd.DataFrame, ) if timescale == 'Year': hydro_df['Year'] = hydro_df.index.year - return hydro_df.groupby('Year')[value_col].max() + + maximum = hydro_df.groupby('Year')[value_col].max() elif timescale == 'Day': hydro_df['Date'] = hydro_df.index.date - return hydro_df.groupby('Date')[value_col].max() + maximum = hydro_df.groupby('Date')[value_col].max() else: raise ValueError("Invalid timescale. Use 'Year' or 'Day'.") - + return maximum def csv_to_cleaned_series(csv: str) -> pd.Series: """ Reads a CSV file and prepares a cleaned pandas Series for analysis.
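
A minimal, self-contained sketch of the threshold logic these patches converge on: a Gumbel distribution fitted to annual maxima for return-period thresholds (as in Q_Gumbel_fit_RP), a GEV fitted to daily maxima for percentile thresholds (as in Q_GEV_fit_percentile), and trigger stamping where discharge exceeds the threshold (as in stampHydroTrigger). The synthetic series, seed, and parameter values below are illustrative assumptions, not project data; only the column and index conventions ('Value', a datetime index) follow the patched code.

    import numpy as np
    import pandas as pd
    from scipy.stats import gumbel_r, genextreme

    # Synthetic daily discharge series (illustrative only).
    rng = np.random.default_rng(42)
    dates = pd.date_range("2004-01-01", "2022-12-31", freq="D")
    hydro_df = pd.DataFrame(
        {"Value": rng.gumbel(loc=800, scale=150, size=len(dates))},
        index=dates,
    )

    # Return-period threshold: Gumbel fit on annual maxima (cf. Q_Gumbel_fit_RP).
    annual_max = hydro_df.groupby(hydro_df.index.year)["Value"].max()
    loc_g, scale_g = gumbel_r.fit(annual_max)
    RP = 5.0
    q_rp = gumbel_r.ppf(1 - 1 / RP, loc_g, scale_g)  # discharge exceeded on average once every RP years

    # Percentile threshold: GEV fit on daily maxima (cf. Q_GEV_fit_percentile).
    # With one value per day this is simply the cleaned series, mirroring
    # timescale='Day' in calculate_max_discharge.
    daily_max = hydro_df.groupby(hydro_df.index.date)["Value"].max()
    shape, loc_gev, scale_gev = genextreme.fit(daily_max)
    q_p95 = genextreme.ppf(95 / 100, shape, loc=loc_gev, scale=scale_gev)

    # Trigger stamping (cf. stampHydroTrigger): 1 where discharge exceeds the threshold.
    trigger_df = hydro_df.copy()
    trigger_df["Trigger"] = (trigger_df["Value"] > q_rp).astype(int)

    print(f"RP{RP:g} threshold: {q_rp:.1f} | 95th-percentile threshold: {q_p95:.1f} | "
          f"days above RP threshold: {int(trigger_df['Trigger'].sum())}")

Annual maxima are the natural block for return-period estimates (one exceedance expected every RP blocks), while daily maxima give a denser sample for percentile-style thresholds, which is why calculate_max_discharge in these patches supports both the 'Year' and 'Day' timescales.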