Commit

valentijn7 committed Dec 16, 2024
2 parents 5953904 + b78e552 commit 060a80b
Showing 3 changed files with 357 additions and 103 deletions.
3 changes: 2 additions & 1 deletion GloFAS/GloFAS_prep/configuration.py
@@ -23,7 +23,8 @@
settlements_tif = DataDir / "GlobalHumanSettlement/GHS_BUILT_S_E2030_GLOBE_R2023A_54009_100_V1_0.tif"

crs = 'EPSG:4326'
RPsyr = [1.5, 2.0, 5.0, 10.0] # return period threshold in years
percentiles = [95, 98, 99]
leadtimes = [72, 96, 120, 144, 168] # also add 24 hours
startYear = 2004 # must be 2016 when Google data is used, since none is available before then
endYear = 2023 # up to, but not including, 00:00 on 1 January of this year
160 changes: 111 additions & 49 deletions comparison/observation/HydroImpact.py
Expand Up @@ -4,6 +4,7 @@
from GloFAS.GloFAS_prep.text_formatter import remove_accents, capitalize
import scipy.stats as stats
from comparison.pointMatching import attributePoints_to_Polygon
from comparison.observation.thresholds import Q_Gumbel_fit_percentile, Q_Gumbel_fit_RP
import geopandas as gpd
import numpy as np
from pyextremes import EVA
@@ -15,6 +16,7 @@ def parse_date_with_fallback(date_str, year):
date = pd.to_datetime(f"{year} {date_str}", format="%Y %d/%m %H:%M")
return date
except ValueError:
# If the date is invalid (e.g., Feb 29 in a non-leap year), return None
#print(f"Invalid date skipped: {date_str} for year {year}")
return None
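# Illustration (not part of this commit): Feb 29 only parses in leap years,
# so the fallback silently drops it otherwise. Example values, not repository output:
#   parse_date_with_fallback("29/02 06:00", 2023)  -> None (2023 is not a leap year)
#   parse_date_with_fallback("29/02 06:00", 2024)  -> Timestamp('2024-02-29 06:00:00')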
@@ -31,38 +33,16 @@ def create_datetime(row):
return parse_date_with_fallback(row['Date'], row['Year'])

hydro_df_long['ParsedDate'] = hydro_df_long.apply(create_datetime, axis=1)

# Drop rows where ParsedDate is None (invalid dates)
hydro_df_long = hydro_df_long.dropna(subset=["ParsedDate"])

# Drop the now redundant 'Year' and 'Date' columns if not needed
hydro_df_long = hydro_df_long.drop(columns=["Year", "Date"])

# Rename ParsedDate back to Date
hydro_df_long.rename(columns={"ParsedDate": "Date"}, inplace=True)

# Display the reshaped DataFrame
return hydro_df_long

def z_RP_station(HydroStations_RP_file, StationName, RP):
'''Return the water-level (z) threshold for the given return period at a station.'''
HydroStations_RP_df = pd.read_csv(HydroStations_RP_file)

# Remove accents from StationName
StationName = remove_accents(StationName)

# Handle NaN values and ensure all are strings before applying the function
HydroStations_RP_df['StationName'] = HydroStations_RP_df['StationName'].fillna("").astype(str).apply(remove_accents)

# Filter or calculate QRP based on RP and StationName
z_RP = HydroStations_RP_df[HydroStations_RP_df['StationName'] == StationName][f'{RP}'].values

return z_RP



def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, value_col='Value', date_col='Date'):
"""
Adds a 'Trigger' column to the hydro_df DataFrame indicating whether `value_col` exceeds the discharge threshold for the given return period or percentile.
@@ -76,32 +56,55 @@
Returns:
- pd.DataFrame: Copy of the input DataFrame with an additional 'Trigger' column.
"""
# Assuming hydro_df is already loaded
if date_col in hydro_df.columns:
# Convert the date_col to datetime
hydro_df[date_col] = pd.to_datetime(hydro_df[date_col], format="%Y-%m-%d %H:%M", errors='coerce')

# Check for any invalid dates
if hydro_df[date_col].isnull().any():
print("Warning: Some dates in 'date_col' could not be parsed and are set to NaT.")

# Set the index to the datetime column
hydro_df.set_index(date_col, inplace=True)
#print(f"Index successfully set to datetime using '{date_col}':\n{hydro_df.index}")
else:
print(f"Error: Column '{date_col}' does not exist in the DataFrame, hopefully teh date is the index")

# Ensure "Value" column exists in the DataFrame
if "Value" not in hydro_df.columns:
raise ValueError("The input DataFrame must contain a 'Value' column.")
if value_col not in hydro_df.columns:
raise ValueError("The input DataFrame must contain a 'Value' column, please specify the name correctly")
#QRP = QRP_station(HydroStations_RP_file, StationName, RP)
Q_station = hydro_df["Value"]

if distributionType =='Gumbel':
QRP= QRP_fit (hydro_df, probability)
else: # assuming above 20 is percentile, RP is percentile instead
print ('applying a GEV, assuming your RP is in percentiles')
QRP = Q_GEV_fit (hydro_df, RP)
Q_station = hydro_df[value_col] # this is where it goes wrong!!!
if type_of_extremity == 'RP':
Q_prob= Q_Gumbel_fit_RP (hydro_df, probability, value_col=value_col) # results in infinite
elif type_of_extremity == 'percentile': # assuming above 20 is percentile, RP is percentile instead
Q_prob = Q_Gumbel_fit_percentile (hydro_df, percentile, value_col)
else:
print ('no such type of extremity')
#print (f"for {StationName} : return period Q= {QRP}")
if not isinstance(Q_prob, (int, float)):
raise ValueError(f"Expected Q_prob to be a scalar (int or float), but got {type(Q_prob)}.")

# Copy the DataFrame and add the 'trigger' column
hydro_trigger_df = hydro_df.copy()
hydro_trigger_df['Trigger'] = (hydro_trigger_df[value_col] > Q_prob).astype(int)
return hydro_trigger_df
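# Note (not part of this commit): the threshold helpers Q_Gumbel_fit_RP and
# Q_Gumbel_fit_percentile are imported from comparison.observation.thresholds,
# which is not shown in this diff. A minimal sketch of what a Gumbel-based
# threshold could look like (names and details are assumptions, not the
# repository's implementation), using scipy.stats.gumbel_r:
#
#   def Q_Gumbel_fit_RP_sketch(hydro_df, RP, value_col='Value'):
#       # fit a Gumbel distribution to the annual maxima (requires a datetime index)
#       annual_max = hydro_df[value_col].groupby(hydro_df.index.year).max().dropna()
#       loc, scale = stats.gumbel_r.fit(annual_max)
#       # discharge with an annual exceedance probability of 1/RP
#       return float(stats.gumbel_r.ppf(1 - 1 / RP, loc=loc, scale=scale))
#
#   def Q_Gumbel_fit_percentile_sketch(hydro_df, percentile, value_col='Value'):
#       # percentile of the fitted distribution over the full series
#       loc, scale = stats.gumbel_r.fit(hydro_df[value_col].dropna())
#       return float(stats.gumbel_r.ppf(percentile / 100.0, loc=loc, scale=scale))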

def createEvent(trigger_df, date_col='Date'):
# Sort the dataframe chronologically (by the date column, or by the index if the dates are the index),
# so that consecutive triggered rows can be grouped into events
if date_col in trigger_df.columns:
trigger_df = trigger_df.sort_values(by=[date_col]).reset_index(drop=True)
else:
try:
trigger_df = trigger_df.sort_index()
except Exception:
raise ValueError(f"'{date_col}' is not a column of trigger_df and the index is not sortable")



# Prepare commune info with geometry for event creation
event_data = []
@@ -112,18 +115,29 @@
r = 0
while r < len(trigger_df):
row = trigger_df.iloc[r]

if processed_rows[r]:
# If the row is already processed, continue to the next row
r += 1
continue

# Checking if the current row and next row are both part of an event (Trigger == 1)
# and the trigger happens in the same administrative unit
elif row['Trigger'] == 1 and r + 1 < len(trigger_df) and trigger_df.iloc[r + 1]['Trigger'] == 1:
Event = 1
StartDate = row[date_col] if date_col in trigger_df.columns else trigger_df.index[r]

# Continue until the end of the event where 'Trigger' is no longer 1
while r < len(trigger_df) and trigger_df.iloc[r]['Trigger'] == 1:
# Check whether the date is stored in a column or in the index
if date_col in trigger_df.columns:
possible_endtime = trigger_df.iloc[r][date_col]
else: # assume the dates are the index
try:
possible_endtime = trigger_df.index[r]
except Exception:
raise ValueError(f"'{date_col}' is not a column and the index does not hold valid dates")

processed_rows[r] = True # Mark row as processed
row = trigger_df.iloc[r]
r += 1
@@ -158,8 +172,8 @@ def createEvent(trigger_df):
events_df = pd.DataFrame(columns=['Observation', 'Start Date', 'End Date'])
return events_df
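# Illustrative usage (not part of this commit); the station name and values are
# made up, and the Gumbel fit on such a short series is for demonstration only:
#
#   toy_df = pd.DataFrame({
#       'Date': pd.date_range('2020-08-01', periods=6, freq='D').strftime("%Y-%m-%d %H:%M"),
#       'Value': [100.0, 450.0, 480.0, 470.0, 120.0, 110.0],
#   })
#   trigger_df = stampHydroTrigger(toy_df, 'SomeStation', 'RP', 5.0)
#   events_df = createEvent(trigger_df)
#   # events_df holds columns like 'Observation', 'Start Date', 'End Date', with one
#   # event spanning the consecutive days on which 'Value' exceeds the threshold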

def loop_over_stations_obs(station_csv, DataDir, type_of_extremity, probability, value_col='Value'):
probability = float(probability)
station_df = pd.read_csv (station_csv, header=0)
#print (station_df.columns)
hydrodir = rf"{DataDir}/DNHMali_2019/Q_stations"
@@ -169,20 +183,54 @@
BasinName= stationrow['Basin']
stationPath = rf"{hydrodir}/{BasinName}_{StationName}.csv"
try:
print(f'calculating {StationName}, in {BasinName}')
hydro_df = transform_hydro(stationPath)
except Exception:
# print (f'no discharge measures found for station {StationName} in {BasinName}')
continue

trigger_df = stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, value_col)
event_df = createEvent (trigger_df)
event_df ['StationName'] = StationName
all_events.append (event_df)

# concatenate events from all stations; the result is later merged with the admin units the station points fall in
all_events_df = pd.concat (all_events, ignore_index=True)
all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_RP_{RP}yr.csv")
if type_of_extremity =='RP':
all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_RP_{probability}yr.csv")
elif type_of_extremity =='percentile':
all_events_df.to_csv (f"{DataDir}/Observation/observationalStation_flood_events_percentile{probability}.csv")
return all_events_df

def loop_over_stations_pred(station_csv, stationsDir, probability, type_of_extremity, value_col, leadtime):
probability = float(probability)
station_df = pd.read_csv (station_csv, header=0)
all_events = []
for _, stationrow in station_df.iterrows():
StationName = remove_accents(stationrow['StationName'])
BasinName= stationrow['Basin']

glofas_csv = rf"{stationsDir}/GloFAS_Q/timeseries/discharge_timeseries_{StationName}_{leadtime}.csv"
try:
glofas_df = pd.read_csv(glofas_csv,
header=0,
index_col=0, # setting ValidTime as index column
parse_dates=True)
except Exception:
print (f'no discharge series for station {StationName}')
continue

trigger_df = stampHydroTrigger (glofas_df, StationName, type_of_extremity, probability, value_col)
event_df = createEvent (trigger_df)
event_df ['StationName'] = StationName
all_events.append (event_df)

# concatenate events from all stations; the result is later merged with the admin units the station points fall in
all_events_df = pd.concat (all_events, ignore_index=True)
if type_of_extremity =='RP':
all_events_df.to_csv (f"{stationsDir}/GloFAS_Q/GloFASstation_flood_events_RP{probability}yr_leadtime{leadtime}.csv")
elif type_of_extremity =='percentile':
all_events_df.to_csv (f"{stationsDir}/GloFAS_Q/GloFASstation_flood_events_percentile{probability}_leadtime{leadtime}.csv")
return all_events_df

def events_per_adm (DataDir, admPath, adminLevel, station_csv, StationDataDir, all_events_df, model, RP):
@@ -208,7 +256,21 @@


if __name__=="__main__":

station_df = pd.read_csv (cfg.DNHstations, header=0)
#print (station_df.columns)
date_col = 'ValidTime'
value_col = 'percentile_40.0' # the 40th ensemble percentile, i.e. a discharge that 60% of the ensemble members exceed
all_events = []
for leadtime in cfg.leadtimes:
for RP in cfg.RPsyr:
loop_over_stations_obs (cfg.DNHstations, cfg.DataDir, 'RP', RP, value_col='Value')
#loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, RP, 'RP', value_col, leadtime)
for percentile in cfg.percentiles:
loop_over_stations_obs (cfg.DNHstations, cfg.DataDir, 'percentile', percentile, value_col='Value')
#loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, percentile, 'percentile', value_col, leadtime)

# # running this script for the GloFAS
# #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv"))
# event_df = loop_over_stations (cfg.DNHstations, cfg.DataDir, 1)
# event_gdf = events_per_adm (cfg.DataDir, cfg.admPath, cfg.adminLevel, cfg.DNHstations, cfg.stationsDir, event_df, 'Observation', 1)