
Commit

collect over station not admin
ERKuipers committed Dec 18, 2024
1 parent a24f925 commit 942aba4
Showing 6 changed files with 646 additions and 105 deletions.
443 changes: 443 additions & 0 deletions GloFAS/GloFAS_analysis/station_adm_performance_calculator.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion GloFAS/GloFAS_prep/vectorCheck.py
@@ -11,7 +11,7 @@ def df_from_txt_or_csv (vector):
if vector.endswith ('.csv'):
df = pd.read_csv(vector)
elif vector.endswith ('.txt'):
df = pd.read_csv (vector, sep="; ", header=0)
df = pd.read_csv (vector, sep=",", header=0)
return df

def checkVectorFormat(vector, shapeType=None, crs='EPSG:4326', placement='real'):
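
A minimal sketch of the behaviour this one-character fix targets: .txt exports are now parsed with a plain comma separator, the same as the .csv branch. The file content and column names below are invented for illustration, not taken from the repository.

    # Hypothetical example; the columns are made up.
    import pandas as pd
    from io import StringIO

    txt = StringIO("StationName,Lat,Lon\nBAMAKO,12.65,-8.0\n")
    df = pd.read_csv(txt, sep=",", header=0)  # mirrors the fixed .txt branch
    print(df["StationName"].tolist())         # ['BAMAKO']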
6 changes: 3 additions & 3 deletions PTM/PTM_prep/ptm_events.py
@@ -1,14 +1,14 @@
- from comparison.observation.HydroImpact import loop_over_stations, events_per_adm
+ from comparison.observation.HydroImpact import loop_over_stations_obs, events_per_adm
import pandas as pd
from datetime import datetime, timedelta
import GloFAS.GloFAS_prep.configuration as cfg
columnID = 'StationName'
# k = [upstream, downstream], v = days of propagation time


- def ptm_events(DNHstations, DataDir, RP, StationCombos):
+ def ptm_events(DNHstations, DataDir, thresholdtype, threshold_value, StationCombos):
# Generate station events data
- station_events_df = loop_over_stations(DNHstations, DataDir, RP)
+ station_events_df = loop_over_stations_obs(DNHstations, DataDir, f'{thresholdtype}',threshold_value,value_col='Value')
pred_ptm_events = []

# Iterate through station combinations
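
The signature change means callers now pick a threshold family plus a value instead of a single return period. A hedged sketch of the new call pattern (the import path and cfg.StationCombos are assumptions, not confirmed by this diff):

    # Hypothetical call sites for the new signature.
    import GloFAS.GloFAS_prep.configuration as cfg
    from PTM.PTM_prep.ptm_events import ptm_events  # import path assumed from the file layout

    # Percentile-based thresholds:
    for pct in cfg.percentiles:
        ptm_events(cfg.DNHstations, cfg.DataDir, 'percentile', pct, cfg.StationCombos)

    # Return-period thresholds go through the same interface, e.g. a 5-year RP:
    ptm_events(cfg.DNHstations, cfg.DataDir, 'RP', 5, cfg.StationCombos)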
199 changes: 150 additions & 49 deletions comparison/collect_for_administrative_unit.py
@@ -2,41 +2,134 @@
import pandas as pd
import numpy as np

- def collect_performance_measures(admin_unit, DataDir, leadtimes, return_periods):
+ def collect_performance_measures_over_station(StationName, DataDir, leadtimes, return_periods, percentiles):
"""
- Collects performance metrics (POD, FAR) for a given administrative unit and organizes them into 2D arrays.
+ Collects performance metrics (POD, FAR) for a given administrative unit and organizes them into arrays
+ for both return periods and percentiles.
Parameters:
- admin_unit (str): Name of the administrative unit (ADM2 field in the CSVs).
- DataDir (str): Base directory containing subdirectories for models (GloFAS, Google Floodhub, PTM).
- - leadtimes (list): List of leadtimes to include (defines columns of the 2D array).
- - return_periods (list): List of return periods to include (defines rows of the 2D array).
+ - leadtimes (list): List of leadtimes to include (defines one axis of the array).
+ - return_periods (list): List of return periods (defines thresholds for one type of analysis).
+ - percentiles (list): List of percentiles (defines thresholds for another type of analysis).
Returns:
- - data (dict): Nested dictionary containing performance metrics in 2D arrays:
- {
- 'leadtimes': [...],
- 'return_periods': [...],
- 'POD': {
- 'GloFAS': {'Observation': np.array([...]), 'impact': np.array([...])},
- 'Google Floodhub': {'Observation': np.array([...]), 'impact': np.array([...])},
- 'PTM': {'Observation': np.array([...]), 'Impact': np.array([...])}
- },
- 'FAR': {
- 'GloFAS': {'Observation': np.array([...]), 'Impact': np.array([...])},
- 'Google Floodhub': {'Observation': np.array([...]), 'Impact': np.array([...])},
- 'PTM': {'Observation': np.array([...]), 'Impact': np.array([...])}
- }
- }
+ - data (dict): Nested dictionary containing performance metrics organized by leadtime and threshold type.
"""
models = ['GloFAS', 'GoogleFloodHub', 'PTM']
data = {
'leadtimes': leadtimes,
- 'return_periods': return_periods,
- 'POD': {model: {'Observation': np.full((len(return_periods), len(leadtimes)), np.nan),
- 'Impact': np.full((len(return_periods), len(leadtimes)), np.nan)} for model in models},
- 'FAR': {model: {'Observation': np.full((len(return_periods), len(leadtimes)), np.nan),
- 'Impact': np.full((len(return_periods), len(leadtimes)), np.nan)} for model in models}
+ 'thresholds': {
+ 'return_periods': return_periods,
+ 'percentiles': percentiles
+ },
+ 'POD': {model: {
+ 'Observation': {
+ 'return_periods': np.full((len(return_periods), len(leadtimes)), np.nan),
+ 'percentiles': np.full((len(percentiles), len(leadtimes)), np.nan)
+ }
+ } for model in models},
+ 'FAR': {model: {
+ 'Observation': {
+ 'return_periods': np.full((len(return_periods), len(leadtimes)), np.nan),
+ 'percentiles': np.full((len(percentiles), len(leadtimes)), np.nan)
+ }
+ } for model in models}
}

+ for model in models:
+ model_dir = os.path.join(DataDir, model.replace(' ', '')) # Adjust directory naming if needed
+ if not os.path.exists(model_dir):
+ print(f"Directory not found for model: {model}, skipping.")
+ continue

+ for comparison_type in ['Observation']:# 'Impact']:
+ comp_dir = os.path.join(model_dir, comparison_type)
+ if not os.path.exists(comp_dir):
+ print(f"Directory not found for comparison type: {comparison_type} in {model}, skipping.")
+ continue

+ # Collect data for return periods
+ for rp_idx, return_period in enumerate(return_periods):
+ for lt_idx, leadtime in enumerate(leadtimes):
+ file_name = f'scores_byCommuneRP{return_period:.1f}_yr_leadtime{leadtime}.csv'
+ file_path = os.path.join(comp_dir, file_name)

+ if os.path.exists(file_path):
+ try:
+ df = pd.read_csv(file_path)
+ row = df[df['StationName'] == StationName]
+ if not row.empty:
+ pod_value = row['pod'].values[0] if 'pod' in row else np.nan
+ far_value = row['far'].values[0] if 'far' in row else np.nan
+ data['POD'][model][comparison_type]['return_periods'][rp_idx, lt_idx] = pod_value
+ data['FAR'][model][comparison_type]['return_periods'][rp_idx, lt_idx] = far_value
+ except Exception as e:
+ print(f"Error reading file {file_path}: {e}, skipping.")

+ # Collect data for percentiles
+ for pct_idx, percentile in enumerate(percentiles):
+ for lt_idx, leadtime in enumerate(leadtimes):#so a bit weird format but ok:
+ file_name = f'scores_byCommuneRP{percentile:.1f}_yr_leadtime{leadtime}.csv'
+ file_path = os.path.join(comp_dir, file_name)

+ if os.path.exists(file_path):
+ try:
+ df = pd.read_csv(file_path)
+ row = df[df['StationName'] == StationName]
+ if not row.empty:
+ pod_value = row['pod'].values[0] if 'pod' in row else np.nan
+ far_value = row['far'].values[0] if 'far' in row else np.nan
+ data['POD'][model][comparison_type]['percentiles'][pct_idx, lt_idx] = pod_value
+ data['FAR'][model][comparison_type]['percentiles'][pct_idx, lt_idx] = far_value
+ except Exception as e:
+ print(f"Error reading file {file_path}: {e}, skipping.")

+ return data

+ def collect_performance_measures_over_admin(admin_unit, DataDir, leadtimes, return_periods, percentiles):
+ """
+ Collects performance metrics (POD, FAR) for a given administrative unit and organizes them into arrays
+ for both return periods and percentiles.
+ Parameters:
+ - admin_unit (str): Name of the administrative unit (ADM2 field in the CSVs).
+ - DataDir (str): Base directory containing subdirectories for models (GloFAS, Google Floodhub, PTM).
+ - leadtimes (list): List of leadtimes to include (defines one axis of the array).
+ - return_periods (list): List of return periods (defines thresholds for one type of analysis).
+ - percentiles (list): List of percentiles (defines thresholds for another type of analysis).
+ Returns:
+ - data (dict): Nested dictionary containing performance metrics organized by leadtime and threshold type.
+ """
+ models = ['GloFAS', 'GoogleFloodHub', 'PTM']
+ data = {
+ 'leadtimes': leadtimes,
+ 'thresholds': {
+ 'return_periods': return_periods,
+ 'percentiles': percentiles
+ },
+ 'POD': {model: {
+ 'Observation': {
+ 'return_periods': np.full((len(return_periods), len(leadtimes)), np.nan),
+ 'percentiles': np.full((len(percentiles), len(leadtimes)), np.nan)
+ },
+ 'Impact': {
+ 'return_periods': np.full((len(return_periods), len(leadtimes)), np.nan),
+ 'percentiles': np.full((len(percentiles), len(leadtimes)), np.nan)
+ }
+ } for model in models},
+ 'FAR': {model: {
+ 'Observation': {
+ 'return_periods': np.full((len(return_periods), len(leadtimes)), np.nan),
+ 'percentiles': np.full((len(percentiles), len(leadtimes)), np.nan)
+ },
+ 'Impact': {
+ 'return_periods': np.full((len(return_periods), len(leadtimes)), np.nan),
+ 'percentiles': np.full((len(percentiles), len(leadtimes)), np.nan)
+ }
+ } for model in models}
+ }

for model in models:
@@ -51,38 +144,46 @@ def collect_performance_measures(admin_unit, DataDir, leadtimes, return_periods)
print(f"Directory not found for comparison type: {comparison_type} in {model}, skipping.")
continue

- for i, return_period in enumerate(return_periods): # Row index for return periods
- for j, leadtime in enumerate(leadtimes): # Column index for lead times
- # Build the filename
+ # Collect data for return periods
+ for rp_idx, return_period in enumerate(return_periods):
+ for lt_idx, leadtime in enumerate(leadtimes):
file_name = f'scores_byCommuneRP{return_period}_yr_leadtime{leadtime}.csv'
file_path = os.path.join(comp_dir, file_name)

- if not os.path.exists(file_path):
- print(f"File not found: {file_path}, skipping.")
- continue

- # Read the CSV file
- try:
- df = pd.read_csv(file_path)
- except Exception as e:
- print(f"Error reading file {file_path}: {e}, skipping.")
- continue
+ if os.path.exists(file_path):
+ try:
+ df = pd.read_csv(file_path)
+ row = df[df['ADM2'] == admin_unit]
+ if not row.empty:
+ pod_value = row['pod'].values[0] if 'pod' in row else np.nan
+ far_value = row['far'].values[0] if 'far' in row else np.nan
+ data['POD'][model][comparison_type]['return_periods'][rp_idx, lt_idx] = pod_value
+ data['FAR'][model][comparison_type]['return_periods'][rp_idx, lt_idx] = far_value
+ except Exception as e:
+ print(f"Error reading file {file_path}: {e}, skipping.")

+ # Collect data for percentiles
+ for pct_idx, percentile in enumerate(percentiles):
+ for lt_idx, leadtime in enumerate(leadtimes):#so a bit weird format but ok
+ file_name = f'scores_byCommuneRP{percentile}_yr_leadtime{leadtime}.csv'
+ file_path = os.path.join(comp_dir, file_name)

- # Filter rows for the specified admin unit
- row = df[df['ADM2'] == admin_unit]
- if row.empty:
- print(f"Admin unit {admin_unit} not found in file: {file_path}, skipping.")
- else:
- # Extract POD and FAR values
- pod_value = row['pod'].values[0] if 'pod' in row else np.nan
- far_value = row['far'].values[0] if 'far' in row else np.nan

- data['POD'][model][comparison_type][i, j] = pod_value
- data['FAR'][model][comparison_type][i, j] = far_value
+ if os.path.exists(file_path):
+ try:
+ df = pd.read_csv(file_path)
+ row = df[df['ADM2'] == admin_unit]
+ if not row.empty:
+ pod_value = row['pod'].values[0] if 'pod' in row else np.nan
+ far_value = row['far'].values[0] if 'far' in row else np.nan
+ data['POD'][model][comparison_type]['percentiles'][pct_idx, lt_idx] = pod_value
+ data['FAR'][model][comparison_type]['percentiles'][pct_idx, lt_idx] = far_value
+ except Exception as e:
+ print(f"Error reading file {file_path}: {e}, skipping.")

return data



if __name__ =='__main__':
# These regions include Bamako, Koulikoro, Ségou, Mopti, Timbouctou and Gao, which have historically experienced frequent and severe flood events.
# Regions such as Mopti and Ségou are of particular concern due to their high exposure to flooding as well as their dense populations,
@@ -91,5 +192,5 @@ def collect_performance_measures(admin_unit, DataDir, leadtimes, return_periods)
# Kolondieba in Sikasso, okay score in obs, no impact
# San in segou: impact data

- GloFASdata = collect_performance_measures('BLA', cfg.DataDir, cfg.leadtimes, cfg.RPsyr)
+ GloFASdata = collect_performance_measures_over_station('BAMAKO', cfg.DataDir, cfg.leadtimes, cfg.RPsyr, cfg.percentiles)
print (GloFASdata)
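
For orientation, the returned structure is now metric -> model -> comparison type -> threshold family, with a (thresholds x leadtimes) array at each leaf. A small sketch of reading one POD curve back out, mirroring the __main__ block above:

    import numpy as np
    import GloFAS.GloFAS_prep.configuration as cfg

    data = collect_performance_measures_over_station('BAMAKO', cfg.DataDir, cfg.leadtimes, cfg.RPsyr, cfg.percentiles)

    # POD of GloFAS against observations, first return period, across leadtimes.
    pod = data['POD']['GloFAS']['Observation']['return_periods']  # shape (n_return_periods, n_leadtimes)
    for lt, value in zip(data['leadtimes'], pod[0, :]):
        if not np.isnan(value):
            print(f"leadtime {lt}: POD = {value:.2f}")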
14 changes: 5 additions & 9 deletions comparison/observation/HydroImpact.py
@@ -21,7 +21,7 @@ def parse_date_with_fallback(date_str, year):
return None


- def transform_hydro(csvPath, startYear, endYear):
+ def transform_hydro(csvPath, startYear=2004, endYear=2018):
hydro_df_wide = pd.read_csv(csvPath, header=0)

# Melt the wide-format DataFrame to long format
@@ -57,7 +57,7 @@ def create_datetime(row):
# Filter data for the given start and end year
hydro_df_long = hydro_df_long[(hydro_df_long.index.year >= startYear) & (hydro_df_long.index.year <= endYear)]
else:
print(f"Error: Column 'Date' does not exist in the DataFrame, hopefully the date is the index")
print(f"Warning: Column 'Date' does not exist in the DataFrame, hopefully the date is the index")

# Display the reshaped DataFrame
return hydro_df_long
@@ -100,15 +100,13 @@ def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, val
if type_of_extremity == 'RP':
Q_prob= Q_Gumbel_fit_RP (hydro_df, probability, value_col=value_col) # results in infinite
elif type_of_extremity == 'percentile': # assuming above 20 is percentile, RP is percentile instead
- Q_prob = Q_Gumbel_fit_percentile (hydro_df, percentile, value_col)
+ Q_prob = Q_Gumbel_fit_percentile (hydro_df, probability, value_col)
else:
print ('no such type of extremity')
#print (f"for {StationName} : return period Q= {QRP}")
if not isinstance(Q_prob, (int, float)):
raise ValueError(f"Expected QRP to be a scalar (int or float), but got {type(Q_prob)}.")
# Calculate the QRP threshold
-
-
# Copy the DataFrame and add the 'trigger' column
hydro_trigger_df = hydro_df.copy()
hydro_trigger_df['Trigger'] = (hydro_trigger_df[value_col] > Q_prob).astype(int)
Expand All @@ -125,8 +123,6 @@ def createEvent(trigger_df, date_col='Date'):
except:
raise ValueError (f'{date_col} not in trigger_df index and no set index either')

-
-
# Prepare commune info with geometry for event creation
event_data = []

@@ -207,7 +203,7 @@ def loop_over_stations_obs(station_csv, DataDir, type_of_extremity, probability,
print (f'calculating {StationName}, in {BasinName}')
hydro_df = transform_hydro (stationPath) # this actually looks good
except:
- # print (f'no discharge measures found for station {StationName} in {BasinName}')
+ print (f'no discharge measures found for station {StationName} in {BasinName}')
continue

trigger_df = stampHydroTrigger (hydro_df, StationName, type_of_extremity, probability, value_col)
@@ -289,7 +285,7 @@ def events_per_adm (DataDir, admPath, adminLevel, station_csv, StationDataDir, a
#loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, RP, 'RP', value_col, leadtime)
for percentile in cfg.percentiles:
loop_over_stations_obs (cfg.DNHstations, cfg.DataDir, 'percentile', percentile, value_col='Value')
- #loop_over_stations_pred (cfg.DNHstations, cfg.stationsDir, percentile, 'percentile', value_col, leadtime)
+ loop_over_stations_pred (cfg.DNHstations, cfg.stationsDir, percentile, 'percentile', value_col, leadtime)

# # running this script for the GloFAS
# #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv"))
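
The trigger logic in stampHydroTrigger compares discharge to a Gumbel-derived threshold via Q_Gumbel_fit_RP and Q_Gumbel_fit_percentile, which live elsewhere in the repository. A self-contained sketch of the general technique (an illustration under stated assumptions, not the repository's implementation):

    # Illustrative only: fit a Gumbel (EV1) distribution to annual maximum
    # discharge and convert a return period into a discharge threshold.
    import pandas as pd
    from scipy.stats import gumbel_r

    def gumbel_threshold_for_rp(hydro_df, return_period, value_col='Value'):
        # transform_hydro appears to return a datetime-indexed frame, which resample needs.
        annual_max = hydro_df[value_col].resample('YE').max().dropna()
        loc, scale = gumbel_r.fit(annual_max)
        non_exceedance = 1.0 - 1.0 / return_period  # P(Q <= q) for the given return period
        return gumbel_r.ppf(non_exceedance, loc=loc, scale=scale)

    # The 'Trigger' column would then be (hydro_df[value_col] > threshold).astype(int).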
