
Commit

collect over station not admin
ERKuipers committed Dec 18, 2024
1 parent a24f925 commit 942aba4
Showing 6 changed files with 646 additions and 105 deletions.
443 changes: 443 additions & 0 deletions GloFAS/GloFAS_analysis/station_adm_performance_calculator.py

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion GloFAS/GloFAS_prep/vectorCheck.py
@@ -11,7 +11,7 @@ def df_from_txt_or_csv (vector):
if vector.endswith ('.csv'):
df = pd.read_csv(vector)
elif vector.endswith ('.txt'):
df = pd.read_csv (vector, sep="; ", header=0)
df = pd.read_csv (vector, sep=",", header=0)
return df

def checkVectorFormat(vector, shapeType=None, crs='EPSG:4326', placement='real'):
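
A minimal sketch of the behaviour this one-character fix targets: .txt exports are now parsed with a plain comma separator, the same as the .csv branch. The file content and column names below are invented for illustration, not taken from the repository.

    # Hypothetical example; the columns are made up.
    import pandas as pd
    from io import StringIO

    txt = StringIO("StationName,Lat,Lon\nBAMAKO,12.65,-8.0\n")
    df = pd.read_csv(txt, sep=",", header=0)  # mirrors the fixed .txt branch
    print(df["StationName"].tolist())         # ['BAMAKO']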
6 changes: 3 additions & 3 deletions PTM/PTM_prep/ptm_events.py
@@ -1,14 +1,14 @@
- from comparison.observation.HydroImpact import loop_over_stations, events_per_adm
+ from comparison.observation.HydroImpact import loop_over_stations_obs, events_per_adm
import pandas as pd
from datetime import datetime, timedelta
import GloFAS.GloFAS_prep.configuration as cfg
columnID = 'StationName'
# k = [upstream, downstream], v = days of propagation time


- def ptm_events(DNHstations, DataDir, RP, StationCombos):
+ def ptm_events(DNHstations, DataDir, thresholdtype, threshold_value, StationCombos):
# Generate station events data
- station_events_df = loop_over_stations(DNHstations, DataDir, RP)
+ station_events_df = loop_over_stations_obs(DNHstations, DataDir, f'{thresholdtype}',threshold_value,value_col='Value')
pred_ptm_events = []

# Iterate through station combinations
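
The signature change means callers now pick a threshold family plus a value instead of a single return period. A hedged sketch of the new call pattern (the import path and cfg.StationCombos are assumptions, not confirmed by this diff):

    # Hypothetical call sites for the new signature.
    import GloFAS.GloFAS_prep.configuration as cfg
    from PTM.PTM_prep.ptm_events import ptm_events  # import path assumed from the file layout

    # Percentile-based thresholds:
    for pct in cfg.percentiles:
        ptm_events(cfg.DNHstations, cfg.DataDir, 'percentile', pct, cfg.StationCombos)

    # Return-period thresholds go through the same interface, e.g. a 5-year RP:
    ptm_events(cfg.DNHstations, cfg.DataDir, 'RP', 5, cfg.StationCombos)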
199 changes: 150 additions & 49 deletions comparison/collect_for_administrative_unit.py
@@ -2,41 +2,134 @@
import pandas as pd
import numpy as np

- def collect_performance_measures(admin_unit, DataDir, leadtimes, return_periods):
+ def collect_performance_measures_over_station(StationName, DataDir, leadtimes, return_periods, percentiles):
"""
- Collects performance metrics (POD, FAR) for a given administrative unit and organizes them into 2D arrays.
+ Collects performance metrics (POD, FAR) for a given administrative unit and organizes them into arrays
+ for both return periods and percentiles.
Parameters:
- admin_unit (str): Name of the administrative unit (ADM2 field in the CSVs).
- DataDir (str): Base directory containing subdirectories for models (GloFAS, Google Floodhub, PTM).
- - leadtimes (list): List of leadtimes to include (defines columns of the 2D array).
- - return_periods (list): List of return periods to include (defines rows of the 2D array).
+ - leadtimes (list): List of leadtimes to include (defines one axis of the array).
+ - return_periods (list): List of return periods (defines thresholds for one type of analysis).
+ - percentiles (list): List of percentiles (defines thresholds for another type of analysis).
Returns:
- - data (dict): Nested dictionary containing performance metrics in 2D arrays:
- {
- 'leadtimes': [...],
- 'return_periods': [...],
- 'POD': {
- 'GloFAS': {'Observation': np.array([...]), 'impact': np.array([...])},
- 'Google Floodhub': {'Observation': np.array([...]), 'impact': np.array([...])},
- 'PTM': {'Observation': np.array([...]), 'Impact': np.array([...])}
- },
- 'FAR': {
- 'GloFAS': {'Observation': np.array([...]), 'Impact': np.array([...])},
- 'Google Floodhub': {'Observation': np.array([...]), 'Impact': np.array([...])},
- 'PTM': {'Observation': np.array([...]), 'Impact': np.array([...])}
- }
- }
+ - data (dict): Nested dictionary containing performance metrics organized by leadtime and threshold type.
"""
models = ['GloFAS', 'GoogleFloodHub', 'PTM']
data = {
'leadtimes': leadtimes,
- 'return_periods': return_periods,
- 'POD': {model: {'Observation': np.full((len(return_periods), len(leadtimes)), np.nan),
- 'Impact': np.full((len(return_periods), len(leadtimes)), np.nan)} for model in models},
- 'FAR': {model: {'Observation': np.full((len(return_periods), len(leadtimes)), np.nan),
- 'Impact': np.full((len(return_periods), len(leadtimes)), np.nan)} for model in models}
+ 'thresholds': {
+ 'return_periods': return_periods,
+ 'percentiles': percentiles
+ },
+ 'POD': {model: {
+ 'Observation': {
+ 'return_periods': np.full((len(return_periods), len(leadtimes)), np.nan),
+ 'percentiles': np.full((len(percentiles), len(leadtimes)), np.nan)
+ }
+ } for model in models},
+ 'FAR': {model: {
+ 'Observation': {
+ 'return_periods': np.full((len(return_periods), len(leadtimes)), np.nan),
+ 'percentiles': np.full((len(percentiles), len(leadtimes)), np.nan)
+ }
+ } for model in models}
}

+ for model in models:
+ model_dir = os.path.join(DataDir, model.replace(' ', '')) # Adjust directory naming if needed
+ if not os.path.exists(model_dir):
+ print(f"Directory not found for model: {model}, skipping.")
+ continue

+ for comparison_type in ['Observation']:# 'Impact']:
+ comp_dir = os.path.join(model_dir, comparison_type)
+ if not os.path.exists(comp_dir):
+ print(f"Directory not found for comparison type: {comparison_type} in {model}, skipping.")
+ continue

+ # Collect data for return periods
+ for rp_idx, return_period in enumerate(return_periods):
+ for lt_idx, leadtime in enumerate(leadtimes):
+ file_name = f'scores_byCommuneRP{return_period:.1f}_yr_leadtime{leadtime}.csv'
+ file_path = os.path.join(comp_dir, file_name)

+ if os.path.exists(file_path):
+ try:
+ df = pd.read_csv(file_path)
+ row = df[df['StationName'] == StationName]
+ if not row.empty:
+ pod_value = row['pod'].values[0] if 'pod' in row else np.nan
+ far_value = row['far'].values[0] if 'far' in row else np.nan
+ data['POD'][model][comparison_type]['return_periods'][rp_idx, lt_idx] = pod_value
+ data['FAR'][model][comparison_type]['return_periods'][rp_idx, lt_idx] = far_value
+ except Exception as e:
+ print(f"Error reading file {file_path}: {e}, skipping.")

+ # Collect data for percentiles
+ for pct_idx, percentile in enumerate(percentiles):
+ for lt_idx, leadtime in enumerate(leadtimes):#so a bit weird format but ok:
+ file_name = f'scores_byCommuneRP{percentile:.1f}_yr_leadtime{leadtime}.csv'
+ file_path = os.path.join(comp_dir, file_name)

+ if os.path.exists(file_path):
+ try:
+ df = pd.read_csv(file_path)
+ row = df[df['StationName'] == StationName]
+ if not row.empty:
+ pod_value = row['pod'].values[0] if 'pod' in row else np.nan
+ far_value = row['far'].values[0] if 'far' in row else np.nan
+ data['POD'][model][comparison_type]['percentiles'][pct_idx, lt_idx] = pod_value
+ data['FAR'][model][comparison_type]['percentiles'][pct_idx, lt_idx] = far_value
+ except Exception as e:
+ print(f"Error reading file {file_path}: {e}, skipping.")

+ return data

+ def collect_performance_measures_over_admin(admin_unit, DataDir, leadtimes, return_periods, percentiles):
+ """
+ Collects performance metrics (POD, FAR) for a given administrative unit and organizes them into arrays
+ for both return periods and percentiles.
+ Parameters:
+ - admin_unit (str): Name of the administrative unit (ADM2 field in the CSVs).
+ - DataDir (str): Base directory containing subdirectories for models (GloFAS, Google Floodhub, PTM).
+ - leadtimes (list): List of leadtimes to include (defines one axis of the array).
+ - return_periods (list): List of return periods (defines thresholds for one type of analysis).
+ - percentiles (list): List of percentiles (defines thresholds for another type of analysis).
+ Returns:
+ - data (dict): Nested dictionary containing performance metrics organized by leadtime and threshold type.
+ """
+ models = ['GloFAS', 'GoogleFloodHub', 'PTM']
+ data = {
+ 'leadtimes': leadtimes,
+ 'thresholds': {
+ 'return_periods': return_periods,
+ 'percentiles': percentiles
+ },
+ 'POD': {model: {
+ 'Observation': {
+ 'return_periods': np.full((len(return_periods), len(leadtimes)), np.nan),
+ 'percentiles': np.full((len(percentiles), len(leadtimes)), np.nan)
+ },
+ 'Impact': {
+ 'return_periods': np.full((len(return_periods), len(leadtimes)), np.nan),
+ 'percentiles': np.full((len(percentiles), len(leadtimes)), np.nan)
+ }
+ } for model in models},
+ 'FAR': {model: {
+ 'Observation': {
+ 'return_periods': np.full((len(return_periods), len(leadtimes)), np.nan),
+ 'percentiles': np.full((len(percentiles), len(leadtimes)), np.nan)
+ },
+ 'Impact': {
+ 'return_periods': np.full((len(return_periods), len(leadtimes)), np.nan),
+ 'percentiles': np.full((len(percentiles), len(leadtimes)), np.nan)
+ }
+ } for model in models}
+ }

for model in models:
@@ -51,38 +144,46 @@ def collect_performance_measures(admin_unit, DataDir, leadtimes, return_periods)
print(f"Directory not found for comparison type: {comparison_type} in {model}, skipping.")
continue

- for i, return_period in enumerate(return_periods): # Row index for return periods
- for j, leadtime in enumerate(leadtimes): # Column index for lead times
- # Build the filename
+ # Collect data for return periods
+ for rp_idx, return_period in enumerate(return_periods):
+ for lt_idx, leadtime in enumerate(leadtimes):
file_name = f'scores_byCommuneRP{return_period}_yr_leadtime{leadtime}.csv'
file_path = os.path.join(comp_dir, file_name)

- if not os.path.exists(file_path):
- print(f"File not found: {file_path}, skipping.")
- continue

- # Read the CSV file
- try:
- df = pd.read_csv(file_path)
- except Exception as e:
- print(f"Error reading file {file_path}: {e}, skipping.")
- continue
+ if os.path.exists(file_path):
+ try:
+ df = pd.read_csv(file_path)
+ row = df[df['ADM2'] == admin_unit]
+ if not row.empty:
+ pod_value = row['pod'].values[0] if 'pod' in row else np.nan
+ far_value = row['far'].values[0] if 'far' in row else np.nan
+ data['POD'][model][comparison_type]['return_periods'][rp_idx, lt_idx] = pod_value
+ data['FAR'][model][comparison_type]['return_periods'][rp_idx, lt_idx] = far_value
+ except Exception as e:
+ print(f"Error reading file {file_path}: {e}, skipping.")

+ # Collect data for percentiles
+ for pct_idx, percentile in enumerate(percentiles):
+ for lt_idx, leadtime in enumerate(leadtimes):#so a bit weird format but ok
+ file_name = f'scores_byCommuneRP{percentile}_yr_leadtime{leadtime}.csv'
+ file_path = os.path.join(comp_dir, file_name)

- # Filter rows for the specified admin unit
- row = df[df['ADM2'] == admin_unit]
- if row.empty:
- print(f"Admin unit {admin_unit} not found in file: {file_path}, skipping.")
- else:
- # Extract POD and FAR values
- pod_value = row['pod'].values[0] if 'pod' in row else np.nan
- far_value = row['far'].values[0] if 'far' in row else np.nan

- data['POD'][model][comparison_type][i, j] = pod_value
- data['FAR'][model][comparison_type][i, j] = far_value
+ if os.path.exists(file_path):
+ try:
+ df = pd.read_csv(file_path)
+ row = df[df['ADM2'] == admin_unit]
+ if not row.empty:
+ pod_value = row['pod'].values[0] if 'pod' in row else np.nan
+ far_value = row['far'].values[0] if 'far' in row else np.nan
+ data['POD'][model][comparison_type]['percentiles'][pct_idx, lt_idx] = pod_value
+ data['FAR'][model][comparison_type]['percentiles'][pct_idx, lt_idx] = far_value
+ except Exception as e:
+ print(f"Error reading file {file_path}: {e}, skipping.")

return data



if __name__ =='__main__':
# These regions include Bamako, Koulikoro, Ségou, Mopti, Timbouctou and Gao, which have historically experienced frequent and severe flood events.
# Regions such as Mopti and Ségou are of particular concern due to their high exposure to flooding as well as their dense populations,
@@ -91,5 +192,5 @@ def collect_performance_measures(admin_unit, DataDir, leadtimes, return_periods)
# Kolondieba in Sikasso, okay score in obs, no impact
# San in segou: impact data

- GloFASdata = collect_performance_measures('BLA', cfg.DataDir, cfg.leadtimes, cfg.RPsyr)
+ GloFASdata = collect_performance_measures_over_station('BAMAKO', cfg.DataDir, cfg.leadtimes, cfg.RPsyr, cfg.percentiles)
print (GloFASdata)
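
For orientation, the returned structure is now metric -> model -> comparison type -> threshold family, with a (thresholds x leadtimes) array at each leaf. A small sketch of reading one POD curve back out, mirroring the __main__ block above:

    import numpy as np
    import GloFAS.GloFAS_prep.configuration as cfg

    data = collect_performance_measures_over_station('BAMAKO', cfg.DataDir, cfg.leadtimes, cfg.RPsyr, cfg.percentiles)

    # POD of GloFAS against observations, first return period, across leadtimes.
    pod = data['POD']['GloFAS']['Observation']['return_periods']  # shape (n_return_periods, n_leadtimes)
    for lt, value in zip(data['leadtimes'], pod[0, :]):
        if not np.isnan(value):
            print(f"leadtime {lt}: POD = {value:.2f}")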
14 changes: 5 additions & 9 deletions comparison/observation/HydroImpact.py
@@ -21,7 +21,7 @@ def parse_date_with_fallback(date_str, year):
return None


- def transform_hydro(csvPath, startYear, endYear):
+ def transform_hydro(csvPath, startYear=2004, endYear=2018):
hydro_df_wide = pd.read_csv(csvPath, header=0)

# Melt the wide-format DataFrame to long format
@@ -57,7 +57,7 @@ def create_datetime(row):
# Filter data for the given start and end year
hydro_df_long = hydro_df_long[(hydro_df_long.index.year >= startYear) & (hydro_df_long.index.year <= endYear)]
else:
print(f"Error: Column 'Date' does not exist in the DataFrame, hopefully the date is the index")
print(f"Warning: Column 'Date' does not exist in the DataFrame, hopefully the date is the index")

# Display the reshaped DataFrame
return hydro_df_long
@@ -100,15 +100,13 @@ def stampHydroTrigger(hydro_df, StationName, type_of_extremity, probability, val
if type_of_extremity == 'RP':
Q_prob= Q_Gumbel_fit_RP (hydro_df, probability, value_col=value_col) # results in infinite
elif type_of_extremity == 'percentile': # assuming above 20 is percentile, RP is percentile instead
- Q_prob = Q_Gumbel_fit_percentile (hydro_df, percentile, value_col)
+ Q_prob = Q_Gumbel_fit_percentile (hydro_df, probability, value_col)
else:
print ('no such type of extremity')
#print (f"for {StationName} : return period Q= {QRP}")
if not isinstance(Q_prob, (int, float)):
raise ValueError(f"Expected QRP to be a scalar (int or float), but got {type(Q_prob)}.")
# Calculate the QRP threshold
-
-
# Copy the DataFrame and add the 'trigger' column
hydro_trigger_df = hydro_df.copy()
hydro_trigger_df['Trigger'] = (hydro_trigger_df[value_col] > Q_prob).astype(int)
Expand All @@ -125,8 +123,6 @@ def createEvent(trigger_df, date_col='Date'):
except:
raise ValueError (f'{date_col} not in trigger_df index and no set index either')

-
-
# Prepare commune info with geometry for event creation
event_data = []

@@ -207,7 +203,7 @@ def loop_over_stations_obs(station_csv, DataDir, type_of_extremity, probability,
print (f'calculating {StationName}, in {BasinName}')
hydro_df = transform_hydro (stationPath) # this actually looks good
except:
- # print (f'no discharge measures found for station {StationName} in {BasinName}')
+ print (f'no discharge measures found for station {StationName} in {BasinName}')
continue

trigger_df = stampHydroTrigger (hydro_df, StationName, type_of_extremity, probability, value_col)
@@ -289,7 +285,7 @@ def events_per_adm (DataDir, admPath, adminLevel, station_csv, StationDataDir, a
#loop_over_stations_pred(cfg.DNHstations, cfg.stationsDir, RP, 'RP', value_col, leadtime)
for percentile in cfg.percentiles:
loop_over_stations_obs (cfg.DNHstations, cfg.DataDir, 'percentile', percentile, value_col='Value')
- #loop_over_stations_pred (cfg.DNHstations, cfg.stationsDir, percentile, 'percentile', value_col, leadtime)
+ loop_over_stations_pred (cfg.DNHstations, cfg.stationsDir, percentile, 'percentile', value_col, leadtime)

# # running this script for the GloFAS
# #print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv"))
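
The trigger logic in stampHydroTrigger compares discharge to a Gumbel-derived threshold via Q_Gumbel_fit_RP and Q_Gumbel_fit_percentile, which live elsewhere in the repository. A self-contained sketch of the general technique (an illustration under stated assumptions, not the repository's implementation):

    # Illustrative only: fit a Gumbel (EV1) distribution to annual maximum
    # discharge and convert a return period into a discharge threshold.
    import pandas as pd
    from scipy.stats import gumbel_r

    def gumbel_threshold_for_rp(hydro_df, return_period, value_col='Value'):
        # transform_hydro appears to return a datetime-indexed frame, which resample needs.
        annual_max = hydro_df[value_col].resample('YE').max().dropna()
        loc, scale = gumbel_r.fit(annual_max)
        non_exceedance = 1.0 - 1.0 / return_period  # P(Q <= q) for the given return period
        return gumbel_r.ppf(non_exceedance, loc=loc, scale=scale)

    # The 'Trigger' column would then be (hydro_df[value_col] > threshold).astype(int).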
