
Commit

valentijn7 committed Dec 4, 2024
2 parents d15bbb0 + 4bda33d commit 25903d3
Showing 4 changed files with 472 additions and 75 deletions.
194 changes: 194 additions & 0 deletions GloFAS/GloFAS_prep/thresholds.py
@@ -0,0 +1,194 @@
"""
Copyright 2019-2023 European Union
Licensed under the EUPL, Version 1.2 or as soon they will be approved by the European Commission subsequent versions of the EUPL (the "Licence");
You may not use this work except in compliance with the Licence.
You may obtain a copy of the Licence at:
https://joinup.ec.europa.eu/sites/default/files/inline-files/EUPL%20v1_2%20EN(1).txt
Unless required by applicable law or agreed to in writing, software distributed under the Licence is distributed on an "AS IS" basis,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the Licence for the specific language governing permissions and limitations under the Licence.
"""

import argparse
import os
import sys

import numpy as np
import xarray as xr


def lmoments(values: np.ndarray) -> np.ndarray:
"""
Compute first 2 L-moments of dataset (on first axis)
:param values: N-D array of values
:return: an estimate of the first two sample L-moments
"""

nmoments = 3

# we need to have at least four values in order
# to make a sample L-moments estimation
nvalues = values.shape[0]
if nvalues < 4:
raise ValueError(
"Insufficient number of values to perform sample L-moments estimation"
)

# sort the values into ascending order
values = np.sort(values, axis=0)

sums = np.zeros((nmoments, *(values.shape[1:])))

for i in range(1, nvalues + 1):
z = i
term = values[i - 1]
sums[0] = sums[0] + term
for j in range(1, nmoments):
z -= 1
term = term * z
sums[j] = sums[j] + term

y = float(nvalues)
z = float(nvalues)
sums[0] = sums[0] / z
for j in range(1, nmoments):
y = y - 1.0
z = z * y
sums[j] = sums[j] / z

k = nmoments
p0 = -1.0
for _ in range(2):
ak = float(k)
p0 = -p0
p = p0
temp = p * sums[0]
for i in range(1, k):
ai = i
p = -p * (ak + ai - 1.0) * (ak - ai) / (ai * ai)
temp = temp + (p * sums[i])
sums[k - 1] = temp
k = k - 1

lmoments = np.zeros((2, *(values.shape[1:])))
lmoments[0] = sums[0]
lmoments[1] = sums[1]

return lmoments


def gumbel_parameters_moments(dis):
"""Estimate Gumbel scale (sigma) and location (mu) with the method of moments."""
sigma = np.sqrt(6) * np.nanstd(dis, ddof=1, axis=0) / np.pi
mu = np.nanmean(dis, axis=0) - 0.5772 * sigma
return sigma, mu


def gumbel_parameters_lmoments(dis):
"""Estimate Gumbel scale (sigma) and location (mu) from the first two sample L-moments."""
lambda_coef = lmoments(dis)
sigma = lambda_coef[1] / np.log(2)
mu = lambda_coef[0] - sigma * 0.5772
return sigma, mu


def gumbel_function(y, sigma, mu):
"""Gumbel quantile for return period y: mu - sigma * ln(ln(y / (y - 1)))."""
return mu - sigma * np.log(np.log(y / (y - 1)))


def find_main_var(ds, path):
variable_names = [k for k in ds.variables if len(ds.variables[k].dims) == 3]
if len(variable_names) > 1:
raise Exception("More than one variable in dataset {}".format(path))
elif len(variable_names) == 0:
raise Exception("Could not find a valid variable in dataset {}".format(path))
else:
var_name = variable_names[0]
return var_name


def read_discharge(in_files):
ds = xr.open_dataset(in_files)
var = find_main_var(ds, in_files)
da = ds[var]
return da


def unmask_array(mask, template, data):
data_unmask = np.empty_like(template)
data_unmask[...] = np.NaN
data_unmask[mask] = data
return data_unmask


def create_dataset(dis_max, return_periods, mask, thresholds, sigma, mu):
print("Creating dataset")
ds_rp = xr.Dataset(
coords={"lat": dis_max.coords["lat"], "lon": dis_max.coords["lon"]}
)
for i, rp in enumerate(return_periods):
thres = unmask_array(mask, dis_max.isel(time=0).values, thresholds[i])
ds_rp[f"rl_{rp}"] = (["lat", "lon"], thres)

s = unmask_array(mask, dis_max.isel(time=0).values, sigma)
print(s.shape)
ds_rp["sigma"] = (["lat", "lon"], s)
m = unmask_array(mask, dis_max.isel(time=0).values, mu)
print(m.shape)
ds_rp["mu"] = (["lat", "lon"], m)

print(ds_rp)

return ds_rp


def compute_thresholds_gumbel(dis_max, return_periods):
mask = np.isfinite(dis_max.isel(time=0).values)
dis_max_masked = dis_max.values[:, mask]

print("Computing Gumbel coefficients")
sigma, mu = gumbel_parameters_lmoments(dis_max_masked)

print("Computing return periods")
thresholds = []
for y in return_periods:
dis = gumbel_function(y, sigma, mu)
thresholds.append(dis)

ds_rp = create_dataset(dis_max, return_periods, mask, thresholds, sigma, mu)

return ds_rp


def main(argv=sys.argv):
prog = os.path.basename(argv[0])
parser = argparse.ArgumentParser(
description="""
Utility to compute the discharge return period thresholds
using the method of L-moments.
Thresholds computed: [1.5, 2, 5, 10, 20, 50, 100, 200, 500]
""",
prog=prog,
)
parser.add_argument(
"-d", "--discharge", help="Input discharge files (annual maxima)"
)
parser.add_argument("-o", "--output", help="Output thresholds file")

args = parser.parse_args()

dis = read_discharge(args.discharge)
print(dis)

return_periods = np.array([1.5, 2, 5, 10, 20, 50, 100, 200, 500])

thresholds = compute_thresholds_gumbel(dis, return_periods)

thresholds.to_netcdf(args.output)


def main_script():
sys.exit(main())


if __name__ == "__main__":
main_script()
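
Note on the method: the thresholds above come from the Gumbel quantile Q_T = mu - sigma * ln(ln(T / (T - 1))), with sigma and mu estimated either from ordinary moments or from the first two sample L-moments. The sketch below is illustrative only; it uses a small synthetic array of annual maxima (not real GloFAS discharge) and the moments-based estimator to show how the return-period thresholds are produced.

import numpy as np

# Synthetic annual-maximum discharge series (illustrative values in m3/s, not GloFAS data)
annual_max = np.array([850.0, 920.0, 780.0, 1010.0, 890.0, 1150.0, 970.0, 840.0, 1090.0, 930.0])

# Gumbel parameters via the method of moments (same formulas as gumbel_parameters_moments above)
sigma = np.sqrt(6) * np.nanstd(annual_max, ddof=1) / np.pi
mu = np.nanmean(annual_max) - 0.5772 * sigma

# Threshold for each return period T: Q_T = mu - sigma * ln(ln(T / (T - 1)))
for T in [1.5, 2, 5, 10, 20, 50, 100, 200, 500]:
    q_t = mu - sigma * np.log(np.log(T / (T - 1)))
    print(f"RP {T:>5} yr -> threshold {q_t:8.1f} m3/s")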
59 changes: 36 additions & 23 deletions PTM/PTM_analysis/performance_calculator.py
@@ -81,7 +81,6 @@ def openObservation_gdf(self):
# Check the first few rows
# Filter rows between self.startYear and self.endYear (e.g. 2004 to 2022)
df_filtered = df[(df['End Date'].dt.year >= self.startYear) & (df['End Date'].dt.year < self.endYear)]
print (df_filtered.head)
# Remove non-string entries from ADM columns
df_filtered = df_filtered[df_filtered[f'ADM{self.adminLevel}'].apply(lambda x: isinstance(x, str))]

@@ -300,17 +299,31 @@ def calc_performance_scores(self, obs, pred):
false_alarms += 1 # there is no use in calculating correct negatives as there are many
print (f'hits: {hits}, misses: {misses}, false alarms: {false_alarms}, total {comparisonType} = {sum(obs==1)}, should be equal to {misses}+{hits}={misses+hits}')
# Calculate metrics
output = {
'pod': hits / (hits + misses) if hits + misses != 0 else np.nan, # Probability of Detection
'far': false_alarms / (hits + false_alarms) if hits + false_alarms != 0 else np.nan, # False Alarm Ratio
#'pofd': false_alarms / (false_alarms + correct_negatives) if false_alarms + correct_negatives != 0 else np.nan, # Probability of False Detection
'csi': hits / (hits + false_alarms + misses) if hits + false_alarms + misses != 0 else np.nan, # Critical Success Index
# 'accuracy': (hits + correct_negatives) / (hits + correct_negatives + false_alarms + misses) if hits + correct_negatives + false_alarms + misses != 0 else np.nan, # Accuracy
'precision': hits / (hits + false_alarms) if hits + false_alarms != 0 else np.nan,
'TP': hits,
'FN': misses,
'FP': false_alarms
}
sum_predictions = false_alarms + hits
if sum_predictions == 0:
output = {
'pod': np.nan, # Probability of Detection
'far': np.nan, # False Alarm Ratio
#'pofd': false_alarms / (false_alarms + correct_negatives) if false_alarms + correct_negatives != 0 else np.nan, # Probability of False Detection
'csi': np.nan, # Critical Success Index
# 'accuracy': (hits + correct_negatives) / (hits + correct_negatives + false_alarms + misses) if hits + correct_negatives + false_alarms + misses != 0 else np.nan, # Accuracy
'precision': np.nan,
'TP': hits,
'FN': misses,
'FP': false_alarms
}
else:
output = {
'pod': hits / (hits + misses) if hits + misses != 0 else np.nan, # Probability of Detection
'far': false_alarms / (hits + false_alarms) if hits + false_alarms != 0 else np.nan, # False Alarm Ratio
#'pofd': false_alarms / (false_alarms + correct_negatives) if false_alarms + correct_negatives != 0 else np.nan, # Probability of False Detection
'csi': hits / (hits + false_alarms + misses) if hits + false_alarms + misses != 0 else np.nan, # Critical Success Index
# 'accuracy': (hits + correct_negatives) / (hits + correct_negatives + false_alarms + misses) if hits + correct_negatives + false_alarms + misses != 0 else np.nan, # Accuracy
'precision': hits / (hits + false_alarms) if hits + false_alarms != 0 else np.nan,
'TP': hits,
'FN': misses,
'FP': false_alarms
}

return pd.Series(output)

@@ -333,17 +346,17 @@ def calculateCommunePerformance(self):
return scores_byCommune_gdf

if __name__=='__main__':
# impact_csv = f'{cfg.DataDir}/Impact_data/impact_events_per_admin_529.csv'
# comparisonType ='Impact'
# for RPyr in cfg.RPsyr:
# # PTM_events = f'{cfg.DataDir}/PTM/floodevents_admUnit_RP{RPyr}yr.csv'
# ptm_events_df = ptm_events (cfg.DNHstations, cfg.DataDir, RPyr, cfg.StationCombos)
# PTM_events_per_adm = events_per_adm(cfg.DataDir, cfg.admPath, cfg.adminLevel, cfg.DNHstations, cfg.stationsDir, ptm_events_df, 'PTM', RPyr)
# for leadtime in cfg.leadtimes:
# # print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv"))
# analyzer = PredictedToImpactPerformanceAnalyzer(cfg.DataDir, RPyr, impact_csv, cfg.triggerProb, cfg.adminLevel, cfg.admPath, cfg.startYear, cfg.endYear, cfg.years, PTM_events_per_adm, comparisonType, cfg.actionLifetime, 'PTM', leadtime)
# analyzer.matchImpact_and_Trigger()
# analyzer.calculateCommunePerformance()
impact_csv = f'{cfg.DataDir}/Impact_data/impact_events_per_admin_529.csv'
comparisonType ='Impact'
for RPyr in cfg.RPsyr:
# PTM_events = f'{cfg.DataDir}/PTM/floodevents_admUnit_RP{RPyr}yr.csv'
ptm_events_df = ptm_events (cfg.DNHstations, cfg.DataDir, RPyr, cfg.StationCombos)
PTM_events_per_adm = events_per_adm(cfg.DataDir, cfg.admPath, cfg.adminLevel, cfg.DNHstations, cfg.stationsDir, ptm_events_df, 'PTM', RPyr)
for leadtime in cfg.leadtimes:
# print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv"))
analyzer = PredictedToImpactPerformanceAnalyzer(cfg.DataDir, RPyr, impact_csv, cfg.triggerProb, cfg.adminLevel, cfg.admPath, cfg.startYear, cfg.endYear, cfg.years, PTM_events_per_adm, comparisonType, cfg.actionLifetime, 'PTM', leadtime)
analyzer.matchImpact_and_Trigger()
analyzer.calculateCommunePerformance()

comparisonType ='Observation'
for RPyr in cfg.RPsyr:
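For reference, the guarded metric computation introduced above can be expressed more compactly. The standalone sketch below (with made-up hit/miss/false-alarm counts, not values from the PTM analysis) shows how POD, FAR, CSI, and precision are derived from the contingency counts while returning NaN whenever a denominator is zero.

import numpy as np
import pandas as pd

def contingency_scores(hits, misses, false_alarms):
    # Each ratio falls back to NaN when its denominator is zero (e.g. no events predicted at all)
    pod = hits / (hits + misses) if (hits + misses) else np.nan                        # Probability of Detection
    far = false_alarms / (hits + false_alarms) if (hits + false_alarms) else np.nan    # False Alarm Ratio
    csi = hits / (hits + false_alarms + misses) if (hits + false_alarms + misses) else np.nan  # Critical Success Index
    precision = hits / (hits + false_alarms) if (hits + false_alarms) else np.nan
    return pd.Series({'pod': pod, 'far': far, 'csi': csi, 'precision': precision,
                      'TP': hits, 'FN': misses, 'FP': false_alarms})

print(contingency_scores(hits=7, misses=3, false_alarms=2))   # typical case
print(contingency_scores(hits=0, misses=0, false_alarms=0))   # nothing observed or predicted: all scores NaN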
52 changes: 41 additions & 11 deletions comparison/observation/HydroImpact.py
@@ -5,6 +5,8 @@
import scipy.stats as stats
from comparison.pointMatching import attributePoints_to_Polygon
import geopandas as gpd
from scipy.stats import genextreme

def parse_date_with_fallback(date_str, year):
try:
# Try to parse the date with the given year
@@ -58,16 +60,43 @@ def z_RP_station(HydroStations_RP_file, StationName, RP):

def QRP_fit (hydro_df, RP):

# 1. Extract the annual maximum discharge values
# Extract the annual maximum discharge values
hydro_df['Year'] = hydro_df['Date'].dt.year
annual_max_discharge = hydro_df.groupby('Year')['Value'].max()

# 2. Fit a Gumbel distribution to the annual maximum discharge values
# Fit a Gumbel distribution to the annual maximum discharge values
loc, scale = stats.gumbel_r.fit(annual_max_discharge)
# 4. Calculate the discharge value corresponding to the return period
# Calculate the discharge value corresponding to the return period
discharge_value = stats.gumbel_r.ppf(1 - 1/RP, loc, scale)
return discharge_value

def Q_GEV_fit_daily(hydro_df, percentile):
"""
Fits a GEV distribution to the daily discharge values and calculates the discharge
corresponding to a given percentile.
Parameters:
hydro_df (pd.DataFrame): A dataframe with at least 'Date' and 'Value' columns.
'Date' should be a datetime object and 'Value' is the discharge value.
percentile (float): Percentile for which to compute the discharge value (between 0 and 100).
Returns:
float: The discharge value corresponding to the given percentile.
"""
# Ensure 'Date' column is a datetime object
hydro_df['Date'] = pd.to_datetime(hydro_df['Date'])

# Extract daily discharge values
daily_discharge = hydro_df['Value']

# Fit a GEV distribution to the daily discharge values
shape, loc, scale = genextreme.fit(daily_discharge)

# Calculate the discharge value corresponding to the specified percentile
discharge_value = genextreme.ppf(percentile / 100, shape, loc=loc, scale=scale)

return discharge_value


def stampHydroTrigger(hydro_df, RP, StationName):
"""
@@ -89,8 +118,11 @@ def stampHydroTrigger(hydro_df, RP, StationName):
#QRP = QRP_station(HydroStations_RP_file, StationName, RP)
Q_station = hydro_df["Value"]


QRP= QRP_fit (hydro_df, RP)
if RP < 21:
QRP = QRP_fit(hydro_df, RP)
else: # values above 20 are interpreted as percentiles rather than return periods
print('applying a GEV fit, assuming RP is a percentile')
QRP = Q_GEV_fit_daily(hydro_df, RP)
#print (f"for {StationName} : return period Q= {QRP}")
if not isinstance(QRP, (int, float)):
raise ValueError(f"Expected QRP to be a scalar (int or float), but got {type(QRP)}.")
@@ -100,7 +132,6 @@ def stampHydroTrigger(hydro_df, RP, StationName):
# Copy the DataFrame and add the 'trigger' column
hydro_trigger_df = hydro_df.copy()
hydro_trigger_df['Trigger'] = (hydro_trigger_df['Value'] > QRP).astype(int)

return hydro_trigger_df

def createEvent(trigger_df):
Expand All @@ -109,7 +140,6 @@ def createEvent(trigger_df):
trigger_df = trigger_df.sort_values(by=[f'Date']).reset_index(drop=True)

# Prepare commune info with geometry for event creation

event_data = []

# Keep track of which rows have been processed to avoid rechecking
@@ -213,7 +243,7 @@ def events_per_adm (DataDir, admPath, adminLevel, station_csv, StationDataDir, a


if __name__=="__main__":
for RP in cfg.RPsyr:
#print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv"))
event_df = loop_over_stations (cfg.DNHstations, cfg.DataDir, RP)
event_gdf = events_per_adm (cfg.DataDir, cfg.admPath, cfg.adminLevel, cfg.DNHstations, cfg.stationsDir, all_events_df, 'Observation', RP)

#print (readcsv(f"{DataDir}/Données partagées - DNH Mali - 2019/Donne╠ües partage╠ües - DNH Mali - 2019/De╠übit du Niger a╠Ç Ansongo.csv"))
event_df = loop_over_stations (cfg.DNHstations, cfg.DataDir, 95)
event_gdf = events_per_adm (cfg.DataDir, cfg.admPath, cfg.adminLevel, cfg.DNHstations, cfg.stationsDir, event_df, 'Observation', 95)
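
To illustrate the two threshold styles this change switches between, the sketch below uses a synthetic daily discharge series (not the DNH station files): a Gumbel fit on annual maxima gives the return-period threshold used when RP < 21, and a GEV fit on all daily values gives the percentile threshold used otherwise.

import numpy as np
import pandas as pd
from scipy import stats

# Illustrative daily discharge series (two synthetic years), standing in for a station record
rng = np.random.default_rng(0)
dates = pd.date_range('2004-01-01', '2005-12-31', freq='D')
hydro_df = pd.DataFrame({'Date': dates,
                         'Value': rng.gumbel(loc=900, scale=150, size=len(dates))})

# Return-period threshold from annual maxima (Gumbel fit, as in QRP_fit)
annual_max = hydro_df.groupby(hydro_df['Date'].dt.year)['Value'].max()
loc_g, scale_g = stats.gumbel_r.fit(annual_max)
q_rp10 = stats.gumbel_r.ppf(1 - 1 / 10, loc_g, scale_g)

# Percentile threshold from all daily values (GEV fit, as in Q_GEV_fit_daily)
shape, loc_gev, scale_gev = stats.genextreme.fit(hydro_df['Value'])
q_p95 = stats.genextreme.ppf(95 / 100, shape, loc=loc_gev, scale=scale_gev)

print(f"10-year return-period threshold: {q_rp10:.1f}")
print(f"95th-percentile threshold:       {q_p95:.1f}")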
