From 062297b08a4a2b6b32e138dc96f1924b60d0b311 Mon Sep 17 00:00:00 2001
From: dt-woods <davisler@gmail.com>
Date: Fri, 6 Dec 2024 15:50:36 -0500
Subject: [PATCH] new coal transportation from 2023 coal model

Addresses #243. Note that distance data (kg*km) and modes of transport are significantly changed from 2016 using 2020 data.
---
 electricitylci/coal_upstream.py               | 279 +++++++++++++-----
 .../data/coal/2023/coal_transport_dist.csv    |  43 +++
 2 files changed, 252 insertions(+), 70 deletions(-)
 create mode 100644 electricitylci/data/coal/2023/coal_transport_dist.csv

diff --git a/electricitylci/coal_upstream.py b/electricitylci/coal_upstream.py
index 43d2344..581677d 100644
--- a/electricitylci/coal_upstream.py
+++ b/electricitylci/coal_upstream.py
@@ -17,8 +17,10 @@
 from electricitylci.globals import paths
 from electricitylci.globals import data_dir
 from electricitylci.globals import STATE_ABBREV
-from electricitylci.eia923_generation import eia923_download  # +model_specs
+from electricitylci.eia860_facilities import eia860_balancing_authority
+from electricitylci.eia923_generation import eia923_download
 from electricitylci.eia923_generation import eia923_generation_and_fuel
+from electricitylci.model_config import model_specs
 import electricitylci.PhysicalQuantities as pq
 from electricitylci.utils import download
 from electricitylci.utils import find_file_in_folder
@@ -46,7 +48,7 @@
 For the 2023 coal model, see: https://www.osti.gov/biblio/2370100.
 
 Last updated:
-    2024-10-25
+    2024-12-06
 """
 __all__ = [
     "COAL_MINING_LCI_VINTAGE",
@@ -114,7 +116,7 @@
     'Avg Railroad Ton*Miles': 'Railroad',
     'Avg Truck Ton*Miles': 'Truck',
 }
-'''dict : A map between coal model transport columns and their short names.'''
+'''dict : A map from 2016 coal model transport columns to their short names.'''
 
 
 ##############################################################################
@@ -211,47 +213,114 @@ def _process_2023_coal_transport_lci(df, name):
 
 
 def _make_2023_coal_transport_data(year):
-    # IN PROGRESS.
-    #
-    # The 2023 transport distances are by NERC region and coal basin.
-    # The goal is to get a data frame with facility IDs matched to their
-    # NERC region and coal basin.
-
-    # Get NERC regions for coal facilities
-    coal_reg = eia923_generation_and_fuel(year)
-    coal_reg = coal_reg[
-        ['plant_id', 'nerc_region', 'state', 'reported_fuel_type_code']].copy()
-    # Keep only coal facilities
-    coal_filt = coal_reg['reported_fuel_type_code'].isin(coal_type_codes.keys())
-    coal_reg = coal_reg.loc[coal_filt, :]
-    # Lose facilities without a NERC region
-    coal_reg = coal_reg.dropna(subset='nerc_region')
-
-    # Get the basin map.
-    basin_map = pd.read_csv(os.path.join(data_dir, 'eia_to_netl_basin.csv'))
-    basin_map = basin_map.set_index('eia_basin')
-    basin_map = basin_map['netl_basin']
-    basin_map = basin_map.to_dict()
-
-    # Get the state to basin map.
-    state_map = pd.read_csv(os.path.join(data_dir, 'coal_state_to_basin.csv'))
-    state_map = state_map.set_index('state')
-    state_map = state_map['basin1']
-    state_map = state_map.to_dict()
-
-    coal_reg['basin'] = coal_reg['state'].map(state_map)
-    coal_reg.dropna(subset='basin')
-
-    # TODO;
-
-    # The 2023 coal baseline model's 'Transportation' worksheet was
-    # saved to CSV. All distances are in miles.
-    # Columns include 'Basin', 'NERC Region', 'Belt', 'Truck', 'Barge',
-    # 'Ocean Vessel', 'Railroad' (renamed from 'Train'), and 'Total (mi)'.
-    coal_distance = pd.read_csv(
-        os.path.join(
-            data_dir, "coal", "2023", "coal_transportation_distances.csv")
+    """Generate essentially the same data as the CSV file from the 2016
+    baseline, updated with transportation data from the 2023 coal model,
+    where gaps are filled using the U.S. average.
+
+    Transportation data units are kg*km
+    (kilograms of coal x kilometers of distance transported).
+
+    Parameters
+    ----------
+    year : int
+        The year used for facility data from EIA 860.
+
+    Returns
+    -------
+    pandas.DataFrame
+        A data frame with plant IDs, coal basins, NERC regions, and coal
+        transport (kg*km) by mode: Belt, Truck, Barge, Ocean Vessel, and Train.
+
+    Raises
+    ------
+    OSError
+        If the data file is not found.
+    """
+    # Generate the coal upstream map, which labels each facility with its
+    # coal source code: a three-part combo of coal basin, coal type, and
+    # mine type. We only want the coal basin data from this.
+    coal_map_df = generate_upstream_coal_map(year)
+    coal_map_df["Basin"] = coal_map_df["coal_source_code"].str.split("-").str[0]
+
+    # Now, let's find the NERC region for each facility.
+    ba_region_df = eia860_balancing_authority(year, regional_aggregation=None)
+
+    # Let's create a dictionary that maps facilities to their NERC region,
+    # fixing the plant ID from string to integer along the way.
+    # We don't need the heat input or the old coal source code, so let's drop
+    # them.
+    region_dict = dict(
+        zip(ba_region_df["Plant Id"], ba_region_df["NERC Region"])
     )
+    region_dict = {int(k): v for k, v in region_dict.items()}
+    coal_map_df['NERC Region'] = coal_map_df['plant_id'].map(region_dict)
+    coal_map_df = coal_map_df.drop(columns=['coal_source_code', 'heat_input'])
+
+    # Read the 2023 coal model transportation data
+    # Source: https://github.com/USEPA/ElectricityLCI/discussions/273
+    coal_dir = os.path.join(data_dir, "coal", "2023")
+    coal_file = os.path.join(coal_dir, "coal_transport_dist.csv")
+    if not os.path.isfile(coal_file):
+        raise OSError(
+            "Failed to find 2023 coal transportation "
+            "data file, '%s'" % coal_file)
+    coal_trans_df = pd.read_csv(coal_file)
+
+    # NOTE: the 2023 coal model uses a slightly different naming scheme
+    # for WNW coal basin, so let's fix it.
+    basin_codes_new = {k:v for k, v in basin_codes.items()}
+    del basin_codes_new["West/Northwest"]
+    basin_codes_new["West/North West"] = "WNW"
+
+    # Now, map the basin names to their basin codes.
+    # NOTE this works for all basins except for "U.S. Average"
+    coal_trans_df["Basin"] = coal_trans_df["Basin"].map(basin_codes_new)
+
+    # Some facilities may not map to our coal model, so let's save the
+    # U.S. average and use it for them.
+    # TODO: Consider saving the weighted averages for regions as well!
+    us_ave_coal_trans = coal_trans_df.loc[coal_trans_df['Basin'].isna(), :]
+    us_ave_coal_trans = us_ave_coal_trans.reset_index(drop=True)
+
+    # Drop the NaNs from our coal transportation data frame
+    # (i.e., the U.S. average that we saved separately).
+    coal_trans_df = coal_trans_df.dropna().copy()
+
+    # Put it all together by merging our transportation data and the
+    # coal data using the NERC region and coal basin codes as the
+    # common attributes.
+    final_df = pd.merge(
+        left=coal_map_df,
+        right=coal_trans_df,
+        on=['Basin', 'NERC Region'],
+        how='left',
+    )
+
+    # there are facilities not mapped to transportation; let's give them the
+    # U.S. average values
+    # TODO: consider using weighted-average regional values.
+    final_df = final_df.fillna({
+        'Belt': us_ave_coal_trans.loc[0, 'Belt'],
+        'Truck': us_ave_coal_trans.loc[0, 'Truck'],
+        'Barge': us_ave_coal_trans.loc[0, 'Barge'],
+        'Ocean Vessel': us_ave_coal_trans.loc[0, 'Ocean Vessel'],
+        'Train': us_ave_coal_trans.loc[0, 'Train'],
+    })
+
+    # The transportation data from the coal model are in miles.
+    # Let's convert miles to kilometers, and calculate the kg*km values by
+    # multiplying the quantity (kg of coal) by transportation distance
+    # (miles converted to km).
+    mi_to_km = pq.convert(1, 'mi', 'km')
+
+    trans_cols = ["Belt", "Truck", "Barge", "Ocean Vessel", "Train"]
+    final_df[trans_cols] = final_df[trans_cols].mul(mi_to_km)
+    final_df[trans_cols] = final_df[trans_cols].mul(
+        final_df["quantity"],
+        axis=0
+    )
+
+    return final_df
 
 
 def _make_ave_transport(trans_df, lci_df):
@@ -969,6 +1038,100 @@ def get_2023_ave_coal_transport(trans_df, input_df):
     return trans_lci
 
 
+def get_coal_transportation():
+    """Create the coal transport data frame in kilograms of coal by kilometers
+    of distance transported for each facility by transportation type
+    (e.g. 'Barge' or 'Truck').
+
+    Returns
+    -------
+    pandas.DataFrame
+        A three-column data frame of 'plant_id', 'coal_source_code'
+        (i.e., transportation type like 'Truck' or 'Barge'), and 'quantity'
+        (i.e., transportation of kilograms of coal by kilometers of distance).
+
+        The 2020 version has five types of transportation (i.e., 'Barge', 'Lake
+        Vessel', 'Ocean Vessel', 'Railroad', and 'Truck').
+
+        The 2023 version has five types of transportation (i.e., 'Barge',
+        'Belt', 'Ocean Vessel', 'Railroad', and 'Truck').
+
+    Raises
+    ------
+    ValueError
+        If `COAL_TRANSPORT_LCI_VINTAGE` is neither 2020 nor 2023.
+
+    Notes
+    -----
+    Method depends on the global parameter, `COAL_TRANSPORT_LCI_VINTAGE`.
+    For 2020, the 2016 baseline's ABB data file is referenced (i.e.,
+    '2016_Coal_Trans_By_Plant_ABB_Data.csv').
+    For 2023, the 2023 coal baseline data file is referenced
+    (i.e., 'coal_transport_dist.csv' in the coal/2023 folder of data).
+    """
+    # IN PROGRESS
+    if COAL_TRANSPORT_LCI_VINTAGE == 2020:
+        # The 2016 transportation data by facility.
+        logging.info("Using 2016 coal baseline transportation distance data.")
+        coal_transportation = pd.read_csv(
+            os.path.join(data_dir, '2016_Coal_Trans_By_Plant_ABB_Data.csv')
+        )
+        # Make rows facility IDs with Transport column (modes) and
+        # value (ton*mi)
+        coal_transportation = coal_transportation.melt(
+            'Plant Government ID',
+            var_name='Transport'
+        )
+        # NOTE: the 2016 transportation functional unit is ton*miles;
+        # convert ton*mi to kg*km
+        coal_transportation["value"] = (
+            coal_transportation["value"]
+            * pq.convert(1, "ton", "kg")
+            * pq.convert(1, "mi", "km")
+        )
+        # Rename transport columns
+        coal_transportation = coal_transportation.rename(columns={
+            'Plant Government ID': 'plant_id',
+            'Transport': 'coal_source_code',
+            'value': 'quantity',
+        })
+        # Correct coal_transportation codes
+        coal_transportation['coal_source_code'] = coal_transportation.apply(
+            _transport_code, axis=1)
+    elif COAL_TRANSPORT_LCI_VINTAGE == 2023:
+        logging.info("Using 2023 coal model transportation distance data")
+        coal_transportation = _make_2023_coal_transport_data(
+            model_specs.eia_gen_year)
+
+        # NOTE: the 2016 baseline uses 'Railroad' in place of 'Train'
+        coal_transportation = coal_transportation.rename(
+            columns={'Train': 'Railroad'}
+        )
+
+        # The data frame must be melted to match the 2016 data frame, which has
+        # three columns: plant_id, coal_source_code (i.e., transportation type),
+        # and quantity (i.e., the kg*km values).
+        coal_transportation = coal_transportation.melt(
+            id_vars=("plant_id",),
+            value_vars=('Belt', 'Truck', 'Barge', 'Ocean Vessel', 'Railroad')
+        )
+
+        # To allow facilities receiving coal from more than one region/basin,
+        # group by facility and sum by transportation type.
+        coal_transportation = coal_transportation.groupby(by=['plant_id', 'variable']).agg({'value': 'sum'}).reset_index(drop=False)
+
+        # Rename to match the 2016 data frame
+        coal_transportation = coal_transportation.rename(
+            columns={'variable': 'coal_source_code', 'value': 'quantity'}
+        )
+    else:
+        raise ValueError(
+            "The coal transport year, %d, "
+            "is unknown!" % COAL_TRANSPORT_LCI_VINTAGE)
+
+    return coal_transportation
+
+
 def read_coal_mining():
     """Read coal mining (extraction and processing) life cycle inventory.
 
@@ -1071,33 +1234,8 @@ def read_coal_transportation():
         - 'input', whether flow is resource (true) or emission (false)
 
     """
-    # Presently, we only have the 2016 transportation data by facility.
-    logging.info("Using 2016 coal plant transportation data.")
-    coal_transportation = pd.read_csv(
-        os.path.join(data_dir, '2016_Coal_Trans_By_Plant_ABB_Data.csv')
-    )
-    # Make rows facility IDs with Transport column (modes) and
-    # value (ton*mi)
-    coal_transportation = coal_transportation.melt(
-        'Plant Government ID',
-        var_name='Transport'
-    )
-    # NOTE: the 2016 transportation functional unit is ton*miles;
-    # convert ton*mi to kg*km
-    coal_transportation["value"] = (
-        coal_transportation["value"]
-        * pq.convert(1, "ton", "kg")
-        * pq.convert(1, "mi", "km")
-    )
-    # Rename transport columns
-    coal_transportation = coal_transportation.rename(columns={
-        'Plant Government ID': 'plant_id',
-        'Transport': 'coal_source_code',
-        'value': 'quantity',
-    })
-    # Correct coal_transportation codes
-    coal_transportation['coal_source_code'] = coal_transportation.apply(
-        _transport_code, axis=1)
+    # Get the appropriate coal transportation distance data:
+    coal_transportation = get_coal_transportation()
 
     # FORK IN THE ROAD
     if COAL_TRANSPORT_LCI_VINTAGE == 2023:
@@ -1467,6 +1605,7 @@ def wtd_mean(pdser, total_db):
     invent_plants = coal_mining_inventory_df['plant_id'].unique()
 
     # Check for any inventory plants that don't have transportation LCI.
+    # NOTE: this should not occur unless the LCI vintage years are mismatched.
     missing_plants = [
         int(x) for x in invent_plants if x not in trans_plants]
     num_miss_plants = len(missing_plants)
diff --git a/electricitylci/data/coal/2023/coal_transport_dist.csv b/electricitylci/data/coal/2023/coal_transport_dist.csv
new file mode 100644
index 0000000..283da68
--- /dev/null
+++ b/electricitylci/data/coal/2023/coal_transport_dist.csv
@@ -0,0 +1,43 @@
+﻿Basin,NERC Region,Belt,Truck,Barge,Ocean Vessel,Train
+Central Appalachia,FRCC,0,0,0,0,1107.977752
+Central Appalachia,MRO,0,0,0,307,560.0852524
+Central Appalachia,NPCC,0,0,0,0,885.1087031
+Central Appalachia,RFC,0,5.746550107,104.3861809,2.733886858,82.4622286
+Central Appalachia,SERC,0,1.212929678,7.506436059,0,432.0265178
+Central Appalachia,Weighted average,0,2.481161639,35.04156235,0.925235054,345.1554444
+Central Interior,SPP,0,18.17334589,0,0,18.44396159
+Gulf Lignite,ERCOT,0.636293625,5.68677811,0,0,3.57183154
+Gulf Lignite,SERC,0,0.5,0,0,0
+Gulf Lignite,SPP,4.451175234,0.528544448,0,0,0
+Gulf Lignite,Weighted average,1.220014482,4.465128744,0,0,2.727310722
+Illinois Basin,FRCC,0,7.329290563,365.4292272,1.516801349,621.4660044
+Illinois Basin,MRO,0,0,0,0,343.2172334
+Illinois Basin,RFC,0,10.21258341,135.6002717,0.115027265,48.01586662
+Illinois Basin,SERC,0.384204428,6.31479746,92.99234444,0,207.5130215
+Illinois Basin,SPP,0,14.21745361,878.4490668,0,27.80648186
+Illinois Basin,Weighted average,0.196243583,7.893121021,138.4167554,0.194932872,188.1140205
+Lignite,MRO,3.247430368,0.749585777,0,0,7.341607743
+Northern Appalachia,FRCC,0,0,0,0,1387.352562
+Northern Appalachia,MRO,0,0,0,165,695
+Northern Appalachia,NPCC,0,1.830338124,0,0,553.1587493
+Northern Appalachia,RFC,0.166831582,7.542097031,75.14958068,1.691337393,78.66021175
+Northern Appalachia,SERC,0,6.086323608,22.51291511,0,550.1558208
+Northern Appalachia,Weighted average,0.139705124,7.232441815,66.26487988,1.436492556,160.4347721
+Powder River Basin,ERCOT,0,0,0,0,1520.471575
+Powder River Basin,MRO,0,0,8.904027538,0.06946903,876.263257
+Powder River Basin,RFC,0,0,0,211.7130418,1182.554026
+Powder River Basin,SERC,0,0,21.8143319,0,1362.973426
+Powder River Basin,SPP,0,0,0,0,927.4462063
+Powder River Basin,WECC,1.283299364,0,0,0,398.1586126
+Powder River Basin,Weighted average,0.185069808,0,7.418959303,31.03121824,1059.925682
+Rocky Mountain,FRCC,0,0,0,436.894845,1992.837173
+Rocky Mountain,MRO,0,0,0,343.8040108,1296.735856
+Rocky Mountain,RFC,0,0,0.113565012,0,1787.926627
+Rocky Mountain,SERC,0,0,0,0,2029
+Rocky Mountain,WECC,0.87258711,13.07121003,0,0,83.3385493
+Rocky Mountain,Weighted average,0.846069985,12.6739879,0.001283021,5.150114206,138.1862896
+Southern Appalachia,SERC,0,32.62822162,28.1533781,0,1.349550346
+West/North West,ASCC,0,4,0,0,0
+West/North West,WECC,0,4,0,0,0
+West/North West,Weighted average,0,4,0,0,0
+U.S. Average,U.S. Average,0.39809141,3.778318915,35.09228677,42.13749849,577.2729147
\ No newline at end of file