epa_ghgrp 20241223 changes #1154

Open · wants to merge 4 commits into base: master
184 changes: 117 additions & 67 deletions scripts/us_epa/ghgrp/download.py
@@ -1,93 +1,123 @@
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Module to download and do light processing on import data."""
# TODO(beets): Add tests

import io
import logging
import os
import re
import ssl
import zipfile
from datetime import datetime

import pandas as pd
import requests

DOWNLOAD_URI = 'https://www.epa.gov/sites/default/files/2020-11/2019_data_summary_spreadsheets.zip'
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# URL templates
download_url = 'https://www.epa.gov/system/files/other-files/{year}-10/{year_minus_1}_data_summary_spreadsheets.zip'
crosswalk_url = 'https://www.epa.gov/system/files/documents/{yr}-04/ghgrp_oris_power_plant_crosswalk_12_13_21.xlsx'

# Constants
YEAR_DATA_FILENAME = 'ghgp_data_{year}.xlsx'
HEADER_ROW = 3
CROSSWALK_URI = 'https://www.epa.gov/sites/default/files/2020-12/ghgrp_oris_power_plant_crosswalk_11_24_20.xlsx'
CROSSWALK_COLS_TO_KEEP = [
    'GHGRP Facility ID', 'ORIS CODE', 'ORIS CODE 2', 'ORIS CODE 3',
    'ORIS CODE 4', 'ORIS CODE 5'
]
GHGRP_ID_COL = 'Facility Id'

_DIRECT_EMITTERS_SHEET = 'Direct Emitters'
_DIRECT_EMITTERS_SHEET = r"^Direct.*Emitters$"

SHEET_NAMES_TO_CSV_FILENAMES = {
    _DIRECT_EMITTERS_SHEET: 'direct_emitters.csv',
    'Onshore Oil & Gas Prod.': 'oil_and_gas.csv',
    'Gathering & Boosting': 'gathering_and_boosting.csv',
    'LDC - Direct Emissions': 'local_distribution.csv',
    'SF6 from Elec. Equip.': 'elec_equip.csv',
    # Needs schema:
    # - 'Transmission Pipelines',
    # The following sheets are skipped due to sparse data:
    # - 'Suppliers',
    # - 'CO2 Injection',
    # - 'Geologic Sequestration of CO2',
}

def get_csv_filename(sheet_name):
    """
    Determines the CSV filename for a given sheet name.
    Sheets matching the _DIRECT_EMITTERS_SHEET pattern are saved as 'direct_emitters.csv'.
    """
    if re.match(_DIRECT_EMITTERS_SHEET, sheet_name):
        return 'direct_emitters.csv'
    return SHEET_NAMES_TO_CSV_FILENAMES.get(sheet_name)

class Downloader:
    """
    The following must be called in order. Earlier steps can be skipped if they have successfully completed in a previous run.
    - download_data
    - extract_all_years
    - save_all_crosswalks
    Handles downloading, extracting, and processing data files.
    """

    def __init__(self, save_path):
        self.years = list(range(2010, 2020))
        self.years = list(range(2010, datetime.now().year))
        self.current_year = None
        self.files = []  # list of (year, filename) of all extracted files
        self.save_path = save_path

    def download_data(self):
        """Downloads and unzips excel files from DOWNLOAD_URI."""
        print(f'Downloading data')
        r = requests.get(DOWNLOAD_URI)
        z = zipfile.ZipFile(io.BytesIO(r.content))
        z.extractall(self.save_path)

        # Ensure the save directory exists
        os.makedirs(self.save_path, exist_ok=True)

    def check_url(self, url):
        """
        Checks if a given URL is accessible.
        """
        try:
            response = requests.head(url)
            response.raise_for_status()
            logging.info(f"URL is valid: {url}")
            return True
        except requests.RequestException as e:
            logging.warning(f"URL check failed: {url}. Error: {e}")
            return False

    def generate_and_validate(self, template, **kwargs):
        """
        Generates a URL using a template and validates its existence.
        """
        url = template.format(**kwargs)
        if not self.check_url(url):
            raise ValueError(f"URL not valid: {url}")
        return url

    def download_data(self, year, year_minus_1):
        """
        Downloads and unzips Excel files from the dynamically generated download_url.
        """
        uri = self.generate_and_validate(download_url, year=year, year_minus_1=year_minus_1)
        logging.info(f'Downloading data from {uri}')
        try:
            r = requests.get(uri)
            r.raise_for_status()  # Raise an error for unsuccessful responses
            z = zipfile.ZipFile(io.BytesIO(r.content))
            for file in z.namelist():
                # Skip directories
                if not file.endswith('/'):
                    target_path = os.path.join(self.save_path, os.path.basename(file))
                    with z.open(file) as source, open(target_path, 'wb') as target:
                        target.write(source.read())
        except Exception as e:
            logging.error(f"Failed to download or extract data for {year}: {e}")

    def extract_all_years(self):
        """
        Saves relevant sheets from each year's Excel file to a CSV.
        """
        headers = {}
        for sheet, _ in SHEET_NAMES_TO_CSV_FILENAMES.items():
            headers[sheet] = {}
        for current_year in self.years:
            logging.info(f'Extracting data for {current_year}')
            self.current_year = current_year
            self._extract_data(headers)
        for sheet, csv_name in SHEET_NAMES_TO_CSV_FILENAMES.items():
            headers_df = pd.DataFrame.from_dict(headers[sheet], orient='index')
            headers_df.transpose().to_csv(os.path.join(self.save_path, f'cols_{csv_name}'), index=None)
        return self.files

    def save_all_crosswalks(self, filepath):
        """
        Builds individual year crosswalks, as well as a joint crosswalk for all years.
        """
        logging.info('Saving all ID crosswalks')
        crosswalks = []
        for current_year in self.years:
            crosswalks.append(self._gen_crosswalk())
@@ -105,47 +135,67 @@ def _csv_path(self, csv_filename, year=None):

    def _extract_data(self, headers):
        summary_filename = os.path.join(
            self.save_path, YEAR_DATA_FILENAME.format(year=self.current_year)
        )

        xl = pd.ExcelFile(summary_filename, engine='openpyxl')
        logging.info(f"Available sheets in {summary_filename}: {xl.sheet_names}")
        check_list = []
        for sheet in xl.sheet_names:
            csv_filename = SHEET_NAMES_TO_CSV_FILENAMES.get(sheet, None)
            csv_filename = get_csv_filename(sheet)
            check_list.append(csv_filename)
            if not csv_filename:
                logging.info(f'Skipping sheet: {sheet}')
                continue
            summary_file = xl.parse(sheet, header=HEADER_ROW, dtype=str)
            csv_path = self._csv_path(csv_filename)
            summary_file.to_csv(csv_path, index=None, header=True)
            headers.setdefault(sheet, {})[self.current_year] = summary_file.columns
            self.files.append((self.current_year, csv_path))
        if "direct_emitters.csv" not in check_list:
            logging.error(f"'direct_emitters.csv' not found in the sheets for {self.current_year}. Aborting!")
            raise SystemExit(f"Missing required sheet for 'direct_emitters.csv' in year {self.current_year}. Exiting.")


    def _gen_crosswalk(self):
        # Per https://stackoverflow.com/a/56230607
        ssl._create_default_https_context = ssl._create_unverified_context
        try:
            oris_df = pd.read_excel(
                self.generate_and_validate(crosswalk_url, yr=self.current_year),
                'ORIS Crosswalk',
                header=0,
                dtype=str,
                usecols=CROSSWALK_COLS_TO_KEEP,
                engine='openpyxl'
            )
        except Exception:
            logging.warning("Falling back to the 2022 crosswalk URL")
            oris_df = pd.read_excel(
                self.generate_and_validate(crosswalk_url, yr=2022),
                'ORIS Crosswalk',
                header=0,
                dtype=str,
                usecols=CROSSWALK_COLS_TO_KEEP,
                engine='openpyxl'
            )

        oris_df = pd.read_excel(CROSSWALK_URI,
                                'ORIS Crosswalk',
                                header=0,
                                dtype=str,
                                usecols=CROSSWALK_COLS_TO_KEEP,
                                engine='openpyxl')
        oris_df = oris_df.rename(columns={'GHGRP Facility ID': GHGRP_ID_COL})
        all_facilities_df = pd.DataFrame()
        for sheet, csv_filename in SHEET_NAMES_TO_CSV_FILENAMES.items():
            csv_path = self._csv_path(csv_filename)
            if not os.path.exists(csv_path):
                continue
            df = pd.read_csv(csv_path, usecols=[GHGRP_ID_COL, 'FRS Id'], dtype=str)
            all_facilities_df = all_facilities_df.append(df)
            all_facilities_df = pd.concat([all_facilities_df, df], ignore_index=True)
        all_facilities_df = all_facilities_df.join(
            oris_df.set_index(GHGRP_ID_COL), on=GHGRP_ID_COL, how='left')
        return all_facilities_df


if __name__ == '__main__':
    downloader = Downloader('tmp_data')
    downloader.download_data()
    url_year = datetime.now().year
    if url_year < 2030:
        downloader.download_data(url_year, url_year - 1)
    downloader.extract_all_years()
    downloader.save_all_crosswalks(
        os.path.join(self.save_path, 'crosswalks.csv'))
    downloader.save_all_crosswalks(os.path.join(downloader.save_path, 'crosswalks.csv'))
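A quick way to sanity-check the renamed URL templates before a full run is a sketch like the one below (not part of this PR). It uses only names defined in download.py above; the probe years are an assumption, since the template's {year}-10 segment suggests EPA republishes the zip under a new path each fall.

from datetime import datetime

from download import Downloader, download_url, crosswalk_url

probe = Downloader('tmp_data')  # the new __init__ also creates tmp_data/
year = datetime.now().year
for candidate in (year, year - 1):
    # Fill the template for a candidate publication year and HEAD-check it.
    url = download_url.format(year=candidate, year_minus_1=candidate - 1)
    print(candidate, probe.check_url(url))  # logs and returns True/False
print(probe.check_url(crosswalk_url.format(yr=2022)))  # the fallback crosswalk year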
40 changes: 40 additions & 0 deletions scripts/us_epa/ghgrp/gen_data.py
@@ -0,0 +1,40 @@
from gas import *
from process import *
from sources import *
from download import *

import os
from datetime import datetime

_FACILITY_ID = 'Facility Id'
_DCID = 'dcid'
_SV = 'sv'
_YEAR = 'year'
_VALUE = 'value'
_OUT_FIELDNAMES = [_DCID, _SV, _YEAR, _VALUE]
_SAVE_PATH = 'tmp_data'
_OUT_PATH = 'import_data'

if __name__ == '__main__':
    with open(os.path.join(_OUT_PATH, 'gas_node.mcf'), 'w') as fp:
        append_gas_mcf(fp)
    with open(os.path.join(_OUT_PATH, 'gas_sv.mcf'), 'w') as fp:
        append_sv_mcf(fp)

    with open(os.path.join(_OUT_PATH, 'sources_node.mcf'), 'w') as fp:
        append_source_mcf(fp)
    with open(os.path.join(_OUT_PATH, 'sources_sv.mcf'), 'w') as fp:
        append_sv_mcf(fp)

    downloader = Downloader(_SAVE_PATH)
    url_year = datetime.now().year
    if url_year < 2030:
        downloader.download_data(url_year, url_year - 1)
    files = downloader.extract_all_years()
    crosswalk_file = os.path.join(_SAVE_PATH, 'crosswalks.csv')
    downloader.save_all_crosswalks(crosswalk_file)
    # `cw` is expected to come from one of the wildcard imports above.
    crosswalk = cw.Crosswalk(crosswalk_file)
    process_data(files, crosswalk, os.path.join(_OUT_PATH, 'all_data.csv'))
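download.py still carries TODO(beets): Add tests; a minimal pytest-style sketch for the new get_csv_filename helper could start like this. The 'Direct GHG Emitters' spelling is a hypothetical example of the year-to-year sheet renames the regex key is meant to absorb.

from download import get_csv_filename

def test_direct_emitters_variants():
    # Any sheet name starting with 'Direct' and ending in 'Emitters'
    # should resolve to the shared direct_emitters.csv output.
    assert get_csv_filename('Direct Emitters') == 'direct_emitters.csv'
    assert get_csv_filename('Direct GHG Emitters') == 'direct_emitters.csv'

def test_literal_and_unknown_sheets():
    # Literal keys fall through to the dict lookup; unknown sheets return None.
    assert get_csv_filename('Gathering & Boosting') == 'gathering_and_boosting.csv'
    assert get_csv_filename('Suppliers') is None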
19 changes: 19 additions & 0 deletions scripts/us_epa/ghgrp/manifest.json
@@ -0,0 +1,19 @@
{
  "import_specifications": [
    {
      "import_name": "EPA_GHGRP",
      "curator_emails": ["[email protected]"],
      "provenance_url": "https://www.epa.gov/ghgreporting",
      "provenance_description": "EPA emissions data reported as part of the Greenhouse Gas Reporting Program (GHGRP).",
      "scripts": ["gen_data.py"],
      "import_inputs": [
        {
          "template_mcf": "import_data/observations.tmcf",
          "cleaned_csv": "import_data/all_data.csv"
        }
      ],
      "cron_schedule": "0 03 * * 1"
    }
  ]
}
