Skip to content

Commit

Permalink
Clean up the Excel extractor so that it no longer looks for a single file
Browse files Browse the repository at this point in the history
  • Loading branch information
cmgosnell committed Dec 29, 2023
1 parent d26b4aa commit dd47c1f
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 28 deletions.
40 changes: 12 additions & 28 deletions src/pudl/extract/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,36 +327,20 @@ def load_excel_file(self, page, **partition):
pd.ExcelFile instance with the parsed excel spreadsheet frame
"""
xlsx_filename = self.excel_filename(page, **partition)

if xlsx_filename not in self._file_cache:
excel_file = None
try:
# eia860m exports the resources as raw xlsx files that are not
# embedded in zip archives. To support this, we will first try
# to retrieve the resource directly. If this fails, we will attempt
# to open zip archive and locate the xlsx file inside that.

# TODO(rousik): if we can make it so, it would be useful to normalize
# the eia860m and zip the xlsx files. Then we could simplify this code.
res = self.ds.get_unique_resource(
self._dataset_name, name=xlsx_filename
zf = self.ds.get_zipfile_resource(self._dataset_name, **partition)
# try to open a dbf file.
extension = pathlib.Path(xlsx_filename).suffix.lower()
if extension == ".dbf":
dbf_filepath = zf.open(xlsx_filename)
df = pd.DataFrame(
iter(dbfread.DBF(xlsx_filename, filedata=dbf_filepath))
)
excel_file = pd.ExcelFile(res)
except KeyError:
zf = self.ds.get_zipfile_resource(self._dataset_name, **partition)

# If loading the excel file from the zip fails then try to open a dbf file.
extension = pathlib.Path(xlsx_filename).suffix.lower()
if extension == ".dbf":
dbf_filepath = zf.open(xlsx_filename)
df = pd.DataFrame(
iter(dbfread.DBF(xlsx_filename, filedata=dbf_filepath))
)
excel_file = pudl.helpers.convert_df_to_excel_file(df, index=False)
else:
excel_file = pd.ExcelFile(BytesIO(zf.read(xlsx_filename)))
finally:
self._file_cache[xlsx_filename] = excel_file
excel_file = pudl.helpers.convert_df_to_excel_file(df, index=False)
else:
excel_file = pd.ExcelFile(BytesIO(zf.read(xlsx_filename)))

self._file_cache[xlsx_filename] = excel_file
# TODO(rousik): this _file_cache could be replaced with @cache or @memoize annotations
return self._file_cache[xlsx_filename]

Expand Down
1 change: 1 addition & 0 deletions src/pudl/workspace/datastore.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,7 @@ def get_unique_resource(self, dataset: str, **filters: Any) -> bytes:

def get_zipfile_resource(self, dataset: str, **filters: Any) -> zipfile.ZipFile:
    """Retrieve the unique resource matching the filters and open it as a ZipFile.

    Args:
        dataset: name of the dataset, passed through to ``get_unique_resource``.
        **filters: keyword filters, passed through to ``get_unique_resource``.

    Returns:
        A ``zipfile.ZipFile`` reading from an in-memory copy of the resource bytes.

    Raises:
        zipfile.BadZipFile: if the retrieved resource is not a valid zip archive.
            The message names the dataset and filters so the offending resource
            can be identified.
    """
    # Fetch outside the try block so lookup errors propagate untouched.
    payload = self.get_unique_resource(dataset, **filters)
    try:
        return zipfile.ZipFile(io.BytesIO(payload))
    except zipfile.BadZipFile as err:
        # Same exception type as before, but with enough context to tell which
        # resource was not a zip archive.
        raise zipfile.BadZipFile(
            f"Resource for dataset {dataset!r} with filters {filters!r} "
            "is not a valid zip file."
        ) from err

def get_zipfile_resources(
Expand Down

0 comments on commit dd47c1f

Please sign in to comment.