Merge pull request OSOceanAcoustics#268 from brandynlucca/biological_data_column_naming

Translation dictionary for data column names
brandynlucca authored Sep 13, 2024
2 parents 7e18a9a + 125d63a commit 819cf80
Showing 4 changed files with 54 additions and 75 deletions.
39 changes: 35 additions & 4 deletions echopop/core.py
@@ -72,6 +72,37 @@
},
}

# Name configuration dictionary
NAME_CONFIG = {
"Age": "age",
"Cell portion": "fraction_cell_in_polygon",
"Frequency": "length_count",
"haul": "haul_num",
"haul end": "haul_end",
"haul start": "haul_start",
"Haul": "haul_num",
"Latitude": "latitude",
"Latitude (upper limit)": "northlimit_latitude",
"Latitude of centroid": "centroid_latitude",
"Length": "length",
"Longitude": "longitude",
"Longitude of centroid": "centroid_longitude",
"strata": "stratum_num",
"Sex": "sex",
"Ship": "ship_id",
"Spacing": "transect_spacing",
"Species_Code": "species_id",
"Species_Name": "species_name",
"Strata Index": "stratum_num",
"Stratum": "stratum_num",
"Transect": "transect_num",
"VL start": "vessel_log_start",
"VL end": "vessel_log_end",
"wt": "fraction_hake",
"Weight": "weight",
"Weight_In-Haul": "haul_weight",
}

# `Survey` object data structure
CONFIG_MAP = {
"biological": {
@@ -115,14 +146,14 @@
"geo_strata": {
"stratum_num": int,
"northlimit_latitude": np.float64,
"haul start": int,
"haul end": int,
"haul_start": int,
"haul_end": int,
},
"inpfc_strata": {
"stratum_num": int,
"northlimit_latitude": np.float64,
"haul start": int,
"haul end": int,
"haul_start": int,
"haul_end": int,
},
},
"NASC": {
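
For context, a minimal sketch (not part of the commit) of how the new translation dictionary is intended to be used: raw Excel-style headers are mapped onto the snake_case names used throughout echopop with a plain pandas rename. The DataFrame below is hypothetical.

import pandas as pd

from echopop.core import NAME_CONFIG

# Hypothetical biological data with the original Excel-style headers
raw = pd.DataFrame(
    {
        "Haul": [1, 2],
        "Species_Code": [22500, 22500],
        "Length": [34.0, 41.0],
        "Weight": [0.35, 0.61],
    }
)

# Translate the column names via the dictionary added to echopop/core.py
renamed = raw.rename(columns=NAME_CONFIG)
print(renamed.columns.tolist())  # ['haul_num', 'species_id', 'length', 'weight']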
87 changes: 19 additions & 68 deletions echopop/utils/load.py
@@ -5,7 +5,6 @@
import numpy as np
import pandas as pd
import yaml
from openpyxl import load_workbook

from ..core import (
BIODATA_HAUL_MAP,
@@ -14,6 +13,7 @@
CONFIG_MAP,
DATA_STRUCTURE,
LAYER_NAME_MAP,
NAME_CONFIG,
)
from .data_structure_utils import map_imported_datasets

@@ -199,9 +199,6 @@ def load_dataset(
else:
config_map[2] = region_id

# Validate column names of this iterated file
validate_data_columns(file_name, sheet_name, config_map, validation_settings)

# Validate datatypes within dataset and make appropriate changes to dtypes
# ---- This first enforces the correct dtype for each imported column
# ---- This then assigns the imported data to the correct class attribute
@@ -237,12 +234,6 @@
# Update configuration key map
config_map = [dataset, datalayer]

# Validate datatypes within dataset and make appropriate changes to dtypes
# (if necessary)
# ---- This first enforces the correct dtype for each imported column
# ---- This then assigns the imported data to the correct class attribute
validate_data_columns(file_name, sheets, config_map, validation_settings)

# Read in data and add to `Survey` object
read_validated_data(
input_dict,
@@ -296,11 +287,14 @@ def read_validated_data(
df_initial = df_initial.drop(0)

# Slice only the columns that are relevant to the echopop module functionality
valid_columns = list(set(validation_settings.keys()).intersection(set(df_initial.columns)))
df_filtered = df_initial[valid_columns]
df_filtered = df_initial.filter(validation_settings)

# Ensure the order of columns in df_filtered matches df_initial
df_filtered = df_filtered[df_initial.columns]
# Error evaluation and print message (if applicable)
if not set(validation_settings).issubset(set(df_filtered)):
missing_columns = set(validation_settings.keys()) - set(df_filtered)
raise ValueError(
f"Missing kriging/variogram parameters in the Excel file: {missing_columns}"
)

# Apply data types from validation_settings to the filtered DataFrame
df = df_filtered.apply(
@@ -311,7 +305,15 @@

else:
# Read Excel file into memory -- this only reads in the required columns
df = pd.read_excel(file_name, sheet_name=sheet_name, usecols=validation_settings.keys())
# df = pd.read_excel(file_name, sheet_name=sheet_name, usecols=validation_settings.keys())
df = pd.read_excel(file_name, sheet_name=sheet_name)
# ---- Rename the columns, if needed, and then filter them
df = df.rename(columns=NAME_CONFIG).filter(validation_settings)

# Error evaluation and print message (if applicable)
if not set(validation_settings).issubset(set(df)):
missing_columns = set(validation_settings.keys()) - set(df)
raise ValueError(f"Missing columns in the Excel file: {missing_columns}")

# Apply data types from validation_settings to the filtered DataFrame
df = df.apply(lambda col: col.astype(validation_settings.get(col.name, type(col[0]))))
@@ -373,62 +375,11 @@ def read_validated_data(
input_dict["acoustics"]["nasc_df"][column_to_add] = df[column_to_add]
else:
raise ValueError(
"""Unexpected data attribute structure. Check API settings located in"""
"""the configuration YAML and core.py"""
"Unexpected data attribute structure. Check the settings in "
"the configuration YAML and core.py."
)


def validate_data_columns(
file_name: Path, sheet_name: str, config_map: list, validation_settings: dict
):
"""
Opens a virtual instance of each .xlsx file to validate the presence
of require data column/variable names
Parameters
----------
file_name: Path
File path of data
sheet_name: str
Name of Excel sheet containing data
config_map: list
A list parsed from the file name that indicates how data attributes
within `self` are organized
validation_settings: dict
The subset CONFIG_MAP settings that contain the target column names
"""

# Open connection with the workbook and specific sheet
# This is useful for not calling the workbook into memory and allows for parsing
# only the necessary rows/column names
try:
workbook = load_workbook(file_name, read_only=True)

# If multiple sheets, iterate through
sheet_name = [sheet_name] if isinstance(sheet_name, str) else sheet_name

for sheets in sheet_name:
sheet = workbook[sheets]

# Validate that the expected columns are contained within the parsed
# column names of the workbook
if "vario_krig_para" in config_map:
data_columns = [list(row) for row in zip(*sheet.iter_rows(values_only=True))][0]
else:
data_columns = {col.value for col in sheet[1]}

# Error evaluation and print message (if applicable)
if not set(validation_settings.keys()).issubset(set(data_columns)):
missing_columns = set(validation_settings.keys()) - set(data_columns)
raise ValueError(f"Missing columns in the Excel file: {missing_columns}")

# Close connection to the work book
workbook.close()

except Exception as e:
print(f"Error reading file '{str(file_name)}': {e}")


def write_haul_to_transect_key(configuration_dict: dict, verbose: bool):
"""
Function for writing the haul-transect mapping key .xlsx file.
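
For reference, a condensed, hypothetical sketch of the new read path in read_validated_data after this change (the file name, sheet name, and validation_settings subset below are illustrative): columns are renamed via NAME_CONFIG, filtered down to the expected set, checked for missing names, and then coerced to the configured dtypes, replacing the openpyxl-based validate_data_columns pass.

import numpy as np
import pandas as pd

from echopop.core import NAME_CONFIG

# Illustrative subset of CONFIG_MAP validation settings for one sheet
validation_settings = {"haul_num": int, "length": np.float64, "weight": np.float64}

# Read the sheet, translate the headers, and keep only the expected columns
df = pd.read_excel("biodata_length.xlsx", sheet_name="biodata")  # hypothetical file/sheet
df = df.rename(columns=NAME_CONFIG).filter(validation_settings)

# Error evaluation analogous to the check added in load.py
missing_columns = set(validation_settings) - set(df)
if missing_columns:
    raise ValueError(f"Missing columns in the Excel file: {missing_columns}")

# Enforce the configured dtype for each retained column
df = df.apply(lambda col: col.astype(validation_settings.get(col.name, type(col[0]))))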
2 changes: 0 additions & 2 deletions environment.yaml
@@ -14,9 +14,7 @@ dependencies:
# Computational stack
- geopandas
- geopy
- openpyxl
# 9/27/23: Pin to <2 to avoid new warning occurring in several cases
- pandas<2
- PyYAML
- shapely<2
- scipy
1 change: 0 additions & 1 deletion requirements.txt
@@ -7,7 +7,6 @@ traitlets
geopandas
geopy
lmfit
openpyxl>=3.1.3
# 9/27/23: Pin to <2 to avoid new warning occurring in several cases
pandas
python-dateutil
