Merge pull request OSOceanAcoustics#268 from brandynlucca/biological_data_column_naming

Translation dictionary for data column names
brandynlucca authored Sep 13, 2024
2 parents 7e18a9a + 125d63a commit 819cf80
Showing 4 changed files with 54 additions and 75 deletions.
39 changes: 35 additions & 4 deletions echopop/core.py
@@ -72,6 +72,37 @@
},
}

# Name configuration dictionary
NAME_CONFIG = {
"Age": "age",
"Cell portion": "fraction_cell_in_polygon",
"Frequency": "length_count",
"haul": "haul_num",
"haul end": "haul_end",
"haul start": "haul_start",
"Haul": "haul_num",
"Latitude": "latitude",
"Latitude (upper limit)": "northlimit_latitude",
"Latitude of centroid": "centroid_latitude",
"Length": "length",
"Longitude": "longitude",
"Longitude of centroid": "centroid_longitude",
"strata": "stratum_num",
"Sex": "sex",
"Ship": "ship_id",
"Spacing": "transect_spacing",
"Species_Code": "species_id",
"Species_Name": "species_name",
"Strata Index": "stratum_num",
"Stratum": "stratum_num",
"Transect": "transect_num",
"VL start": "vessel_log_start",
"VL end": "vessel_log_end",
"wt": "fraction_hake",
"Weight": "weight",
"Weight_In-Haul": "haul_weight",
}

# `Survey` object data structure
CONFIG_MAP = {
"biological": {
@@ -115,14 +146,14 @@
"geo_strata": {
"stratum_num": int,
"northlimit_latitude": np.float64,
"haul start": int,
"haul end": int,
"haul_start": int,
"haul_end": int,
},
"inpfc_strata": {
"stratum_num": int,
"northlimit_latitude": np.float64,
"haul start": int,
"haul end": int,
"haul_start": int,
"haul_end": int,
},
},
"NASC": {
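
For context, a minimal sketch (not part of the commit) of how the new translation dictionary is intended to be used: raw Excel-style headers are mapped onto the snake_case names used throughout echopop with a plain pandas rename. The DataFrame below is hypothetical.

import pandas as pd

from echopop.core import NAME_CONFIG

# Hypothetical biological data with the original Excel-style headers
raw = pd.DataFrame(
    {
        "Haul": [1, 2],
        "Species_Code": [22500, 22500],
        "Length": [34.0, 41.0],
        "Weight": [0.35, 0.61],
    }
)

# Translate the column names via the dictionary added to echopop/core.py
renamed = raw.rename(columns=NAME_CONFIG)
print(renamed.columns.tolist())  # ['haul_num', 'species_id', 'length', 'weight']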
87 changes: 19 additions & 68 deletions echopop/utils/load.py
@@ -5,7 +5,6 @@
import numpy as np
import pandas as pd
import yaml
from openpyxl import load_workbook

from ..core import (
BIODATA_HAUL_MAP,
@@ -14,6 +13,7 @@
CONFIG_MAP,
DATA_STRUCTURE,
LAYER_NAME_MAP,
NAME_CONFIG,
)
from .data_structure_utils import map_imported_datasets

@@ -199,9 +199,6 @@ def load_dataset(
else:
config_map[2] = region_id

# Validate column names of this iterated file
validate_data_columns(file_name, sheet_name, config_map, validation_settings)

# Validate datatypes within dataset and make appropriate changes to dtypes
# ---- This first enforces the correct dtype for each imported column
# ---- This then assigns the imported data to the correct class attribute
@@ -237,12 +234,6 @@
# Update configuration key map
config_map = [dataset, datalayer]

# Validate datatypes within dataset and make appropriate changes to dtypes
# (if necessary)
# ---- This first enforces the correct dtype for each imported column
# ---- This then assigns the imported data to the correct class attribute
validate_data_columns(file_name, sheets, config_map, validation_settings)

# Read in data and add to `Survey` object
read_validated_data(
input_dict,
@@ -296,11 +287,14 @@ def read_validated_data(
df_initial = df_initial.drop(0)

# Slice only the columns that are relevant to the echopop module functionality
valid_columns = list(set(validation_settings.keys()).intersection(set(df_initial.columns)))
df_filtered = df_initial[valid_columns]
df_filtered = df_initial.filter(validation_settings)

# Ensure the order of columns in df_filtered matches df_initial
df_filtered = df_filtered[df_initial.columns]
# Error evaluation and print message (if applicable)
if not set(validation_settings).issubset(set(df_filtered)):
missing_columns = set(validation_settings.keys()) - set(df_filtered)
raise ValueError(
f"Missing kriging/variogram parameters in the Excel file: {missing_columns}"
)

# Apply data types from validation_settings to the filtered DataFrame
df = df_filtered.apply(
@@ -311,7 +305,15 @@

else:
# Read Excel file into memory -- this only reads in the required columns
df = pd.read_excel(file_name, sheet_name=sheet_name, usecols=validation_settings.keys())
# df = pd.read_excel(file_name, sheet_name=sheet_name, usecols=validation_settings.keys())
df = pd.read_excel(file_name, sheet_name=sheet_name)
# ---- Rename the columns, if needed, and then filter them
df = df.rename(columns=NAME_CONFIG).filter(validation_settings)

# Error evaluation and print message (if applicable)
if not set(validation_settings).issubset(set(df)):
missing_columns = set(validation_settings.keys()) - set(df)
raise ValueError(f"Missing columns in the Excel file: {missing_columns}")

# Apply data types from validation_settings to the filtered DataFrame
df = df.apply(lambda col: col.astype(validation_settings.get(col.name, type(col[0]))))
@@ -373,62 +375,11 @@ def read_validated_data(
input_dict["acoustics"]["nasc_df"][column_to_add] = df[column_to_add]
else:
raise ValueError(
"""Unexpected data attribute structure. Check API settings located in"""
"""the configuration YAML and core.py"""
"Unexpected data attribute structure. Check the settings in "
"the configuration YAML and core.py."
)


def validate_data_columns(
file_name: Path, sheet_name: str, config_map: list, validation_settings: dict
):
"""
Opens a virtual instance of each .xlsx file to validate the presence
of require data column/variable names
Parameters
----------
file_name: Path
File path of data
sheet_name: str
Name of Excel sheet containing data
config_map: list
A list parsed from the file name that indicates how data attributes
within `self` are organized
validation_settings: dict
The subset CONFIG_MAP settings that contain the target column names
"""

# Open connection with the workbook and specific sheet
# This is useful for not calling the workbook into memory and allows for parsing
# only the necessary rows/column names
try:
workbook = load_workbook(file_name, read_only=True)

# If multiple sheets, iterate through
sheet_name = [sheet_name] if isinstance(sheet_name, str) else sheet_name

for sheets in sheet_name:
sheet = workbook[sheets]

# Validate that the expected columns are contained within the parsed
# column names of the workbook
if "vario_krig_para" in config_map:
data_columns = [list(row) for row in zip(*sheet.iter_rows(values_only=True))][0]
else:
data_columns = {col.value for col in sheet[1]}

# Error evaluation and print message (if applicable)
if not set(validation_settings.keys()).issubset(set(data_columns)):
missing_columns = set(validation_settings.keys()) - set(data_columns)
raise ValueError(f"Missing columns in the Excel file: {missing_columns}")

# Close connection to the work book
workbook.close()

except Exception as e:
print(f"Error reading file '{str(file_name)}': {e}")


def write_haul_to_transect_key(configuration_dict: dict, verbose: bool):
"""
Function for writing the haul-transect mapping key .xlsx file.
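
For reference, a condensed, hypothetical sketch of the new read path in read_validated_data after this change (the file name, sheet name, and validation_settings subset below are illustrative): columns are renamed via NAME_CONFIG, filtered down to the expected set, checked for missing names, and then coerced to the configured dtypes, replacing the openpyxl-based validate_data_columns pass.

import numpy as np
import pandas as pd

from echopop.core import NAME_CONFIG

# Illustrative subset of CONFIG_MAP validation settings for one sheet
validation_settings = {"haul_num": int, "length": np.float64, "weight": np.float64}

# Read the sheet, translate the headers, and keep only the expected columns
df = pd.read_excel("biodata_length.xlsx", sheet_name="biodata")  # hypothetical file/sheet
df = df.rename(columns=NAME_CONFIG).filter(validation_settings)

# Error evaluation analogous to the check added in load.py
missing_columns = set(validation_settings) - set(df)
if missing_columns:
    raise ValueError(f"Missing columns in the Excel file: {missing_columns}")

# Enforce the configured dtype for each retained column
df = df.apply(lambda col: col.astype(validation_settings.get(col.name, type(col[0]))))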
2 changes: 0 additions & 2 deletions environment.yaml
@@ -14,9 +14,7 @@ dependencies:
# Computational stack
- geopandas
- geopy
- openpyxl
# 9/27/23: Pin to <2 to avoid new warning occurring in several cases
- pandas<2
- PyYAML
- shapely<2
- scipy
1 change: 0 additions & 1 deletion requirements.txt
@@ -7,7 +7,6 @@ traitlets
geopandas
geopy
lmfit
openpyxl>=3.1.3
# 9/27/23: Pin to <2 to avoid new warning occurring in several cases
pandas
python-dateutil
