
Feat: Data Uploader Refactor #365 (#374)
* feat: (partial) adds components for displaying issues, modifies popup

* feat: new issues component working as expected, with errors and warnings split up

* adds issue type to detail, i.e. 'warning name' or 'error name'

* First version of backend refactor

* Cleanup

---------

Co-authored-by: emi-hi <[email protected]>
Co-authored-by: Emily <[email protected]>
3 people authored Jul 25, 2024
1 parent 6424ca9 commit ed70a41
Showing 6 changed files with 305 additions and 321 deletions.
10 changes: 5 additions & 5 deletions django/api/constants/constants.py
@@ -655,11 +655,11 @@ class GoElectricRebatesColumnMapping(Enum):
"sheet_name": "Distribution List - Master",
"preparation_functions": [prepare_go_electric_rebates],
"validation_functions": [
{"error_type": "Phone Error", "function": validate_phone_numbers, "columns": ["Phone Number"], "kwargs": {"indices_offset": 2}},
{"error_type": "Potential Typo", "function": typo_checker, "columns": ["Applicant Name"], "kwargs": {"cutoff": 0.8, "indices_offset": 2}},
{"error_type": "Location Not Found", "function": location_checker, "columns": ["City"], "kwargs": {"indices_offset":2}},
{"error_type": "Invalid Email", "function": email_validator, "columns": ["Email"], "kwargs": {"indices_offset":2, "get_resolver": get_google_resolver}},
{"error_type": "Invalid Value", "function": validate_field_values, "columns": [], "kwargs": {"indices_offset":2, "fields_and_values": GER_VALID_FIELD_VALUES}}
{"function": validate_phone_numbers, "columns": ["Phone Number"], "kwargs": {"indices_offset": 2}},
{"function": typo_checker, "columns": ["Applicant Name"], "kwargs": {"cutoff": 0.8, "indices_offset": 2}},
{"function": location_checker, "columns": ["City"], "kwargs": {"indices_offset":2}},
{"function": email_validator, "columns": ["Email"], "kwargs": {"indices_offset":2, "get_resolver": get_google_resolver}},
{"function": validate_field_values, "columns": [], "kwargs": {"indices_offset":2, "fields_and_values": GER_VALID_FIELD_VALUES}}
]
},
}
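
Under this refactor, registration entries no longer carry a per-entry "error_type" key; each validator now returns its own issue name, description, and severity. As a minimal sketch of that contract (the function name and the negative-amount rule below are illustrative, not part of this commit):

import pandas as pd

def flag_negative_amounts(df, *columns, **kwargs):
    # Hypothetical validator conforming to the refactored contract:
    # return {column: {issue_name: {"Expected Type", "Rows", "Severity"}}}.
    # Assumes the targeted columns hold numeric values.
    offset = kwargs.get("indices_offset", 0)
    result = {}
    for column in columns:
        rows = [
            index + offset
            for index, value in df[column].items()
            if pd.notna(value) and value < 0
        ]
        if rows:
            result[column] = {
                "Negative Amount": {
                    "Expected Type": "Amounts are expected to be zero or positive",
                    "Rows": rows,
                    "Severity": "Error",
                }
            }
    return result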
181 changes: 91 additions & 90 deletions django/api/services/spreadsheet_uploader.py
@@ -39,6 +39,8 @@ def transform_data(
df,
dataset_columns,
column_mapping_enum,
field_types,
model,
preparation_functions=[],
validation_functions=[],
):
@@ -47,45 +49,17 @@
df = df[[col for col in df.columns if col in required_columns]]

missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
if (missing_columns):
raise ValueError(f"Missing columns: {', '.join(missing_columns)}")

for prep_func in preparation_functions:
df = prep_func(df)

validation_errors = {}
for x in validation_functions:
validate = x["function"]
columns = x["columns"]
kwargs = x["kwargs"]
key = x["error_type"]
errors = validate(df, *columns, **kwargs)
if errors:
validation_errors[key] = errors

column_mapping = {col.name: col.value for col in column_mapping_enum}
# Need to use the inverse (keys) for mapping the columns to what the database expects in order to use enums
inverse_column_mapping = {v: k for k, v in column_mapping.items()}
df.rename(columns=inverse_column_mapping, inplace=True)

return df, validation_errors


@transaction.atomic
def load_data(df, model, field_types, replace_data, user, validation_errors):
row_count = 0
records_inserted = 0
errors = []
nullable_fields = get_nullable_fields(model)

# validation_error_rows = get_validation_error_rows(errors) This may be used going forward for validation errors that cannot be overwritten.

if replace_data:
model.objects.all().delete()
errors_and_warnings = {}

for index, row in df.iterrows():
row_dict = row.to_dict()
valid_row = True

for column, value in row_dict.items():
expected_type = field_types.get(column)
@@ -95,11 +69,17 @@ def load_data(df, model, field_types, replace_data, user, validation_errors):
if is_nullable:
row_dict[column] = None
else:
errors.append(f"Row {index + 1}: Has an empty cell where one is expected in '{column}'")
valid_row = False
continue

if expected_type in [int, float, Decimal] and value != None and pd.notna(value):
if column not in errors_and_warnings:
errors_and_warnings[column] = {}
if "Empty Value" not in errors_and_warnings[column]:
errors_and_warnings[column]["Empty Value"] = {
"Expected Type": "Expected value where there isn't one.",
"Rows": [],
"Severity": "Error"
}
errors_and_warnings[column]["Empty Value"]["Rows"].append(index + 1)

if expected_type in [int, float, Decimal] and value is not None and pd.notna(value):
value = str(value).replace(',', '').strip()
try:
if expected_type == int:
@@ -109,39 +89,73 @@
else:
row_dict[column] = float(value)
except ValueError:
errors.append(
f"Row {index + 1}: Unable to convert value to {expected_type.__name__} for '{column}'. Value was '{value}'."
)
valid_row = False
continue

elif not isinstance(row_dict[column], expected_type) and value != "":
errors.append(
f"Row {index + 1}: Incorrect type for '{column}'. Expected {expected_type.__name__}, got {type(row_dict[column]).__name__}."
)
valid_row = False
continue

# if index + 1 in validation_error_rows:
# valid_row = False
# continue

if valid_row:
try:
row_dict["update_user"] = user
model_instance = model(**row_dict)
model_instance.full_clean()
model_instance.save()
records_inserted += 1
except Exception as e:
errors.append(f"Row {index + 1}: {e}")

row_count += 1
if column not in errors_and_warnings:
errors_and_warnings[column] = {}
if "Incorrect Type" not in errors_and_warnings[column]:
errors_and_warnings[column]["Incorrect Type"] = {
"Expected Type": "The following rows contained incorrect value types for the " + column + " column",
"Rows": [],
"Severity": "Error"
}
errors_and_warnings[column]["Incorrect Type"]["Rows"].append(index + 1)

# Check if expected_type is valid before using isinstance
elif expected_type is not None and isinstance(expected_type, type) and not isinstance(row_dict[column], expected_type) and value != "":
if column not in errors_and_warnings:
errors_and_warnings[column] = {}
if "Incorrect Type" not in errors_and_warnings[column]:
errors_and_warnings[column]["Incorrect Type"] = {
"Expected Type": "The following rows contained incorrect value types for the " + column + " column",
"Rows": [],
"Severity": "Error"
}
errors_and_warnings[column]["Incorrect Type"]["Rows"].append(index + 1)

for x in validation_functions:
validate = x["function"]
columns = x["columns"]
kwargs = x["kwargs"]
warnings = validate(df, *columns, **kwargs)

if warnings:
for column, issues in warnings.items():
if column not in errors_and_warnings:
errors_and_warnings[column] = {}
for issue, details in issues.items():
if issue not in errors_and_warnings[column]:
errors_and_warnings[column][issue] = {
"Expected Type": details.get("Expected Type", "Unknown"),
"Rows": details.get("Rows", []),
"Severity": details.get("Severity", "Error")
}
else:
errors_and_warnings[column][issue]["Rows"].extend(details.get("Rows", []))

column_mapping = {col.name: col.value for col in column_mapping_enum}
inverse_column_mapping = {v: k for k, v in column_mapping.items()}
df.rename(columns=inverse_column_mapping, inplace=True)

return df, errors_and_warnings
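
The errors_and_warnings value returned here is a nested dict keyed by column and then by issue name. A hypothetical instance (the column name and row numbers are illustrative):

errors_and_warnings = {
    "Phone Number": {
        "Phone Number Appears Incorrect": {
            "Expected Type": "Ensure phone numbers match the Canadian format (XXX-XXX-XXXX)",
            "Rows": [12, 47],
            "Severity": "Warning",
        }
    }
}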


@transaction.atomic
def load_data(df, model, replace_data, user):
records_inserted = 0

if replace_data:
model.objects.all().delete()

for index, row in df.iterrows():
row_dict = row.to_dict()
row_dict["update_user"] = user

model_instance = model(**row_dict)
model_instance.save()
records_inserted += 1

return {
"row_count": row_count,
"row_count": len(df),
"records_inserted": records_inserted,
"errors": sorted(errors, key=lambda x: int(x.split()[1][:-1])),
}


@@ -157,57 +171,44 @@ def import_from_xls(
user,
preparation_functions=[],
validation_functions=[],
check_for_warnings=False,
check_for_warnings=True,
):
try:
df = extract_data(excel_file, sheet_name, header_row)
df, validation_errors = transform_data(
df, errors_and_warnings = transform_data(
df,
dataset_columns,
column_mapping_enum,
field_types,
model,
preparation_functions,
validation_functions,
)

if check_for_warnings:
## do the error checking

if validation_errors:
if errors_and_warnings:
return {
"success": True,
"message": "We encountered some potential errors in your data. Please choose whether to ignore them and continue inserting data or cancel upload and make edits to the data before reuploading",
"warning": True,
"warnings": validation_errors,
"errors_and_warnings": errors_and_warnings,
}
else:
print('no warnings')

result = load_data(df, model, field_types, replace_data, user, validation_errors)
result = load_data(df, model, replace_data, user, errors_and_warnings)

total_rows = result["row_count"]
inserted_rows = result["records_inserted"]

if result["errors"] and result["records_inserted"] > 0:
return {
"success": True,
"message": f"{inserted_rows} out of {total_rows} rows successfully inserted with some errors encountered.",
"errors": result["errors"],
"rows_processed": result["row_count"],
}
elif len(result["errors"]) > 0:
return {
"success": False,
"message": "Errors encountered with no successful insertions.",
"errors": result["errors"],
"rows_processed": result["row_count"],
}

else:
return {
"success": True,
"message": f"All {inserted_rows} records successfully inserted out of {total_rows}.",
"rows_processed": result["row_count"],
return {
"success": True,
"message": f"All {inserted_rows} records successfully inserted out of {total_rows}.",
"rows_processed": result["row_count"],
}

except Exception as error:
traceback.print_exc()
error_msg = f"Unexpected error: {str(error)}"
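The full parameter list and the except branch of import_from_xls are truncated in this diff, but the response dicts above are shown in full. A caller-side sketch, assuming the truncated failure path also returns a dict with "success" and "message" keys:

def summarize_upload(result):
    # Hypothetical helper, not part of this commit: condense an
    # import_from_xls response into a one-line status string.
    if result.get("warning"):
        flagged = sum(
            len(issue["Rows"])
            for column_issues in result["errors_and_warnings"].values()
            for issue in column_issues.values()
        )
        return f"Upload paused: {flagged} row flags awaiting user confirmation."
    if result.get("success"):
        return f"{result['message']} ({result.get('rows_processed', 0)} rows processed)"
    return f"Upload failed: {result.get('message', 'unknown error')}"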
58 changes: 38 additions & 20 deletions django/api/services/spreadsheet_uploader_prep.py
@@ -208,7 +208,7 @@ def typo_checker(df, *columns, **kwargs):
matches = dl.get_close_matches(
value,
unique_vals.difference(singleton),
cutoff = kwargs["cutoff"]
cutoff=kwargs["cutoff"]
)
if matches:
value_indices = map_of_values_to_indices[value]
@@ -219,21 +219,16 @@
match_indices = map_of_values_to_indices[match]
indices.extend(match_indices)
if indices:
result[column] = sorted(list(set(indices)))
result[column] = {
"Similar Values Detected": {
"Expected Type": "We detected applicant names that sound very similar. If these names refer to the same person/entity, please replace the applicant names in your dataset to the preferred spelling to ensure consistency",
"Rows": sorted(list(set(indices))),
"Severity": "Warning"
}
}
return result
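
Given the registration in constants.py, a hypothetical call (data values are illustrative; the module path is inferred from the file path in this diff):

import pandas as pd
from api.services.spreadsheet_uploader_prep import typo_checker

df = pd.DataFrame({"Applicant Name": ["Jane Smyth", "Jane Smith", "Alex Chen"]})
issues = typo_checker(df, "Applicant Name", cutoff=0.8, indices_offset=2)
# Assuming the two similar names clear the 0.8 cutoff, expect roughly:
# {"Applicant Name": {"Similar Values Detected": {..., "Rows": [2, 3], "Severity": "Warning"}}}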


def get_validation_error_rows(errors):
row_numbers = set()
for error in errors:
try:
row_number = int(error.split()[1][:-1])
row_numbers.add(row_number)
except (IndexError, ValueError):
continue
return row_numbers


def validate_phone_numbers(df, *columns, **kwargs):
result = {}
for column in columns:
@@ -244,7 +239,13 @@
if formatted_number == '' or len(formatted_number) != 10 or int(formatted_number[:3]) not in AREA_CODES:
indices.append(index + kwargs.get("indices_offset", 0))
if indices:
result[column] = indices
result[column] = {
"Phone Number Appears Incorrect": {
"Expected Type": "Ensure phone numbers match the Canadian format (XXX-XXX-XXXX)",
"Rows": indices,
"Severity": "Warning"
}
}
return result
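
A hypothetical call, assuming the truncated formatting step strips non-digit characters and that AREA_CODES includes valid Canadian codes such as 604:

import pandas as pd
from api.services.spreadsheet_uploader_prep import validate_phone_numbers

df = pd.DataFrame({"Phone Number": ["604-555-0199", "555-01"]})
issues = validate_phone_numbers(df, "Phone Number", indices_offset=2)
# "555-01" is too short, so row 3 (index 1 + indices_offset 2) is flagged
# under "Phone Number Appears Incorrect" with severity "Warning".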


@@ -265,8 +266,13 @@ def location_checker(df, *columns, **kwargs):
indices_to_add = map_of_values_to_indices[name]
indices.extend(indices_to_add)
if indices:
indices.sort()
result[column] = indices
result[column] = {
"Unrecognized City Names": {
"Expected Type": "The following city names are not in the list of geographic names. Please double check that these places exist or have correct spelling and adjust your dataset accordingly.",
"Rows": sorted(list(set(indices))),
"Severity": "Warning"
}
}
return result


@@ -285,11 +291,17 @@ def email_validator(df, *columns, **kwargs):
except EmailNotValidError:
indices.append(index + kwargs.get("indices_offset", 0))
if indices:
result[column] = indices
result[column] = {
"Possible Errors in Email Addresses": {
"Expected Type": "Verify email addresses are valid",
"Rows": indices,
"Severity": "Warning"
}
}
return result

def validate_field_values(df, *columns, **kwargs):

def validate_field_values(df, *columns, **kwargs):
allowed_values = kwargs.get("fields_and_values")

result = {}
@@ -298,9 +310,15 @@
indices = []
series = df[column]
for index, value in series.items():
if str(value) not in allowed_values[column] and value != '' and value != None and not pd.isna(value):
if str(value) not in allowed_values[column] and value != '' and value is not None and not pd.isna(value):
indices.append(index + kwargs.get("indices_offset", 0))
if indices:
result[column] = indices
result[column] = {
"Invalid Values": {
"Expected Type": "The following rows only allow specific values",
"Rows": indices,
"Severity": "Error"
}
}

return result
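
The inner loop header of validate_field_values is collapsed in this diff; assuming it iterates the keys of fields_and_values when "columns" is empty (consistent with the empty columns list in constants.py), a hypothetical call:

import pandas as pd
from api.services.spreadsheet_uploader_prep import validate_field_values

df = pd.DataFrame({"Approved": ["Yes", "Maybe", "No"]})
issues = validate_field_values(df, fields_and_values={"Approved": ["Yes", "No"]}, indices_offset=2)
# "Maybe" is not an allowed value, so row 3 (index 1 + indices_offset 2) is
# reported under "Invalid Values" with severity "Error".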
