
Feat: Data Uploader Refactor #365 (#374)
* feat: (partial) adds components for displaying issues, modifies popup

* feat: new issues component working as expected, with errors and warnings split up

* adds issue type to detail, i.e. 'warning name' or 'error name'

* First version of backend refactor

* Cleanup

---------

Co-authored-by: emi-hi <[email protected]>
Co-authored-by: Emily <[email protected]>
3 people authored Jul 25, 2024
1 parent 6424ca9 commit ed70a41
Showing 6 changed files with 305 additions and 321 deletions.
10 changes: 5 additions & 5 deletions django/api/constants/constants.py
@@ -655,11 +655,11 @@ class GoElectricRebatesColumnMapping(Enum):
"sheet_name": "Distribution List - Master",
"preparation_functions": [prepare_go_electric_rebates],
"validation_functions": [
{"error_type": "Phone Error", "function": validate_phone_numbers, "columns": ["Phone Number"], "kwargs": {"indices_offset": 2}},
{"error_type": "Potential Typo", "function": typo_checker, "columns": ["Applicant Name"], "kwargs": {"cutoff": 0.8, "indices_offset": 2}},
{"error_type": "Location Not Found", "function": location_checker, "columns": ["City"], "kwargs": {"indices_offset":2}},
{"error_type": "Invalid Email", "function": email_validator, "columns": ["Email"], "kwargs": {"indices_offset":2, "get_resolver": get_google_resolver}},
{"error_type": "Invalid Value", "function": validate_field_values, "columns": [], "kwargs": {"indices_offset":2, "fields_and_values": GER_VALID_FIELD_VALUES}}
{"function": validate_phone_numbers, "columns": ["Phone Number"], "kwargs": {"indices_offset": 2}},
{"function": typo_checker, "columns": ["Applicant Name"], "kwargs": {"cutoff": 0.8, "indices_offset": 2}},
{"function": location_checker, "columns": ["City"], "kwargs": {"indices_offset":2}},
{"function": email_validator, "columns": ["Email"], "kwargs": {"indices_offset":2, "get_resolver": get_google_resolver}},
{"function": validate_field_values, "columns": [], "kwargs": {"indices_offset":2, "fields_and_values": GER_VALID_FIELD_VALUES}}
]
},
}
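
Under this refactor, registration entries no longer carry a per-entry "error_type" key; each validator now returns its own issue name, description, and severity. As a minimal sketch of that contract (the function name and the negative-amount rule below are illustrative, not part of this commit):

import pandas as pd

def flag_negative_amounts(df, *columns, **kwargs):
    # Hypothetical validator conforming to the refactored contract:
    # return {column: {issue_name: {"Expected Type", "Rows", "Severity"}}}.
    # Assumes the targeted columns hold numeric values.
    offset = kwargs.get("indices_offset", 0)
    result = {}
    for column in columns:
        rows = [
            index + offset
            for index, value in df[column].items()
            if pd.notna(value) and value < 0
        ]
        if rows:
            result[column] = {
                "Negative Amount": {
                    "Expected Type": "Amounts are expected to be zero or positive",
                    "Rows": rows,
                    "Severity": "Error",
                }
            }
    return result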
181 changes: 91 additions & 90 deletions django/api/services/spreadsheet_uploader.py
@@ -39,6 +39,8 @@ def transform_data(
df,
dataset_columns,
column_mapping_enum,
field_types,
model,
preparation_functions=[],
validation_functions=[],
):
@@ -47,45 +49,17 @@
df = df[[col for col in df.columns if col in required_columns]]

missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
if (missing_columns):
raise ValueError(f"Missing columns: {', '.join(missing_columns)}")

for prep_func in preparation_functions:
df = prep_func(df)

validation_errors = {}
for x in validation_functions:
validate = x["function"]
columns = x["columns"]
kwargs = x["kwargs"]
key = x["error_type"]
errors = validate(df, *columns, **kwargs)
if errors:
validation_errors[key] = errors

column_mapping = {col.name: col.value for col in column_mapping_enum}
# Need to use the inverse (keys) for mapping the columns to what the database expects in order to use enums
inverse_column_mapping = {v: k for k, v in column_mapping.items()}
df.rename(columns=inverse_column_mapping, inplace=True)

return df, validation_errors


@transaction.atomic
def load_data(df, model, field_types, replace_data, user, validation_errors):
row_count = 0
records_inserted = 0
errors = []
nullable_fields = get_nullable_fields(model)

# validation_error_rows = get_validation_error_rows(errors) This may be used going forward for validation errors that cannot be overwritten.

if replace_data:
model.objects.all().delete()
errors_and_warnings = {}

for index, row in df.iterrows():
row_dict = row.to_dict()
valid_row = True

for column, value in row_dict.items():
expected_type = field_types.get(column)
@@ -95,11 +69,17 @@ def load_data(df, model, field_types, replace_data, user, validation_errors):
if is_nullable:
row_dict[column] = None
else:
errors.append(f"Row {index + 1}: Has an empty cell where one is expected in '{column}'")
valid_row = False
continue

if expected_type in [int, float, Decimal] and value != None and pd.notna(value):
if column not in errors_and_warnings:
errors_and_warnings[column] = {}
if "Empty Value" not in errors_and_warnings[column]:
errors_and_warnings[column]["Empty Value"] = {
"Expected Type": "Expected value where there isn't one.",
"Rows": [],
"Severity": "Error"
}
errors_and_warnings[column]["Empty Value"]["Rows"].append(index + 1)

if expected_type in [int, float, Decimal] and value is not None and pd.notna(value):
value = str(value).replace(',', '').strip()
try:
if expected_type == int:
@@ -109,39 +89,73 @@
else:
row_dict[column] = float(value)
except ValueError:
errors.append(
f"Row {index + 1}: Unable to convert value to {expected_type.__name__} for '{column}'. Value was '{value}'."
)
valid_row = False
continue

elif not isinstance(row_dict[column], expected_type) and value != "":
errors.append(
f"Row {index + 1}: Incorrect type for '{column}'. Expected {expected_type.__name__}, got {type(row_dict[column]).__name__}."
)
valid_row = False
continue

# if index + 1 in validation_error_rows:
# valid_row = False
# continue

if valid_row:
try:
row_dict["update_user"] = user
model_instance = model(**row_dict)
model_instance.full_clean()
model_instance.save()
records_inserted += 1
except Exception as e:
errors.append(f"Row {index + 1}: {e}")

row_count += 1
if column not in errors_and_warnings:
errors_and_warnings[column] = {}
if "Incorrect Type" not in errors_and_warnings[column]:
errors_and_warnings[column]["Incorrect Type"] = {
"Expected Type": "The following rows contained incorrect value types for the " + column + " column",
"Rows": [],
"Severity": "Error"
}
errors_and_warnings[column]["Incorrect Type"]["Rows"].append(index + 1)

# Check if expected_type is valid before using isinstance
elif expected_type is not None and isinstance(expected_type, type) and not isinstance(row_dict[column], expected_type) and value != "":
if column not in errors_and_warnings:
errors_and_warnings[column] = {}
if "Incorrect Type" not in errors_and_warnings[column]:
errors_and_warnings[column]["Incorrect Type"] = {
"Expected Type": "The following rows contained incorrect value types for the " + column + " column",
"Rows": [],
"Severity": "Error"
}
errors_and_warnings[column]["Incorrect Type"]["Rows"].append(index + 1)

for x in validation_functions:
validate = x["function"]
columns = x["columns"]
kwargs = x["kwargs"]
warnings = validate(df, *columns, **kwargs)

if warnings:
for column, issues in warnings.items():
if column not in errors_and_warnings:
errors_and_warnings[column] = {}
for issue, details in issues.items():
if issue not in errors_and_warnings[column]:
errors_and_warnings[column][issue] = {
"Expected Type": details.get("Expected Type", "Unknown"),
"Rows": details.get("Rows", []),
"Severity": details.get("Severity", "Error")
}
else:
errors_and_warnings[column][issue]["Rows"].extend(details.get("Rows", []))

column_mapping = {col.name: col.value for col in column_mapping_enum}
inverse_column_mapping = {v: k for k, v in column_mapping.items()}
df.rename(columns=inverse_column_mapping, inplace=True)

return df, errors_and_warnings
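
The errors_and_warnings value returned here is a nested dict keyed by column and then by issue name. A hypothetical instance (the column name and row numbers are illustrative):

errors_and_warnings = {
    "Phone Number": {
        "Phone Number Appears Incorrect": {
            "Expected Type": "Ensure phone numbers match the Canadian format (XXX-XXX-XXXX)",
            "Rows": [12, 47],
            "Severity": "Warning",
        }
    }
}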


@transaction.atomic
def load_data(df, model, replace_data, user):
records_inserted = 0

if replace_data:
model.objects.all().delete()

for index, row in df.iterrows():
row_dict = row.to_dict()
row_dict["update_user"] = user

model_instance = model(**row_dict)
model_instance.save()
records_inserted += 1

return {
"row_count": row_count,
"row_count": len(df),
"records_inserted": records_inserted,
"errors": sorted(errors, key=lambda x: int(x.split()[1][:-1])),
}


@@ -157,57 +171,44 @@ def import_from_xls(
user,
preparation_functions=[],
validation_functions=[],
check_for_warnings=False,
check_for_warnings=True,
):
try:
df = extract_data(excel_file, sheet_name, header_row)
df, validation_errors = transform_data(
df, errors_and_warnings = transform_data(
df,
dataset_columns,
column_mapping_enum,
field_types,
model,
preparation_functions,
validation_functions,
)

if check_for_warnings:
## do the error checking

if validation_errors:
if errors_and_warnings:
return {
"success": True,
"message": "We encountered some potential errors in your data. Please choose whether to ignore them and continue inserting data or cancel upload and make edits to the data before reuploading",
"warning": True,
"warnings": validation_errors,
"errors_and_warnings": errors_and_warnings,
}
else:
print('no warnings')

result = load_data(df, model, field_types, replace_data, user, validation_errors)
result = load_data(df, model, replace_data, user, errors_and_warnings)

total_rows = result["row_count"]
inserted_rows = result["records_inserted"]

if result["errors"] and result["records_inserted"] > 0:
return {
"success": True,
"message": f"{inserted_rows} out of {total_rows} rows successfully inserted with some errors encountered.",
"errors": result["errors"],
"rows_processed": result["row_count"],
}
elif len(result["errors"]) > 0:
return {
"success": False,
"message": "Errors encountered with no successful insertions.",
"errors": result["errors"],
"rows_processed": result["row_count"],
}

else:
return {
"success": True,
"message": f"All {inserted_rows} records successfully inserted out of {total_rows}.",
"rows_processed": result["row_count"],
return {
"success": True,
"message": f"All {inserted_rows} records successfully inserted out of {total_rows}.",
"rows_processed": result["row_count"],
}

except Exception as error:
traceback.print_exc()
error_msg = f"Unexpected error: {str(error)}"
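The full parameter list and the except branch of import_from_xls are truncated in this diff, but the response dicts above are shown in full. A caller-side sketch, assuming the truncated failure path also returns a dict with "success" and "message" keys:

def summarize_upload(result):
    # Hypothetical helper, not part of this commit: condense an
    # import_from_xls response into a one-line status string.
    if result.get("warning"):
        flagged = sum(
            len(issue["Rows"])
            for column_issues in result["errors_and_warnings"].values()
            for issue in column_issues.values()
        )
        return f"Upload paused: {flagged} row flags awaiting user confirmation."
    if result.get("success"):
        return f"{result['message']} ({result.get('rows_processed', 0)} rows processed)"
    return f"Upload failed: {result.get('message', 'unknown error')}"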
58 changes: 38 additions & 20 deletions django/api/services/spreadsheet_uploader_prep.py
@@ -208,7 +208,7 @@ def typo_checker(df, *columns, **kwargs):
matches = dl.get_close_matches(
value,
unique_vals.difference(singleton),
cutoff = kwargs["cutoff"]
cutoff=kwargs["cutoff"]
)
if matches:
value_indices = map_of_values_to_indices[value]
@@ -219,21 +219,16 @@
match_indices = map_of_values_to_indices[match]
indices.extend(match_indices)
if indices:
result[column] = sorted(list(set(indices)))
result[column] = {
"Similar Values Detected": {
"Expected Type": "We detected applicant names that sound very similar. If these names refer to the same person/entity, please replace the applicant names in your dataset to the preferred spelling to ensure consistency",
"Rows": sorted(list(set(indices))),
"Severity": "Warning"
}
}
return result
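
Given the registration in constants.py, a hypothetical call (data values are illustrative; the module path is inferred from the file path in this diff):

import pandas as pd
from api.services.spreadsheet_uploader_prep import typo_checker

df = pd.DataFrame({"Applicant Name": ["Jane Smyth", "Jane Smith", "Alex Chen"]})
issues = typo_checker(df, "Applicant Name", cutoff=0.8, indices_offset=2)
# Assuming the two similar names clear the 0.8 cutoff, expect roughly:
# {"Applicant Name": {"Similar Values Detected": {..., "Rows": [2, 3], "Severity": "Warning"}}}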


def get_validation_error_rows(errors):
row_numbers = set()
for error in errors:
try:
row_number = int(error.split()[1][:-1])
row_numbers.add(row_number)
except (IndexError, ValueError):
continue
return row_numbers


def validate_phone_numbers(df, *columns, **kwargs):
result = {}
for column in columns:
@@ -244,7 +239,13 @@
if formatted_number == '' or len(formatted_number) != 10 or int(formatted_number[:3]) not in AREA_CODES:
indices.append(index + kwargs.get("indices_offset", 0))
if indices:
result[column] = indices
result[column] = {
"Phone Number Appears Incorrect": {
"Expected Type": "Ensure phone numbers match the Canadian format (XXX-XXX-XXXX)",
"Rows": indices,
"Severity": "Warning"
}
}
return result
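
A hypothetical call, assuming the truncated formatting step strips non-digit characters and that AREA_CODES includes valid Canadian codes such as 604:

import pandas as pd
from api.services.spreadsheet_uploader_prep import validate_phone_numbers

df = pd.DataFrame({"Phone Number": ["604-555-0199", "555-01"]})
issues = validate_phone_numbers(df, "Phone Number", indices_offset=2)
# "555-01" is too short, so row 3 (index 1 + indices_offset 2) is flagged
# under "Phone Number Appears Incorrect" with severity "Warning".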


@@ -265,8 +266,13 @@ def location_checker(df, *columns, **kwargs):
indices_to_add = map_of_values_to_indices[name]
indices.extend(indices_to_add)
if indices:
indices.sort()
result[column] = indices
result[column] = {
"Unrecognized City Names": {
"Expected Type": "The following city names are not in the list of geographic names. Please double check that these places exist or have correct spelling and adjust your dataset accordingly.",
"Rows": sorted(list(set(indices))),
"Severity": "Warning"
}
}
return result


@@ -285,11 +291,17 @@ def email_validator(df, *columns, **kwargs):
except EmailNotValidError:
indices.append(index + kwargs.get("indices_offset", 0))
if indices:
result[column] = indices
result[column] = {
"Possible Errors in Email Addresses": {
"Expected Type": "Verify email addresses are valid",
"Rows": indices,
"Severity": "Warning"
}
}
return result

def validate_field_values(df, *columns, **kwargs):

def validate_field_values(df, *columns, **kwargs):
allowed_values = kwargs.get("fields_and_values")

result = {}
@@ -298,9 +310,15 @@
indices = []
series = df[column]
for index, value in series.items():
if str(value) not in allowed_values[column] and value != '' and value != None and not pd.isna(value):
if str(value) not in allowed_values[column] and value != '' and value is not None and not pd.isna(value):
indices.append(index + kwargs.get("indices_offset", 0))
if indices:
result[column] = indices
result[column] = {
"Invalid Values": {
"Expected Type": "The following rows only allow specific values",
"Rows": indices,
"Severity": "Error"
}
}

return result
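
The inner loop header of validate_field_values is collapsed in this diff; assuming it iterates the keys of fields_and_values when "columns" is empty (consistent with the empty columns list in constants.py), a hypothetical call:

import pandas as pd
from api.services.spreadsheet_uploader_prep import validate_field_values

df = pd.DataFrame({"Approved": ["Yes", "Maybe", "No"]})
issues = validate_field_values(df, fields_and_values={"Approved": ["Yes", "No"]}, indices_offset=2)
# "Maybe" is not an allowed value, so row 3 (index 1 + indices_offset 2) is
# reported under "Invalid Values" with severity "Error".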
