[IMPORT] Feat context parameter import #3277

Draft · wants to merge 15 commits into base: feat/import-monitorings
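The core of the change is visible throughout the diff below: each `fieldmapping` entry moves from a bare source-column value to an object carrying `column_src` and/or `default_value`. A minimal sketch of the two shapes (field and column names are illustrative):

```python
# Old format: the mapped value is the source column itself
# (a string, or a list of strings for multi-source fields).
old_fieldmapping = {
    "date_min": "my_date_column",
    "additional_data": ["extra_col_1", "extra_col_2"],
}

# New format: an object holding the source column and/or a default
# value applied when the column is absent or a cell is empty.
new_fieldmapping = {
    "date_min": {"column_src": "my_date_column"},
    "additional_data": {"column_src": ["extra_col_1", "extra_col_2"]},
    "count_min": {"column_src": "nb", "default_value": 1},
}
```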
20 changes: 16 additions & 4 deletions backend/geonature/core/gn_synthese/imports/actions.py
@@ -88,7 +88,7 @@ def check_transient_data(task, logger, imprt: TImports):
     selected_fields = {
         field_name: fields[field_name]
         for field_name, source_field in imprt.fieldmapping.items()
-        if source_field in imprt.columns
+        if source_field.get("column_src", None) in imprt.columns
     }
     init_rows_validity(imprt)
     task.update_state(state="PROGRESS", meta={"progress": 0.05})
@@ -218,7 +218,15 @@ def update_batch_progress(batch, step):
         do_nomenclatures_mapping(
             imprt,
             entity,
-            selected_fields,
+            {
+                field_name: fields[field_name]
+                for field_name, mapping in imprt.fieldmapping.items()
+                if field_name in fields
+                and (
+                    mapping.get("column_src", None) in imprt.columns
+                    or mapping.get("default_value") is not None
+                )
+            },
             fill_with_defaults=current_app.config["IMPORT"][
                 "FILL_MISSING_NOMENCLATURE_WITH_DEFAULT_VALUE"
             ],
@@ -339,11 +347,15 @@ def import_data_to_destination(imprt: TImports) -> None:
         if field_name not in fields:  # not a destination field
             continue
         field = fields[field_name]
+        column_src = source_field.get("column_src", None)
         if field.multi:
-            if not set(source_field).isdisjoint(imprt.columns):
+            if not set(column_src).isdisjoint(imprt.columns):
                 insert_fields |= {field}
         else:
-            if source_field in imprt.columns:
+            if (
+                column_src in imprt.columns
+                or source_field.get("default_value", None) is not None
+            ):
                 insert_fields |= {field}

     insert_fields -= {fields["unique_dataset_id"]}  # Column only used for filling `id_dataset`
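Reading the new comprehension passed to `do_nomenclatures_mapping`: a destination field is now retained either because its `column_src` exists in the uploaded file or because the mapping supplies a `default_value`. A toy run of that filter, assuming the `fields`, `columns` and mapping below:

```python
fields = {"date_min": ..., "count_min": ..., "id_nomenclature_sex": ...}  # BibFields instances
columns = ["date"]  # columns actually present in the uploaded file
fieldmapping = {
    "date_min": {"column_src": "date"},                   # column present -> kept
    "count_min": {"column_src": "nb"},                    # column absent, no default -> dropped
    "id_nomenclature_sex": {"default_value": "Femelle"},  # no column, but a default -> kept
}

selected = {
    name: fields[name]
    for name, mapping in fieldmapping.items()
    if name in fields
    and (mapping.get("column_src", None) in columns or mapping.get("default_value") is not None)
}
assert set(selected) == {"date_min", "id_nomenclature_sex"}
```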
6 changes: 3 additions & 3 deletions backend/geonature/core/imports/checks/dataframe/utils.py
@@ -7,7 +7,7 @@

 from geonature.utils.env import db

-from geonature.core.imports.models import ImportUserError, ImportUserErrorType
+from geonature.core.imports.models import ImportUserError, ImportUserErrorType, TImports
 from geonature.core.imports.utils import generated_fields

@@ -101,7 +101,7 @@ def __error_replace(*args, **kwargs):
     return _error_replace


-def report_error(imprt, entity, df, error):
+def report_error(imprt: TImports, entity, df, error):
     """
     Reports an error found in the dataframe, updates the validity column and insert
     the error in the `t_user_errors` table.
@@ -147,7 +147,7 @@ def report_error(imprt, entity, df, error):
         # f'{error_type.name}' # FIXME comment
         ordered_invalid_rows = sorted(invalid_rows["line_no"])
         column = generated_fields.get(error["column"], error["column"])
-        column = imprt.fieldmapping.get(column, column)
+        column = imprt.fieldmapping.get(column, {}).get("column_src", column)
         # If an error for same import, same column and of the same type already exists,
         # we concat existing erroneous rows with current rows.
         stmt = pg_insert(ImportUserError).values(
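With mapping values now dicts, resolving the user-facing column name for an error report takes a second `.get`: the dict lookup, then `column_src`, falling back to the destination name. A sketch with toy values:

```python
fieldmapping = {"date_min": {"column_src": "date"}}

def user_facing_column(column: str) -> str:
    # Mapped field -> report the column name from the user's file;
    # unmapped or default-only field -> keep the destination name.
    return fieldmapping.get(column, {}).get("column_src", column)

assert user_facing_column("date_min") == "date"
assert user_facing_column("altitude_min") == "altitude_min"
```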
16 changes: 9 additions & 7 deletions backend/geonature/core/imports/checks/sql/core.py
@@ -36,9 +36,10 @@ def init_rows_validity(imprt: TImports, dataset_name_field: str = "id_dataset"):
     # as rows with multi-entity field only will raise an ORPHAN_ROW error
     selected_fields_names = []
     for field_name, source_field in imprt.fieldmapping.items():
-        if type(source_field) == list:
-            selected_fields_names.extend(set(source_field) & set(imprt.columns))
-        elif source_field in imprt.columns:
+        column_src = source_field.get("column_src", None)
+        if type(column_src) == list:
+            selected_fields_names.extend(set(column_src) & set(imprt.columns))
+        elif column_src in imprt.columns:
             selected_fields_names.append(field_name)
     for entity in entities:
         # Select fields associated to this entity *and only to this entity*
@@ -64,15 +65,16 @@ def init_rows_validity(imprt: TImports, dataset_name_field: str = "id_dataset"):
     )


-def check_orphan_rows(imprt):
+def check_orphan_rows(imprt: TImports):
     transient_table = imprt.destination.get_transient_table()
     # TODO: handle multi-source fields
     # This is actually not a big issue as multi-source fields are unlikely to also be multi-entity fields.
     selected_fields_names = []
     for field_name, source_field in imprt.fieldmapping.items():
-        if type(source_field) == list:
-            selected_fields_names.extend(set(source_field) & set(imprt.columns))
-        elif source_field in imprt.columns:
+        column_src = source_field.get("column_src", None)
+        if type(column_src) == list:
+            selected_fields_names.extend(set(column_src) & set(imprt.columns))
+        elif column_src in imprt.columns:
             selected_fields_names.append(field_name)
     # Select fields associated to multiple entities
     AllEntityField = sa.orm.aliased(EntityField)
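`column_src` keeps the old polymorphism: a string for single-source fields, a list of column names for multi-source fields (e.g. `additional_data`), hence the `type(column_src) == list` branch. A toy run, which also shows the pre-existing asymmetry of collecting column names in the list branch but field names in the scalar branch:

```python
imprt_columns = ["date", "obs_extra_1"]
fieldmapping = {
    "date_min": {"column_src": "date"},
    "additional_data": {"column_src": ["obs_extra_1", "obs_extra_2"]},
}

selected_fields_names = []
for field_name, source_field in fieldmapping.items():
    column_src = source_field.get("column_src", None)
    if type(column_src) == list:
        # keep only the multi-source columns actually present in the file
        selected_fields_names.extend(set(column_src) & set(imprt_columns))
    elif column_src in imprt_columns:
        selected_fields_names.append(field_name)

assert sorted(selected_fields_names) == ["date_min", "obs_extra_1"]
```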
2 changes: 1 addition & 1 deletion backend/geonature/core/imports/checks/sql/utils.py
@@ -64,7 +64,7 @@ def report_erroneous_rows(
     transient_table = imprt.destination.get_transient_table()
     error_type = ImportUserErrorType.query.filter_by(name=error_type).one()
     error_column = generated_fields.get(error_column, error_column)
-    error_column = imprt.fieldmapping.get(error_column, error_column)
+    error_column = imprt.fieldmapping.get(error_column, {}).get("column_src", error_column)
     if error_type.level in level_validity_mapping:
         assert entity is not None
         cte = (
26 changes: 22 additions & 4 deletions backend/geonature/core/imports/models.py
@@ -608,7 +608,7 @@ def optional_conditions_to_jsonschema(name_field: str, optional_conditions: Iter
             "if": {
                 "not": {
                     "properties": {
-                        field_opt: {"type": "string"} for field_opt in optional_conditions
+                        field_opt: {"type": "object"} for field_opt in optional_conditions
                     }
                 }
             },
@@ -726,9 +726,27 @@ def validate_values(field_mapping_json):
             "type": "object",
             "properties": {
                 field.name_field: {
-                    "type": (
-                        "boolean" if field.autogenerated else ("array" if field.multi else "string")
-                    ),
+                    "type": "object",
+                    "properties": {
+                        "column_src": {
+                            "type": (
+                                "boolean"
+                                if field.autogenerated
+                                else ("array" if field.multi else "string")
+                            ),
+                        },
+                        "default_value": {
+                            "oneOf": [
+                                {"type": "boolean"},
+                                {"type": "number"},
+                                {"type": "string"},
+                                {"type": "array"},
+                            ]
+                        },
+                    },
+                    "required": [],
+                    "additionalProperties": False,
+                    "anyOf": [{"required": ["column_src"]}, {"required": ["default_value"]}],
                 }
                 for field in fields
             },
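The per-field schema can be exercised with the `jsonschema` package. Below is a hand-written equivalent of what the generator would now emit for one plain (non-autogenerated, non-multi) field; a sketch, not the actual generated object:

```python
from jsonschema import ValidationError, validate

schema = {
    "type": "object",
    "properties": {
        "column_src": {"type": "string"},
        "default_value": {
            "oneOf": [
                {"type": "boolean"},
                {"type": "number"},
                {"type": "string"},
                {"type": "array"},
            ]
        },
    },
    "additionalProperties": False,
    # at least one of the two keys must be present
    "anyOf": [{"required": ["column_src"]}, {"required": ["default_value"]}],
}

validate({"column_src": "date"}, schema)  # passes
validate({"default_value": 1}, schema)    # passes
try:
    validate({}, schema)                  # neither key: rejected
except ValidationError as err:
    print(err.message)
```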
1 change: 1 addition & 0 deletions backend/geonature/core/imports/routes/fields.py
@@ -72,6 +72,7 @@ def get_fields(scope, destination):
                 "name_field",
                 "fr_label",
                 "eng_label",
+                "mnemonique",
                 "mandatory",
                 "autogenerated",
                 "multi",
20 changes: 18 additions & 2 deletions backend/geonature/core/imports/routes/imports.py
@@ -177,6 +177,17 @@
     assert destination
     author = g.current_user
     f = request.files["file"]
+    field_to_map_str = request.form.get("fieldsToMap")
+    if field_to_map_str:
+        fields_to_map = json.loads(field_to_map_str)
+        # NOTE: not possible to use validate_values here
+        # try:
+        #     FieldMapping.validate_values(fields_to_map)
+        # except ValueError as e:
+        #     raise BadRequest(*e.args)
+    else:
+        fields_to_map = {}
+
     size = get_file_size(f)
     # value in config file is in Mo
     max_file_size = current_app.config["IMPORT"]["MAX_FILE_SIZE"] * 1024 * 1024
@@ -203,6 +214,8 @@
         if not dataset.active:
             raise Forbidden("Le jeu de données est fermé.")
         imprt = TImports(destination=destination, dataset=dataset)
+        if fields_to_map:
+            imprt.fieldmapping = fields_to_map
         imprt.authors.append(author)
         db.session.add(imprt)
     else:
@@ -368,8 +381,11 @@
             # this nomenclated field is not mapped
             continue
         source = imprt.fieldmapping[field.name_field]
-        if source not in imprt.columns:
-            # the file do not contain this field expected by the mapping
+        if (
+            source.get("column_src", None) not in imprt.columns
+            and source.get("default_value", None) is None
+        ):
+            # the file does not contain this field expected by the mapping and there is no default value
             continue
         # TODO: vérifier que l’on a pas trop de valeurs différentes ?
         column = field.source_column
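For clients this means the field mapping can travel with the upload itself, as a JSON-encoded `fieldsToMap` form field. A hedged sketch with `requests` (the URL and file are illustrative; only the `file` and `fieldsToMap` form keys come from the route above):

```python
import json

import requests

# Hypothetical endpoint; the actual path depends on the GeoNature
# instance and the configured import destination.
response = requests.post(
    "https://geonature.example.org/import/synthese/imports/upload",
    files={"file": open("observations.csv", "rb")},
    data={"fieldsToMap": json.dumps({"date_min": {"column_src": "date"}})},
)
response.raise_for_status()
```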
53 changes: 39 additions & 14 deletions backend/geonature/core/imports/utils.py
@@ -4,7 +4,7 @@
 import json
 from enum import IntEnum
 from datetime import datetime, timedelta
-from typing import IO, Any, Dict, Iterable, List, Optional, Set, Tuple
+from typing import IO, Any, Dict, Iterable, List, Optional, Set, Tuple, Union

 from flask import current_app, render_template
 import sqlalchemy as sa
@@ -163,7 +163,9 @@
     return dialect.delimiter


-def preprocess_value(dataframe: pd.DataFrame, field: BibFields, source_col: str) -> pd.Series:
+def preprocess_value(
+    dataframe: pd.DataFrame, field: BibFields, source_col: Union[str, List[str]], default_value: Any
+) -> pd.Series:
     """
     Preprocesses values in a DataFrame depending if the field contains multiple values (e.g. additional_data) or not.

@@ -184,8 +186,14 @@
     """

     def build_additional_data(columns: dict):
+        try:
+            default_values = json.loads(default_value)
+        except Exception:
+            default_values = {}
         result = {}
         for key, value in columns.items():
             if value is None or value == "":
+                value = default_values.get(key, None)
+            if value is None:
                 continue
             try:
@@ -198,9 +206,17 @@

     if field.multi:
         assert type(source_col) is list
+        for col in source_col:
+            if col not in dataframe.columns:
+                dataframe[col] = None
+
         col = dataframe[source_col].apply(build_additional_data, axis=1)
     else:
+        if source_col not in dataframe.columns:
+            dataframe[source_col] = None
         col = dataframe[source_col]
+        if default_value is not None:
+            col = col.replace({"": default_value, None: default_value})

     return col

@@ -244,8 +260,10 @@
         }
         data.update(
             {
-                dest_field: preprocess_value(chunk, source_field["field"], source_field["value"])
-                for dest_field, source_field in fieldmapping.items()
+                dest_field: preprocess_value(
+                    chunk, mapping["field"], mapping["column_src"], mapping["default_value"]
+                )
+                for dest_field, mapping in fieldmapping.items()
             }
         )
         # XXX keep extra_fields in t_imports_synthese? or add config argument?
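Net effect of the new `preprocess_value` arguments at this call site: a missing single-source column is materialized as all-`None`, then empty or missing cells fall back to `default_value`; for multi-source fields, `default_value` is parsed as a JSON object whose keys fill the individual empty columns. A toy illustration (the dict comprehension mirrors `build_additional_data`, which additionally drops keys that remain empty):

```python
import json

import pandas as pd

# Single-source field: empty and missing cells take the default,
# via the same replace() call as in the diff.
df = pd.DataFrame({"count_min": ["3", "", None]})
col = df["count_min"].replace({"": "1", None: "1"})
assert list(col) == ["3", "1", "1"]

# Multi-source field: the default is a JSON object, merged key by key.
defaults = json.loads('{"obs_extra_2": "n/a"}')
row = {"obs_extra_1": "abc", "obs_extra_2": ""}
merged = {
    key: (value if value not in (None, "") else defaults.get(key))
    for key, value in row.items()
}
assert merged == {"obs_extra_1": "abc", "obs_extra_2": "n/a"}
```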
@@ -293,21 +311,25 @@

     for field in fields:
         if field.name_field in imprt.fieldmapping:
+            mapping = imprt.fieldmapping[field.name_field]
+            column_src = mapping.get("column_src", None)
+            default_value = mapping.get("default_value", None)
             if field.multi:
-                correct = list(set(columns) & set(imprt.fieldmapping[field.name_field]))
+                correct = list(set(columns) & set(column_src))
                 if len(correct) > 0:
                     fieldmapping[field.source_column] = {
-                        "value": correct,
                         "field": field,
+                        "column_src": correct,
+                        "default_value": default_value,
                     }
                     used_columns.extend(correct)
             else:
-                if imprt.fieldmapping[field.name_field] in columns:
-                    fieldmapping[field.source_column] = {
-                        "value": imprt.fieldmapping[field.name_field],
-                        "field": field,
-                    }
-                    used_columns.append(imprt.fieldmapping[field.name_field])
+                fieldmapping[field.source_column] = {
+                    "field": field,
+                    "column_src": column_src,
+                    "default_value": default_value,
+                }
+                used_columns.append(column_src)
     return fieldmapping, used_columns

@@ -442,8 +464,11 @@
     fields = {ef.field.name_field: ef.field for ef in entity.fields}
     selected_fields = {
         field_name: fields[field_name]
-        for field_name, source_field in import_.fieldmapping.items()
-        if source_field in import_.columns and field_name in fields
+        for field_name, mapping in import_.fieldmapping.items()
+        if (
+            mapping.get("column_src") in import_.columns or mapping.get("default_value") is not None
+        )
+        and field_name in fields
     }
     source_cols = set()
     for field in selected_fields.values():
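Worth noting in `build_fieldmapping`: the single-source branch no longer checks that the mapped column exists in the file, so every mapped field now produces an entry, and `preprocess_value` later materializes the missing column and applies the default. Each entry is keyed by `field.source_column` and uniformly shaped; an illustrative entry (names and values are not from the diff):

```python
fieldmapping = {
    "src_count_min": {           # field.source_column
        "field": ...,            # the BibFields instance for count_min
        "column_src": "nb",      # column(s) in the uploaded file; a list for multi fields
        "default_value": 1,      # applied to empty cells / absent columns; None if unset
    },
}
```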
55 changes: 55 additions & 0 deletions (new Alembic migration file, revision e43b01a18850)
@@ -0,0 +1,55 @@
"""fieldmapping default values

Revision ID: e43b01a18850
Revises: 2b0b3bd0248c
Create Date: 2024-11-28 17:33:06.243150

"""

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision = "e43b01a18850"
down_revision = "2b0b3bd0248c"
branch_labels = None
depends_on = None


def upgrade():
op.execute(
"""UPDATE gn_imports.t_fieldmappings
SET "values" = (
SELECT json_object_agg(key, json_build_object('column_src', value))
FROM json_each("values")
)
WHERE "values" IS NOT NULL;"""
)
op.execute(
"""UPDATE gn_imports.t_imports
SET fieldmapping = (
SELECT json_object_agg(key, json_build_object('column_src', value))
FROM json_each(fieldmapping)
)
WHERE fieldmapping IS NOT NULL;"""
)


def downgrade():
op.execute(
"""UPDATE gn_imports.t_fieldmappings
SET "values" = (
SELECT json_object_agg(key, value->'column_src')
FROM json_each("values")
)
WHERE "values" IS NOT NULL;"""
)
op.execute(
"""UPDATE gn_imports.t_imports
SET fieldmapping = (
SELECT json_object_agg(key, value->'column_src')
FROM json_each(fieldmapping)
)
WHERE fieldmapping IS NOT NULL;"""
)
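In Python terms, the two `UPDATE`s wrap each existing mapping value in the new object shape; the downgrade keeps only `column_src`, so it is lossy for any `default_value`-only entries created after upgrading. A sketch of the per-row transformation the SQL performs:

```python
def upgrade_mapping(values: dict) -> dict:
    # {"date_min": "date"} -> {"date_min": {"column_src": "date"}}
    return {key: {"column_src": value} for key, value in values.items()}

def downgrade_mapping(values: dict) -> dict:
    # {"date_min": {"column_src": "date"}} -> {"date_min": "date"};
    # a default_value-only entry comes back as None (information lost).
    return {key: value.get("column_src") for key, value in values.items()}

assert upgrade_mapping({"date_min": "date"}) == {"date_min": {"column_src": "date"}}
assert downgrade_mapping({"date_min": {"column_src": "date"}}) == {"date_min": "date"}
```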
2 changes: 1 addition & 1 deletion backend/geonature/tests/imports/test_imports_occhab.py
@@ -60,7 +60,7 @@ def fieldmapping(occhab_destination):
         .unique()
         .all()
     )
-    return {field.name_field: field.name_field for field in fields}
+    return {field.name_field: {"column_src": field.name_field} for field in fields}


 @pytest.fixture()