From 2c8cee3d3d8412c02216092f78ea54f00d9bf8d2 Mon Sep 17 00:00:00 2001 From: VincentCauchois Date: Thu, 31 Oct 2024 18:54:33 +0100 Subject: [PATCH] [WIP](mtd): several TODOs ... --- .../geonature/core/gn_meta/mtd/__init__.py | 4 + .../geonature/core/gn_meta/mtd/mtd_utils.py | 91 +++++++++++++++++++ .../geonature/core/gn_meta/mtd/xml_parser.py | 12 +++ backend/geonature/core/gn_meta/routes.py | 2 + 4 files changed, 109 insertions(+) diff --git a/backend/geonature/core/gn_meta/mtd/__init__.py b/backend/geonature/core/gn_meta/mtd/__init__.py index 58c9bb66ed..4b53992c78 100644 --- a/backend/geonature/core/gn_meta/mtd/__init__.py +++ b/backend/geonature/core/gn_meta/mtd/__init__.py @@ -41,6 +41,7 @@ class MTDInstanceApi: af_path = "/mtd/cadre/export/xml/GetRecordsByInstanceId?id={ID_INSTANCE}" ds_path = "/mtd/cadre/jdd/export/xml/GetRecordsByInstanceId?id={ID_INSTANCE}" + # TODO: check if there are endpoints to retrieve metadata for a given user and instance, and not only a given user and whatever instance ds_user_path = "/mtd/cadre/jdd/export/xml/GetRecordsByUserId?id={ID_ROLE}" af_user_path = "/mtd/cadre/export/xml/GetRecordsByUserId?id={ID_ROLE}" single_af_path = "/mtd/cadre/export/xml/GetRecordById?id={ID_AF}" # NOTE: `ID_AF` is actually an UUID and not an ID from the point of view of geonature database. @@ -202,6 +203,8 @@ def process_af_and_ds(af_list, ds_list, id_role=None): add_unexisting_digitizer(af["id_digitizer"] if not id_role else id_role) user_add_total_time += time.time() - start_add_user_time af = sync_af(af) + # TODO: choose whether or not to commit retrieval of the AF before association of actors + # and possibly retrieve an AF without any actor associated to it # Commit here to retrieve the AF even if the association of actors that follows is to fail db.session.commit() # If the AF has not been retrieved, associated actors cannot be retrieved either @@ -215,6 +218,7 @@ def process_af_and_ds(af_list, ds_list, id_role=None): af.id_acquisition_framework, af.unique_acquisition_framework_id, ) + # TODO: check the following TODO: # TODO: remove actors removed from MTD db.session.commit() logger.debug("MTD - PROCESS DS LIST") diff --git a/backend/geonature/core/gn_meta/mtd/mtd_utils.py b/backend/geonature/core/gn_meta/mtd/mtd_utils.py index 59406bc528..c40872d6c7 100644 --- a/backend/geonature/core/gn_meta/mtd/mtd_utils.py +++ b/backend/geonature/core/gn_meta/mtd/mtd_utils.py @@ -141,6 +141,9 @@ def sync_af(af): # Handle case where af_uuid is None, as it would raise an error at database level when executing the statement below. # None value for `af_uuid`, i.e. af UUID is missing, could be due to no UUID specified in `` tag in the XML file. # If so, we skip the retrieval of the AF. + # TODO: choose between the two following alternatives: + # - (RETAINED) Just skip the retrieval of the AF + # - Generate a UUID for the AF if not af_uuid: logger.warning( f"No UUID provided for the AF with UUID '{af_uuid}' and name '{name_af}' - SKIPPING SYNCHRONIZATION FOR THIS AF." @@ -245,6 +248,7 @@ def associate_actors( if uuid_organism: with DB.session.begin_nested(): # create or update organisme + # TODO: check the following FIXME # FIXME: prevent update of organism email from actor email ! Several actors may be associated to the same organism and still have different mails ! id_organism = add_or_update_organism( uuid=uuid_organism, @@ -256,6 +260,7 @@ def associate_actors( # /!\ Handle case where there is also an organism with the name equals to the value of `name_organism` # - check if there already is an organism with the name `organism_name` # - if there is one: + # - TODO: choose wether to update the organism with the values `organism_name` and `email_actor` or not # - set `id_organism` with the ID of the existing organism # - if there is not: # - set `id_organism` with the ID of a newly created organism @@ -264,6 +269,18 @@ def associate_actors( exists().where(BibOrganismes.nom_organisme == organism_name).select() ) if is_exists_organism: + # TODO: choose whether to keep the following section - update of the organism with the values `organism_name` and `email_actor` ? + # --- If yes: uncomment the two following statements + # uuid_organism = DB.session.scalar( + # select(BibOrganismes.uuid_organisme) + # .where(BibOrganismes.nom_organisme == organism_name) + # .limit(1) + # ) + # id_organism = add_or_update_organism( + # uuid=uuid_organism, + # nom=organism_name, + # email=email_actor, + # ) id_organism = DB.session.scalar( select(BibOrganismes.id_organisme) .where(BibOrganismes.nom_organisme == organism_name) @@ -287,9 +304,17 @@ def associate_actors( id_nomenclature_actor_role=id_nomenclature_actor_role, **{pk_name: pk_value}, ) + # TODO: choose wether to: + # - (RETAINED) Try to associate to an organism first and then to a user + # - Try to associate to a user first and then to an organism # Try to associate to an organism first, and if that is impossible, to a user if id_organism: values["id_organism"] = id_organism + # TODO: handle case where no user is retrieved for the actor email: + # - (retained) If the actor role is "Contact Principal" associate to a new user with only a UUID and an ID, else just do not try to associate the actor with the metadata + # - Try to retrieve an id_organism from the organism name - field `organism` + # - Try to retrieve an id_organism from the actor email considered as the organism email - field `email` + # - Try to insert a new user from the actor name - field `name` - and possibly also email - field `email` else: id_user_from_email = DB.session.scalar( select(User.id_role).filter_by(email=email_actor).where(User.groupe.is_(False)) @@ -306,8 +331,73 @@ def associate_actors( # in particular: # - we do not specify field `email` even if `email_actor` is to be set # - only the field `dec_role` will be written to a non-default value, so as to identify this particular "Contact principal"-for-orphan-metadata user + # TODO: FUNCTIONAL - verify: + # - whether to: + # - (alternative 0) systematically add a new user for each concerned metadata + # - (RETAINED FOR THE MOMENT) or rather use a single user for all metadata without retrieved "Contact Principal" + # - whether it is right to use only (strictly) negative values for those new users: + # - must ensure that we will not already have (strictly) negative values in the `utilisateurs.t_roles` table for concerned instances (GINCO, DEPOBIO, ...) + # - what are the limits for 'serial4' PostgreSQL type: minimum and maximum values, ... + # - alternatives: + # - using the nextval sequence several times until an unused value is found BUT possible future conflict with new entries from INPN users + # - take the first value superior to 0 and which is not already used BUT possible future conflict with new entries from INPN users + # - take a base value which is enough high to avoid conflicts with INPN users IDs + # - start from the maximum or minimum value allowed by 'serial4' PostgreSQL type + # - whether to add a new user or a new organism... cd_nomenclature_actor_role_for_contact_principal = "1" if cd_nomenclature_actor_role == cd_nomenclature_actor_role_for_contact_principal: + ### Alternative 0 (see TODO above): + ### UNCOMMENT the folowing section if this alternative is eventually chosen + # # Get an integer that is equals to 1 less than the minimum of `utilisateurs.id_role` field in database + # # to ensure that the new ID is not already used + # # - we cannot rely on the next value from the sequence `utilisateurs.t_roles.t_roles_id_role_seq` because we can have, + # # and actually have for GINCO and DEPOBIO instances, ID values higher than this next value: this is due + # # to the fact that entries are inserted in the table `utilisateurs.t_roles` specifying the ID rather than + # # using the nextval sequence. + # # - moreover, we cannot actually take a (strictly) positive integer value as we risk to take an ID that could + # # later be needed for the retrieval of a user from the INPN as users are retrieved from the INPN taking + # # values associated to them in the INPN, especially the ID, for the insert in `utilisateurs.t_roles`. + # # - in other words, the field `utilisateurs.t_roles.id_roles` is actually not localized to the GeoNature instance, at + # # least for instances integrated with the INPN such as GINCO and DEPOBIO instances. + # # We take a value that is at most equal to -1 so as to clearly identify those generated users as the users + # # with (strictly) negative IDs. + # min_id_role = DB.session.scalar( + # select(func.min(User.id_role)) + # ) + # if min_id_role == 1: + # min_id_role = -1 + # if min_id_role: + # id_generated_user = min_id_role - 1 + # else: + # id_generated_user = 1 + ### RETAINED FOR THE MOMENT (see TODO above): + # TODO: + # - i. TODO: Choose of ID for user created for metadata with no "Contact Principal" that could be retrieved - alternatives: + # - (RETAINED FOR THE MOMENT)(alternative 0.1) value 0 - assuming that 0 is never used amongst the different instances + # - (alternative 0.2) value -1 - assuming that -1 is never used amongst the different instances AND that it is a valid value for 'serial4' + # actually it should not be possible to have negative values for the 'serial4' PostgreSQL type, + # which should allow values ranging from 1 to 2147483647 + # // documentation PG 16: https://www.postgresql.org/docs/current/datatype-numeric.html#DATATYPE-NUMERIC + # BUT a test creating a user with ID = -1 in a GN database has been done successfully... + # TODO: determiner why is it possible to have a value of -1, and other negative values (-2 has been tested also) + # - (alternative 0.3) value 2147483647 - maximum value for 'serial4' PostgreSQL type + # - (alternative 0.4) minimum negative value [to be determined] + # - ii. TODO: Choose whether to + # - add textual information to the created user + # - which information to provide: + # - (RETAINED FOR THE MOMENT) "Contact Principal for 'orphan' metadata - with no 'Contact Principal' that could be retrieved during MTD INPN synchronisation" + # - (alternatives) ... + # - which fields in `utilisateurs.t_roles` to be written: + # - (?) `identifiant` + # - (?) `nom_role` + # - (?) `prenom_role` + # - (?) `email` + # - (?) `id_organisme` + # - (RETAINED FOR THE MOMENT) `desc_role` (text) + # - (RETAINED FOR THE MOMENT) "Contact Principal for 'orphan' metadata - i.e. with no 'Contact Principal' that could be retrieved during INPN MTD synchronisation" + # - (??) `champs_addi` + # - (?) `remarques` (text) + # - (alternative) or not - the user will only have: an ID, a UUID # Retrieve the "Contact principal"-for-orphan-metadata user desc_role_for_user_contact_principal_for_orphan_metadata = "Contact principal for 'orphan' metadata - i.e. with no 'Contact Principal' that could be retrieved during INPN MTD synchronisation" id_user_contact_principal_for_orphan_metadata = 0 @@ -337,6 +427,7 @@ def associate_actors( dict_data_generated_user = insert_or_update_role( data=dict_data_generated_user ) + # TODO: verify it is ok to commit here # Commit to ensure that the insert from previous statement is actually committed DB.session.commit() values["id_role"] = id_user_contact_principal_for_orphan_metadata diff --git a/backend/geonature/core/gn_meta/mtd/xml_parser.py b/backend/geonature/core/gn_meta/mtd/xml_parser.py index 4b745da584..fbdd64dbae 100644 --- a/backend/geonature/core/gn_meta/mtd/xml_parser.py +++ b/backend/geonature/core/gn_meta/mtd/xml_parser.py @@ -62,6 +62,14 @@ def parse_actors_xml(actors): return actor_list +# TODO: IMPORTANT - filter the list of acquisition frameworks with `ID_INSTANCE_FILTER` as made for the list of datasets + +# TODO: refactorize functions for XML parsing: +# - Rename function : `parse_jdd_xml` --> `parse_datasets_xml` +# - Add functions : `parse_dataset` +# - Eventually split into distinct functions the XML parsing and the mapping of fields + + def parse_acquisition_frameworks_xml(xml: str) -> list: """ Parse an XML of acquisition frameworks from a string. @@ -175,6 +183,10 @@ def format_acquisition_framework_id_from_xml(provided_af_uuid) -> Union[str, Non for jdd in root.findall(".//" + namespace + "JeuDeDonnees"): # We extract all the required informations from the different tags of the XML file jdd_uuid = get_tag_content(jdd, "identifiantJdd") + # TODO: handle case where value for the tag `` in the XML file is not of the form `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` + # Solutions - if in the form `http://oafs.fr/meta/ca/xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` (has some entries for INPN MTD PREPROD and instance 'Thématique') : + # - (RETAINED) Format by keeping only the `xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx` part + # - Add a check further in the MTD sync to process only if ca_uuid is in the right format ca_uuid = format_acquisition_framework_id_from_xml(get_tag_content(jdd, "identifiantCadre")) dataset_name = get_tag_content(jdd, "libelle") dataset_shortname = get_tag_content(jdd, "libelleCourt", default_value="") diff --git a/backend/geonature/core/gn_meta/routes.py b/backend/geonature/core/gn_meta/routes.py index 5bb91bbee0..15af2e15ae 100644 --- a/backend/geonature/core/gn_meta/routes.py +++ b/backend/geonature/core/gn_meta/routes.py @@ -79,12 +79,14 @@ @routes.before_request def synchronize_mtd(): + # TODO: add `if request.method != "OPTIONS" and [...]` in the following condition if request.endpoint in ["gn_meta.get_datasets", "gn_meta.get_acquisition_frameworks_list"]: from flask_login import current_user if current_user.is_authenticated: params = request.json if request.is_json else request.args try: + # TODO: trigger a user sync without id_af in case no id_af is provided list_id_af = params.get("id_acquisition_frameworks", []) for id_af in list_id_af: sync_af_and_ds_by_user(id_role=current_user.id_role, id_af=id_af)