From a62fe7df0a33399b46fe65919c7456eccf5c3aec Mon Sep 17 00:00:00 2001 From: "John T. Wodder II" Date: Fri, 1 Mar 2024 12:40:52 -0500 Subject: [PATCH 1/2] Clean up URL parsing in `extract_species()` --- dandi/metadata/util.py | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/dandi/metadata/util.py b/dandi/metadata/util.py index b9cb33aa5..e517928e4 100644 --- a/dandi/metadata/util.py +++ b/dandi/metadata/util.py @@ -329,66 +329,68 @@ def extract_cellLine(metadata: dict) -> str | None: return None +SPECIES_URI_TEMPLATE = "http://purl.obolibrary.org/obo/NCBITaxon_{}" + # common_names, prefix, uri, name species_map = [ ( ["mouse"], "mus", - "http://purl.obolibrary.org/obo/NCBITaxon_10090", + SPECIES_URI_TEMPLATE.format("10090"), "Mus musculus - House mouse", ), ( ["human"], "homo", - "http://purl.obolibrary.org/obo/NCBITaxon_9606", + SPECIES_URI_TEMPLATE.format("9606"), "Homo sapiens - Human", ), ( ["rat", "norvegicus"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_10116", + SPECIES_URI_TEMPLATE.format("10116"), "Rattus norvegicus - Norway rat", ), ( ["rattus rattus"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_10117", + SPECIES_URI_TEMPLATE.format("10117"), "Rattus rattus - Black rat", ), ( ["mulatta", "rhesus"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_9544", + SPECIES_URI_TEMPLATE.format("9544"), "Macaca mulatta - Rhesus monkey", ), ( ["jacchus"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_9483", + SPECIES_URI_TEMPLATE.format("9483"), "Callithrix jacchus - Common marmoset", ), ( ["melanogaster", "fruit fly"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_7227", + SPECIES_URI_TEMPLATE.format("7227"), "Drosophila melanogaster - Fruit fly", ), ( ["danio", "zebrafish", "zebra fish"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_7955", + SPECIES_URI_TEMPLATE.format("7955"), "Danio rerio - Zebra fish", ), ( ["c. 
elegans", "caenorhabditis elegans"], "caenorhabditis", - "http://purl.obolibrary.org/obo/NCBITaxon_6239", + SPECIES_URI_TEMPLATE.format("6239"), "Caenorhabditis elegans", ), ( ["pig-tailed macaque", "pigtail monkey", "pigtail macaque"], None, - "http://purl.obolibrary.org/obo/NCBITaxon_9545", + SPECIES_URI_TEMPLATE.format("9545"), "Macaca nemestrina", ), ] @@ -434,14 +436,18 @@ def extract_species(metadata: dict) -> models.SpeciesType | None: value_orig = metadata.get("species", None) value_id = None if value_orig is not None and value_orig != "": - value = value_orig.lower().rstrip("/") - if value.startswith("http://purl.obolibrary.org/obo/NCBITaxon_".lower()): - for common_names, prefix, uri, name in species_map: - if value.split("//")[1] == uri.lower().rstrip("/").split("//")[1]: + if m := re.fullmatch( + r"https?://purl\.obolibrary\.org/obo/NCBITaxon_([0-9]+)/?", + value_orig, + flags=re.I, + ): + normed_value = SPECIES_URI_TEMPLATE.format(m[1]) + for _common_names, _prefix, uri, name in species_map: + if uri == normed_value: value_id = uri value = name break - if value_id is None: + else: value_id = value_orig lookup = ("rdfs:label", "oboInOwl:hasExactSynonym") try: @@ -457,9 +463,10 @@ def extract_species(metadata: dict) -> models.SpeciesType | None: [result[key] for key in lookup if key in result] ) else: + lower_value = value_orig.lower() for common_names, prefix, uri, name in species_map: - if any(key in value for key in common_names) or ( - prefix and value.startswith(prefix) + if any(key in lower_value for key in common_names) or ( + prefix is not None and lower_value.startswith(prefix) ): value_id = uri value = name From a6704745f3cb037112437b65d39521f01acfef84 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 1 Mar 2024 17:16:20 -0500 Subject: [PATCH 2/2] [DATALAD RUNCMD] Rename SPECIES_URI_TEMPLATE into NCBITAXON_URI_TEMPLATE since otherwise while looking at the use not clear what that index is. 
Maybe later it would even be some other level than species (but still the same template), so it is best to just mention that it is an NCBITaxon URL === Do not change lines below === { "chain": [], "cmd": "git-sedi SPECIES_URI_TEMPLATE NCBITAXON_URI_TEMPLATE", "exit": 0, "extra_inputs": [], "inputs": [], "outputs": [], "pwd": "." } ^^^ Do not change lines above ^^^ --- dandi/metadata/util.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/dandi/metadata/util.py b/dandi/metadata/util.py index e517928e4..8849a9164 100644 --- a/dandi/metadata/util.py +++ b/dandi/metadata/util.py @@ -329,68 +329,68 @@ def extract_cellLine(metadata: dict) -> str | None: return None -SPECIES_URI_TEMPLATE = "http://purl.obolibrary.org/obo/NCBITaxon_{}" +NCBITAXON_URI_TEMPLATE = "http://purl.obolibrary.org/obo/NCBITaxon_{}" # common_names, prefix, uri, name species_map = [ ( ["mouse"], "mus", - SPECIES_URI_TEMPLATE.format("10090"), + NCBITAXON_URI_TEMPLATE.format("10090"), "Mus musculus - House mouse", ), ( ["human"], "homo", - SPECIES_URI_TEMPLATE.format("9606"), + NCBITAXON_URI_TEMPLATE.format("9606"), "Homo sapiens - Human", ), ( ["rat", "norvegicus"], None, - SPECIES_URI_TEMPLATE.format("10116"), + NCBITAXON_URI_TEMPLATE.format("10116"), "Rattus norvegicus - Norway rat", ), ( ["rattus rattus"], None, - SPECIES_URI_TEMPLATE.format("10117"), + NCBITAXON_URI_TEMPLATE.format("10117"), "Rattus rattus - Black rat", ), ( ["mulatta", "rhesus"], None, - SPECIES_URI_TEMPLATE.format("9544"), + NCBITAXON_URI_TEMPLATE.format("9544"), "Macaca mulatta - Rhesus monkey", ), ( ["jacchus"], None, - SPECIES_URI_TEMPLATE.format("9483"), + NCBITAXON_URI_TEMPLATE.format("9483"), "Callithrix jacchus - Common marmoset", ), ( ["melanogaster", "fruit fly"], None, - SPECIES_URI_TEMPLATE.format("7227"), + NCBITAXON_URI_TEMPLATE.format("7227"), "Drosophila melanogaster - Fruit fly", ), ( ["danio", "zebrafish", "zebra fish"], None, - SPECIES_URI_TEMPLATE.format("7955"), +
NCBITAXON_URI_TEMPLATE.format("7955"), "Danio rerio - Zebra fish", ), ( ["c. elegans", "caenorhabditis elegans"], "caenorhabditis", - SPECIES_URI_TEMPLATE.format("6239"), + NCBITAXON_URI_TEMPLATE.format("6239"), "Caenorhabditis elegans", ), ( ["pig-tailed macaque", "pigtail monkey", "pigtail macaque"], None, - SPECIES_URI_TEMPLATE.format("9545"), + NCBITAXON_URI_TEMPLATE.format("9545"), "Macaca nemestrina", ), ] @@ -441,7 +441,7 @@ def extract_species(metadata: dict) -> models.SpeciesType | None: value_orig, flags=re.I, ): - normed_value = SPECIES_URI_TEMPLATE.format(m[1]) + normed_value = NCBITAXON_URI_TEMPLATE.format(m[1]) for _common_names, _prefix, uri, name in species_map: if uri == normed_value: value_id = uri