Format authors cleanup (#3068)
* Clean up defaults for author-formatting

* Make dictionary, not list of dictionaries

* Fix tests
anna-parker authored Oct 25, 2024
1 parent 9385fbd commit 8fc0a47
Showing 3 changed files with 61 additions and 80 deletions.
76 changes: 38 additions & 38 deletions ingest/config/defaults.yaml
@@ -24,44 +24,44 @@ keep:
- sequence_md5
- genbankAccession
- jointAccession
simple_mappings:
"ncbiReleaseDate": "releaseDate"
"ncbiIsAnnotated": "isAnnotated"
"ncbiIsLabHost": "isLabHost"
"ncbiProteinCount": "proteinCount"
"ncbiSourceDb": "sourceDatabase"
"ncbiIsComplete": "completeness"
"ncbiLabHost": "labHost"
"ncbiUpdateDate": "updateDate"
"genbankAccession": "accession"
"biosampleAccession": "biosample"
"ncbi_gene_count": "geneCount"
"bioprojects": "bioprojects"
"ncbiSraAccessions": "sraAccessions"
location_mappings:
"ncbiGeoLocation": "geographicLocation"
"ncbiGeoRegion": "geographicRegion"
submitter_mappings:
"ncbiSubmitterAffiliation": "affiliation"
"ncbiSubmitterNames": "names"
"ncbiSubmitterCountry": "country"
isolate_mappings:
"ncbiIsolateName": "name"
"ncbiIsolateSource": "source"
"ncbiCollectionDate": "collectionDate"
virus_mappings:
"ncbiVirusName": "organismName"
"ncbiVirusTaxId": "taxId"
host_mappings:
"ncbiHostTaxId": "taxId"
"ncbiHostName": "organismName"
parse_list:
- bioprojects
- ncbiSraAccessions
unknown_mappings: # I don't know yet where these fields come from
- ncbiHostCommonName
- ncbiPurposeOfSampling
- ncbiHostSex
ncbi_mappings:
string_to_string_mappings:
"ncbiReleaseDate": "releaseDate"
"ncbiIsAnnotated": "isAnnotated"
"ncbiIsLabHost": "isLabHost"
"ncbiProteinCount": "proteinCount"
"ncbiSourceDb": "sourceDatabase"
"ncbiIsComplete": "completeness"
"ncbiLabHost": "labHost"
"ncbiUpdateDate": "updateDate"
"genbankAccession": "accession"
"biosampleAccession": "biosample"
"ncbi_gene_count": "geneCount"
string_to_list_mappings:
"bioprojects": "bioprojects"
"ncbiSraAccessions": "sraAccessions"
string_to_dict_mappings:
location:
"ncbiGeoLocation": "geographicLocation"
"ncbiGeoRegion": "geographicRegion"
submitter:
"ncbiSubmitterAffiliation": "affiliation"
"ncbiSubmitterNames": "names"
"ncbiSubmitterCountry": "country"
isolate:
"ncbiIsolateName": "name"
"ncbiIsolateSource": "source"
"ncbiCollectionDate": "collectionDate"
virus:
"ncbiVirusName": "organismName"
"ncbiVirusTaxId": "taxId"
host:
"ncbiHostTaxId": "taxId"
"ncbiHostName": "organismName"
unknown_mappings: # Keep for backward compatibility with old ingest pipeline
- ncbiHostCommonName
- ncbiPurposeOfSampling
- ncbiHostSex
group_name: insdc_ingest_group # Used only to set the group name, never read
username: insdc_ingest_user
password: insdc_ingest_user
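
The restructured `ncbi_mappings` block groups the mappings by the shape of the source value: `string_to_string_mappings` for scalar fields, `string_to_list_mappings` for list fields that get comma-joined, and `string_to_dict_mappings` for nested objects whose sub-keys are lifted into flat columns. A minimal sketch of how one record is flattened under this scheme — the record values and variable names are invented for illustration; only the mapping keys come from the config above:

```python
# Hypothetical NCBI Datasets record; the data values are made up,
# the key names follow the mapping values in defaults.yaml above.
record = {
    "releaseDate": "2024-01-15",
    "accession": "OQ123456.1",
    "sraAccessions": ["SRR0000001", "SRR0000002"],
    "submitter": {"affiliation": "Example Institute", "names": ["Doe,J."], "country": "Switzerland"},
}

string_to_string = {"ncbiReleaseDate": "releaseDate", "genbankAccession": "accession"}
string_to_list = {"ncbiSraAccessions": "sraAccessions"}
string_to_dict = {"submitter": {"ncbiSubmitterAffiliation": "affiliation", "ncbiSubmitterNames": "names"}}

flat: dict = {}
# Scalars are copied across under their Loculus-side names.
flat.update({key: record.get(value) for key, value in string_to_string.items()})
# List-valued fields are comma-joined into a single string column.
for key, value in string_to_list.items():
    flat[key] = ",".join(record.get(value) or [])
# Nested objects are flattened by pulling named sub-keys up to the top level.
for source_key, sub_mapping in string_to_dict.items():
    nested = record.get(source_key, {})
    flat.update({key: nested.get(value) for key, value in sub_mapping.items()})

print(flat)
# {'ncbiReleaseDate': '2024-01-15', 'genbankAccession': 'OQ123456.1',
#  'ncbiSraAccessions': 'SRR0000001,SRR0000002',
#  'ncbiSubmitterAffiliation': 'Example Institute', 'ncbiSubmitterNames': ['Doe,J.']}
```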
63 changes: 23 additions & 40 deletions ingest/scripts/format_ncbi_metadata.py
@@ -18,15 +18,11 @@


@dataclass
class Config:
simple_mappings: dict[str, str]
location_mappings: dict[str, str]
submitter_mappings: dict[str, str]
isolate_mappings: dict[str, str]
virus_mappings: dict[str, str]
host_mappings: dict[str, str]
class NCBIMappings:
string_to_string_mappings: dict[str, str]
string_to_list_mappings: dict[str, str]
string_to_dict_mappings: dict[str, dict[str, str]]
unknown_mappings: list[str]
parse_list: list[str]


def convert_to_title_case(name: str) -> str:
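
The old `Config` dataclass kept one attribute per mapping group; `NCBIMappings` folds them into three attributes keyed by value shape, plus the `unknown_mappings` list. A small sketch of constructing it directly, with the dictionaries trimmed down from defaults.yaml above:

```python
# Trimmed-down construction for illustration; in the script the full dictionaries
# come from the ncbi_mappings section of defaults.yaml (see main() further down).
# Assumes NCBIMappings from format_ncbi_metadata.py is in scope.
ncbi_mappings = NCBIMappings(
    string_to_string_mappings={"ncbiReleaseDate": "releaseDate", "genbankAccession": "accession"},
    string_to_list_mappings={"ncbiSraAccessions": "sraAccessions"},
    string_to_dict_mappings={
        "submitter": {"ncbiSubmitterNames": "names", "ncbiSubmitterCountry": "country"},
    },
    unknown_mappings=["ncbiHostCommonName"],
)
```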
@@ -113,57 +109,42 @@ def reformat_authors_from_genbank_to_loculus(
return formatted_authors


def extract_fields(row, config: Config) -> dict:
def extract_fields(row, ncbi_mappings: NCBIMappings) -> dict:
try:
extracted = {}
extracted.update({key: row.get(value) for key, value in config.simple_mappings.items()})
location = row.get("location", {})
extracted.update(
{key: location.get(value) for key, value in config.location_mappings.items()}
{key: row.get(value) for key, value in ncbi_mappings.string_to_string_mappings.items()}
)
submitter = row.get("submitter", {})
extracted.update(
{key: submitter.get(value) for key, value in config.submitter_mappings.items()}
)
isolate = row.get("isolate", {})
extracted.update(
{key: isolate.get(value) for key, value in config.isolate_mappings.items()}
)

host_lineage = row.get("host", {})
extracted.update(
{key: host_lineage.get(value) for key, value in config.host_mappings.items()}
{key: row.get(value) for key, value in ncbi_mappings.string_to_list_mappings.items()}
)

virus_lineage = row.get("virus", {})
extracted.update(
{key: virus_lineage.get(value) for key, value in config.virus_mappings.items()}
)

extracted.update(dict.fromkeys(config.unknown_mappings))
for field in ncbi_mappings.string_to_list_mappings:
if extracted[field]:
extracted[field] = ",".join(extracted[field])
else:
extracted[field] = ""
for field, sub_dict in ncbi_mappings.string_to_dict_mappings.items():
dict_as_string = row.get(field, {})
extracted.update({key: dict_as_string.get(value) for key, value in sub_dict.items()})
extracted.update(dict.fromkeys(ncbi_mappings.unknown_mappings))

except KeyError as e:
print(f"Missing key: {e}")
extracted = {}
return extracted


def jsonl_to_tsv(jsonl_file: str, tsv_file: str, config: Config) -> None:
def jsonl_to_tsv(jsonl_file: str, tsv_file: str, ncbi_mappings: NCBIMappings) -> None:
extracted_rows: list[dict[str, str]] = []
with (
open(jsonl_file, encoding="utf-8") as infile,
):
for line in infile:
row = json.loads(line.strip())
extracted = extract_fields(row, config)
extracted = extract_fields(row, ncbi_mappings)
extracted["ncbiSubmitterNames"] = reformat_authors_from_genbank_to_loculus(
extracted["ncbiSubmitterNames"], extracted["genbankAccession"]
)
for field in config.parse_list:
if extracted[field]:
extracted[field] = ",".join(extracted[field])
else:
extracted[field] = ""
extracted_rows.append(extracted)
df = pd.DataFrame(extracted_rows)
df.to_csv(
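
The body of `reformat_authors_from_genbank_to_loculus`, called above on `extracted["ncbiSubmitterNames"]`, is not part of this diff. As a rough, assumed illustration of the kind of normalization the name suggests — GenBank-style "Lastname,I." entries turned into a "Lastname, I.; ..." string — here is an independent sketch, not the project's implementation:

```python
def reformat_authors_sketch(genbank_authors: list[str], accession: str) -> str:
    """Assumed illustration only: turn GenBank-style names such as 'Smith,J.A.'
    into 'Smith, J. A.' and join them with '; '. Not the function used above."""
    # `accession` is accepted only to mirror the call signature above; the real
    # function presumably uses it for log messages.
    formatted = []
    for author in genbank_authors or []:
        last_name, _, initials = author.partition(",")
        # Space out run-together initials: 'J.A.' -> 'J. A.'
        spaced_initials = " ".join(part + "." for part in initials.split(".") if part)
        formatted.append(f"{last_name.strip()}, {spaced_initials}".strip().rstrip(","))
    return "; ".join(formatted)

# reformat_authors_sketch(["Smith,J.A.", "Doe,R."], "OQ123456.1")
# -> 'Smith, J. A.; Doe, R.'
```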
@@ -190,9 +171,11 @@ def main(config_file: str, input: str, output: str, log_level: str) -> None:

with open(config_file, encoding="utf-8") as file:
full_config = yaml.safe_load(file)
relevant_config = {key: full_config[key] for key in Config.__annotations__}
config = Config(**relevant_config)
jsonl_to_tsv(input, output, config=config)
ncbi_mappings_data = full_config["ncbi_mappings"]
relevant_config = {key: ncbi_mappings_data[key] for key in NCBIMappings.__annotations__}
ncbi_mappings = NCBIMappings(**relevant_config)

jsonl_to_tsv(input, output, ncbi_mappings=ncbi_mappings)


if __name__ == "__main__":
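
Putting the pieces together, `main()` now loads defaults.yaml, pulls out the `ncbi_mappings` section, builds an `NCBIMappings`, and hands it to `jsonl_to_tsv`. A usage sketch of that flow with placeholder file paths:

```python
import yaml

# Assumes NCBIMappings and jsonl_to_tsv from format_ncbi_metadata.py are in scope;
# the input/output paths are placeholders.
with open("ingest/config/defaults.yaml", encoding="utf-8") as file:
    full_config = yaml.safe_load(file)

ncbi_mappings_data = full_config["ncbi_mappings"]
ncbi_mappings = NCBIMappings(
    **{key: ncbi_mappings_data[key] for key in NCBIMappings.__annotations__}
)

jsonl_to_tsv("ncbi_metadata.ndjson", "ncbi_metadata.tsv", ncbi_mappings=ncbi_mappings)
```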
2 changes: 0 additions & 2 deletions kubernetes/loculus/values.yaml
@@ -1248,7 +1248,6 @@ defaultOrganisms:
header: "Collection Details"
- name: pangoLineage
initiallyVisible: true
type: pango_lineage
autocomplete: true
required: true
website:
@@ -1261,7 +1260,6 @@ defaultOrganisms:
defaultOrderBy: date
silo:
dateToSortBy: date
partitionBy: pangoLineage
preprocessing:
- version: 1
image: ghcr.io/loculus-project/preprocessing-dummy
