Create a Python package for ena-deposition

More clean up Move checks from snakefile to config fix config update deployment update tests ci Add trigger from db option Fix cronjob Fix link to config-file fix deployment install package in dockerfile install at correct location Remove snakemake as no longer needed Add missing dependency try to debug Create an XmlNone dataclass - this is required since package update test threads stop revert exception test test upload to ena dev still works on preview Make sure test is set correctly!!! remove debug print statements Improve logs Fix merge errors Update ena-submission/README.md Co-authored-by: Cornelius Roemer <[email protected]> Apply suggestions from code review Co-authored-by: Cornelius Roemer <[email protected]> Cronjob: create results directory before writing to it format authors in prepro Fix ingest try to fix pattern simplify regex fix check Add tests # Conflicts: # preprocessing/nextclade/tests/test.py Add to ena submission fix fix other edge case Update ena-submission/scripts/ena_submission_helper.py Co-authored-by: Cornelius Roemer <[email protected]> Update ena-submission/scripts/ena_submission_helper.py Update ena-submission/scripts/ena_submission_helper.py Co-authored-by: Cornelius Roemer <[email protected]> Update ena-submission/scripts/ena_submission_helper.py Update ingest/scripts/prepare_metadata.py Update preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py rename Update reformat_authors_from_genbank_to_loculus Additionally format authors with correct white space Improve error message add tests fix missing pattern improve error logs fix error Update preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py improve logging more feat(ingest): Do not use processed tsv but raw jsonl when ingesting data from NCBI Virus (#2990) * Use raw jsonl instead of generated tsv when ingesting data from NCBI virus * Do not require authors list to end in ';', capitalize names correctly. 
* Add tests for capitalization * Add a warning if author list might be in wrong format * Add ascii specific warning * Add tests for warnings and errors * Only capitalize if full authors string is upper case * Properly capitalize initial * Move titlecase option to ingest only - add ingest tests Move author formatting functions to format_ncbi_metadata as this is a more logical location Remove duplicate group name # Conflicts: # ena-submission/scripts/get_ena_submission_list.py # ena-submission/src/ena_deposition/config.py
loculus-project · Oct 18, 2024 · f480d67 · f480d67
1 parent b022f4d
commit f480d67
Show file tree

Hide file tree

Showing 17 changed files with 627 additions and 102 deletions.
diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py
@@ -23,6 +23,7 @@
     dataclass_to_xml,
     get_chromsome_accessions,
     get_ena_analysis_process,
+    reformat_authors_from_loculus_to_embl_style,
 )
 from ena_deposition.ena_types import default_project_type, default_sample_type
 
@@ -188,6 +189,12 @@ def setUp(self):
         }
         self.seq_key = {"accession": "test_accession", "version": "test_version"}
 
+    def test_format_authors(self):
+        authors = "Xi,L.;Smith, Anna Maria; Perez Gonzalez, Anthony J.;"
+        result = reformat_authors_from_loculus_to_embl_style(authors)
+        desired_result = "Xi L., Smith A.M., Perez Gonzalez A.J.;"
+        self.assertEqual(result, desired_result)
+
     def test_create_chromosome_list_multi_segment(self):
         chromosome_list = create_chromosome_list_object(
             self.unaligned_sequences_multi, self.seq_key

diff --git a/ena-submission/src/ena_deposition/config.py b/ena-submission/src/ena_deposition/config.py
@@ -50,7 +50,7 @@ def secure_ena_connection(config: Config):
         config.test = True
         logging.info("Submitting to ENA dev environment")
         config.ena_submission_url = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit"
-        config.github_url = "https://raw.githubusercontent.com/pathoplexus/ena-submission/main/test/approved_ena_submission_list.json"
+        config.github_url = "https://raw.githubusercontent.com/pathoplexus/ena-submission/loculus_test/test/approved_ena_submission_list.json"
         config.ena_reports_service_url = "https://wwwdev.ebi.ac.uk/ena/submit/report"
 
     if submit_to_ena_prod:

diff --git a/ena-submission/src/ena_deposition/create_assembly.py b/ena-submission/src/ena_deposition/create_assembly.py
@@ -17,6 +17,7 @@
     create_manifest,
     get_ena_analysis_process,
     get_ena_config,
+    reformat_authors_from_loculus_to_embl_style,
 )
 from .ena_types import (
     AssemblyChromosomeListFile,
@@ -129,6 +130,12 @@ def create_manifest_object(
     authors = (
         metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
     )
+    try:
+        authors = reformat_authors_from_loculus_to_embl_style(authors)
+    except ValueError as err:
+        msg = f"Was unable to format authors: {authors} as ENA expects"
+        logger.error(msg)
+        raise ValueError(msg) from err
     collection_date = metadata.get("sampleCollectionDate", "Unknown")
     country = metadata.get("geoLocCountry", "Unknown")
     admin1 = metadata.get("geoLocAdmin1", "")

diff --git a/ena-submission/src/ena_deposition/ena_submission_helper.py b/ena-submission/src/ena_deposition/ena_submission_helper.py
@@ -127,6 +127,28 @@ def get_project_xml(project_set):
     }
 
 
def reformat_authors_from_loculus_to_embl_style(authors: str) -> str:
    """Reformat a Loculus authors string into the EMBL/ENA author-list style.

    Loculus format: `Doe, John A.; Roe, Jane B. C.`
    EMBL expected: `Doe J.A., Roe J.B.C.;`

    EMBL spec: "The names are listed surname first followed by a blank
      followed by initial(s) with stops.
      Occasionally the initials may not be known,
      in which case the surname alone will be listed.
      The author names are separated by commas
      and terminated by a semicolon; they are not split between lines."
    See section "3.4.10.6: The RA Line" here: https://raw.githubusercontent.com/enasequence/read_docs/c4bd306c82710844128cdf43003a0167837dc442/submit/fileprep/flatfile_user_manual.txt

    Args:
        authors: Semicolon-separated author entries, each written as
            `Surname, Given Names`. Empty or whitespace-only entries (for
            example the one produced by a trailing `;`) are ignored. An entry
            without a comma is treated as a surname whose initials are unknown.

    Returns:
        The authors joined by `, ` and terminated by `;`. An input with no
        usable entries yields just `;`.

    Raises:
        ValueError: If an entry has no surname before its first comma.
    """
    ena_authors = []
    for raw_author in authors.split(";"):
        author = raw_author.strip()
        if not author:
            # Skip empty segments, e.g. after a trailing "; ".
            continue
        parts = author.split(",")
        last_name = parts[0].strip()
        if not last_name:
            msg = f"Author entry has no surname: {author!r}"
            raise ValueError(msg)
        # Only the text between the first and second comma is read as given
        # names; a missing comma means the initials are unknown.
        given_names = parts[1] if len(parts) > 1 else ""
        # split() without arguments splits on any whitespace run, so tabs or
        # repeated blanks never become an "initial".
        initials = "".join(f"{name[0]}." for name in given_names.split())
        # The EMBL spec lists the surname alone when initials are unknown, so
        # avoid emitting a dangling blank after the surname.
        ena_authors.append(f"{last_name} {initials}" if initials else last_name)
    return ", ".join(ena_authors) + ";"
+
+
 def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationResult:
     """
     The project creation request should be equivalent to 

diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -21,24 +21,13 @@ with open("results/config.yaml", "w") as f:
 
 TAXON_ID = config["taxon_id"]
 SEGMENTED = config["segmented"]
-COLUMN_MAPPING = config["column_mapping"]
 LOG_LEVEL = config.get("log_level", "INFO")
 NCBI_API_KEY = os.getenv("NCBI_API_KEY")
 FILTER_FASTA_HEADERS = config.get("filter_fasta_headers", None)
 APPROVE_TIMEOUT_MIN = config.get("approve_timeout_min")  # time in minutes
 CHECK_ENA_DEPOSITION = config.get("check_ena_deposition", False)
 
 
-def rename_columns(input_file, output_file, mapping=COLUMN_MAPPING):
-    with open(input_file, "r") as f:
-        header = f.readline().strip().split("\t")
-        header = [mapping.get(h, h) for h in header]
-        with open(output_file, "w") as g:
-            g.write("\t".join(header) + "\n")
-            for line in f:
-                g.write(line)
-
-
 rule all:
     params:
         config=lambda wildcards: str(config),
@@ -72,30 +61,21 @@ rule fetch_ncbi_dataset_package:
 
 rule format_ncbi_dataset_report:
     input:
+        script="scripts/format_ncbi_metadata.py",
         dataset_package="results/ncbi_dataset.zip",
+        config="results/config.yaml",
     output:
-        ncbi_dataset_tsv="results/metadata_post_extract.tsv",
+        ncbi_dataset_tsv="results/metadata_post_rename.tsv",
     shell:
         """
-        dataformat tsv virus-genome \
-            --package {input.dataset_package} \
-            > {output.ncbi_dataset_tsv}
+        unzip -o {input.dataset_package} -d results
+        python {input.script} \
+            --config-file {input.config} \
+            --input results/ncbi_dataset/data/data_report.jsonl \
+            --output {output.ncbi_dataset_tsv}
         """
 
 
-rule rename_columns:
-    input:
-        ncbi_dataset_tsv="results/metadata_post_extract.tsv",
-    output:
-        ncbi_dataset_tsv="results/metadata_post_rename.tsv",
-    params:
-        mapping=COLUMN_MAPPING,
-    run:
-        rename_columns(
-            input.ncbi_dataset_tsv, output.ncbi_dataset_tsv, mapping=params.mapping
-        )
-
-
 if CHECK_ENA_DEPOSITION:
 
     rule get_loculus_depositions:

diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
@@ -24,52 +24,44 @@ keep:
   - sequence_md5
   - genbankAccession
   - jointAccession
-column_mapping:
-  Accession: genbankAccession
-  BioProjects: bioprojects
-  BioSample accession: biosampleAccession
-  Gene count: ncbi_gene_count
-  Geographic Location: ncbiGeoLocation
-  Geographic Region: ncbiGeoRegion
-  Host Common Name: ncbiHostCommonName
-  Host Infraspecific Names Breed: ncbiHostBreed
-  Host Infraspecific Names Cultivar: ncbiHostCultivar
-  Host Infraspecific Names Ecotype: ncbiHostEcotype
-  Host Infraspecific Names Isolate: ncbiHostIsolate
-  Host Infraspecific Names Sex: ncbiHostSex
-  Host Infraspecific Names Strain: ncbiHostStrain
-  Host Name: ncbiHostName
-  Host Pangolin Classification: ncbiHostPangolin
-  Host Taxonomic ID: ncbiHostTaxId
-  Is Annotated: ncbiIsAnnotated
-  Is Complete: ncbiIsComplete
-  Is Lab Host: ncbiIsLabHost
-  Is Vaccine Strain: ncbiIsVaccineStrain
-  Isolate Collection date: ncbiCollectionDate
-  Isolate Lineage: ncbiIsolateName
-  Isolate Lineage source: ncbiIsolateSource
-  Lab Host: ncbiLabHost
-  Mature peptide count: ncbiMaturePeptideCount
-  Molecule type: ncbiMolType
-  Protein count: ncbiProteinCount
-  Purpose of Sampling: ncbiPurposeOfSampling
-  Release date: ncbiReleaseDate
-  Source database: ncbiSourceDb
-  SRA Accessions: ncbiSraAccessions
-  Submitter Affiliation: ncbiSubmitterAffiliation
-  Submitter Country: ncbiSubmitterCountry
-  Submitter Names: ncbiSubmitterNames
-  Update date: ncbiUpdateDate
-  Virus Common Name: ncbiVirusCommonName
-  Virus Infraspecific Names Breed: ncbiVirusBreed
-  Virus Infraspecific Names Cultivar: ncbiVirusCultivar
-  Virus Infraspecific Names Ecotype: ncbiVirusEcotype
-  Virus Infraspecific Names Isolate: ncbiVirusIsolate
-  Virus Infraspecific Names Sex: ncbi_virus
-  Virus Infraspecific Names Strain: ncbiVirusStrain
-  Virus Name: ncbiVirusName
-  Virus Pangolin Classification: ncbiVirusPangolin
-  Virus Taxonomic ID: ncbiVirusTaxId
+simple_mappings:
+  "ncbiReleaseDate": "releaseDate"
+  "ncbiIsAnnotated": "isAnnotated"
+  "ncbiIsLabHost": "isLabHost"
+  "ncbiProteinCount": "proteinCount"
+  "ncbiSourceDb": "sourceDatabase"
+  "ncbiIsComplete": "completeness"
+  "ncbiLabHost": "labHost"
+  "ncbiUpdateDate": "updateDate"
+  "genbankAccession": "accession"
+  "biosampleAccession": "biosample"
+  "ncbi_gene_count": "geneCount"
+  "bioprojects": "bioprojects"
+  "ncbiSraAccessions": "sraAccessions"
+location_mappings:
+  "ncbiGeoLocation": "geographicLocation"
+  "ncbiGeoRegion": "geographicRegion"
+submitter_mappings:
+  "ncbiSubmitterAffiliation": "affiliation"
+  "ncbiSubmitterNames": "names"
+  "ncbiSubmitterCountry": "country"
+isolate_mappings:
+  "ncbiIsolateName": "name"
+  "ncbiIsolateSource": "source"
+  "ncbiCollectionDate": "collectionDate"
+virus_mappings:
+  "ncbiVirusName": "organismName"
+  "ncbiVirusTaxId": "taxId"
+host_mappings:
+  "ncbiHostTaxId": "taxId"
+  "ncbiHostName": "organismName"
+parse_list:
+  - bioprojects
+  - ncbiSraAccessions
+unknown_mappings: # TODO: the source of these fields in the NCBI data report is not yet known
+  - ncbiHostCommonName
+  - ncbiPurposeOfSampling
+  - ncbiHostSex
 group_name: insdc_ingest_group  # Used only to set the group name, never read
 username: insdc_ingest_user
 password: insdc_ingest_user