loculus-project · anna-parker · Oct 10, 2024 · Oct 10, 2024 · Oct 10, 2024 · Oct 10, 2024
diff --git a/ena-submission/.gitignore b/ena-submission/.gitignore
@@ -4,4 +4,5 @@ assembly/
 project/
 sample/
 __pycache__
-config/config.yaml
+config/config.yaml
+webin-cli.jar
diff --git a/ena-submission/scripts/create_assembly.py b/ena-submission/scripts/create_assembly.py
@@ -16,6 +16,7 @@
     create_manifest,
     get_ena_analysis_process,
     get_ena_config,
+    reformat_authors_from_loculus_to_embl_style,
 )
 from ena_types import (
     AssemblyChromosomeListFile,
@@ -140,9 +141,11 @@ def create_manifest_object(
         try:
             group_info = get_group_info(config, project_table_entry["group_id"])[0]["group"]
             address = group_info["address"]
-            address_string = (f'{address.get("line1", "")}, {address.get("line2", "")}, '
+            address_string = (
+                f'{address.get("line1", "")}, {address.get("line2", "")}, '
                 f'{address.get("city", "")}, {address.get("state", "")}, '
-                f'{address.get("postalCode", "")}, {address.get("country")}')
+                f'{address.get("postalCode", "")}, {address.get("country")}'
+            )
         except Exception as e:
             logger.error(f"Was unable to create address, setting address to center_name due to {e}")
 
@@ -154,6 +157,12 @@ def create_manifest_object(
     authors = (
         metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
     )
+    try:
+        authors = reformat_authors_from_loculus_to_embl_style(authors)
+    except ValueError as err:
+        msg = f"Was unable to format authors: {authors} as ENA expects"
+        logger.error(msg)
+        raise ValueError(msg) from err
     collection_date = metadata.get("sampleCollectionDate", "Unknown")
     country = metadata.get("geoLocCountry", "Unknown")
     admin1 = metadata.get("geoLocAdmin1", "")

diff --git a/ena-submission/scripts/ena_submission_helper.py b/ena-submission/scripts/ena_submission_helper.py
@@ -130,6 +130,28 @@ def get_project_xml(project_set):
     }
 
 
+def reformat_authors_from_loculus_to_embl_style(authors: str) -> str:
+    """Reformat a Loculus authors string into the author format expected by ENA.
+
+    Loculus format: `Doe, John A.; Roe, Jane B. C.`
+    EMBL expected: `Doe J.A., Roe J.B.C.;`
+    EMBL spec: surname first, then a blank and the initial(s) with stops (surname
+    alone if the initials are not known); names are separated by commas and the
+    list is terminated by a semicolon. An entry without a comma is kept as a bare
+    surname, blank entries are skipped; ValueError is raised for an empty surname.
+    See section "3.4.10.6: The RA Line" here: https://raw.githubusercontent.com/enasequence/read_docs/c4bd306c82710844128cdf43003a0167837dc442/submit/fileprep/flatfile_user_manual.txt"""
+    ena_authors = []
+    for author in filter(None, (entry.strip() for entry in authors.split(";"))):
+        # Pad with "" so that a surname-only entry does not raise IndexError.
+        last_name, first_names = (author.split(",") + [""])[:2]
+        if not last_name.strip():
+            raise ValueError(f"Author entry '{author}' has no surname")
+        initials = "".join(f"{name[0]}." for name in first_names.split())
+        # rstrip drops the separating blank when there are no initials.
+        ena_authors.append(f"{last_name.strip()} {initials}".rstrip())
+    return ", ".join(ena_authors) + ";"
+
+
 def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationResult:
     """
     The project creation request should be equivalent to 

diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py
@@ -23,6 +23,7 @@
     dataclass_to_xml,
     get_chromsome_accessions,
     get_ena_analysis_process,
+    reformat_authors_from_loculus_to_embl_style,
 )
 from ena_types import default_project_type, default_sample_type
 
@@ -184,6 +185,12 @@ def setUp(self):
         }
         self.seq_key = {"accession": "test_accession", "version": "test_version"}
 
+    def test_format_authors(self):
+        """Loculus `Last, First;` authors are rewritten as EMBL `Last F.,` entries."""
+        loculus_authors = "Xi,L.;Smith, Anna Maria; Perez Gonzalez, Anthony J.;"
+        expected = "Xi L., Smith A.M., Perez Gonzalez A.J.;"
+        self.assertEqual(reformat_authors_from_loculus_to_embl_style(loculus_authors), expected)
+
     def test_create_chromosome_list_multi_segment(self):
         chromosome_list = create_chromosome_list_object(
             self.unaligned_sequences_multi, self.seq_key

diff --git a/ingest/.gitignore b/ingest/.gitignore
@@ -3,4 +3,5 @@ data/
 results/
 .DS_Store
 .ruff_cache
-config/config.yaml
+config/config.yaml
+*.ipynb
diff --git a/ingest/Dockerfile b/ingest/Dockerfile
@@ -1,5 +1,10 @@
 FROM mambaorg/micromamba:1.5.8
 
+USER root
+RUN apt-get update && apt-get install -y curl
+RUN mkdir -p /package && chown -R $MAMBA_USER:$MAMBA_USER /package
+USER $MAMBA_USER
+
 COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/env.yaml
 COPY --chown=$MAMBA_USER:$MAMBA_USER .mambarc /tmp/.mambarc
 

diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -21,22 +21,12 @@ with open("results/config.yaml", "w") as f:
 
 TAXON_ID = config["taxon_id"]
 SEGMENTED = config["segmented"]
-COLUMN_MAPPING = config["column_mapping"]
 LOG_LEVEL = config.get("log_level", "INFO")
 NCBI_API_KEY = os.getenv("NCBI_API_KEY")
 FILTER_FASTA_HEADERS = config.get("filter_fasta_headers", None)
 APPROVE_TIMEOUT_MIN = config.get("approve_timeout_min")  # time in minutes
 CHECK_ENA_DEPOSITION = config.get("check_ena_deposition", False)
-
-
-def rename_columns(input_file, output_file, mapping=COLUMN_MAPPING):
-    with open(input_file, "r") as f:
-        header = f.readline().strip().split("\t")
-        header = [mapping.get(h, h) for h in header]
-        with open(output_file, "w") as g:
-            g.write("\t".join(header) + "\n")
-            for line in f:
-                g.write(line)
+GEOLOC_RULES_URL = config.get("geolocation_rules_url", None)
 
 
 rule all:
@@ -55,46 +45,67 @@ rule clean:
 
 
 rule fetch_ncbi_dataset_package:
-    # TODO: #1844 Set API key through secret
     output:
-        dataset_package="results/ncbi_dataset.zip",
+        dataset_package_zip="results/ncbi_dataset.zip",
+        dataset_package_dir=directory("results/ncbi_dataset"),
     params:
         taxon_id=TAXON_ID,
         api_key=NCBI_API_KEY,
     shell:
         """
         datasets download virus genome taxon {params.taxon_id} \
             --no-progressbar \
-            --filename {output.dataset_package} \
-            --api-key {params.api_key} \
+            --filename {output.dataset_package_zip} \
+            --api-key {params.api_key}
+        unzip -o {output.dataset_package_zip} -d results/
         """
 
 
 rule format_ncbi_dataset_report:
     input:
-        dataset_package="results/ncbi_dataset.zip",
+        script="scripts/rename_fields.py",
+        dataset_package=directory("results/ncbi_dataset"),
+        config="results/config.yaml",
     output:
-        ncbi_dataset_tsv="results/metadata_post_extract.tsv",
+        ncbi_dataset_tsv="results/metadata_post_rename.tsv",
+    params:
+        pre_ncbi_dataset_tsv="results/ncbi_dataset/data/data_report.jsonl",
     shell:
         """
-        dataformat tsv virus-genome \
-            --package {input.dataset_package} \
-            > {output.ncbi_dataset_tsv}
+        python {input.script} \
+            --config-file {input.config} \
+            --input {params.pre_ncbi_dataset_tsv} \
+            --output {output.ncbi_dataset_tsv}
         """
 
-
-rule rename_columns:
-    input:
-        ncbi_dataset_tsv="results/metadata_post_extract.tsv",
+rule fetch_general_geolocation_rules:  # Download the geolocation rules TSV from GEOLOC_RULES_URL
     output:
-        ncbi_dataset_tsv="results/metadata_post_rename.tsv",
+        geolocation_rules="config/geolocation-rules.tsv",
     params:
-        mapping=COLUMN_MAPPING,
-    run:
-        rename_columns(
-            input.ncbi_dataset_tsv, output.ncbi_dataset_tsv, mapping=params.mapping
-        )
+        geolocation_rules_url=GEOLOC_RULES_URL,
+    shell:  # --fail: an HTTP error fails the rule instead of saving the error page as the rules file
+        """
+        curl --fail "{params.geolocation_rules_url}" > {output.geolocation_rules}
+        """
 
+rule curate_geoloc_metadata:  # Runs two `augur curate` steps over the renamed metadata TSV
+    input:
+        metadata="results/metadata_post_rename.tsv",  # written by format_ncbi_dataset_report
+        geolocation_rules="config/geolocation-rules.tsv",  # written by fetch_general_geolocation_rules
+    output:
+        pre_curated_metadata="results/pre_curated_metadata.tsv",  # intermediate: output of parse-genbank-location
+        curated_metadata="results/curated_metadata.tsv",  # final: output of apply-geolocation-rules
+    shell:  # step 1 parses the ncbiGeoLocation field; step 2 applies the rules file to step 1's output
+        """
+        augur curate parse-genbank-location --metadata {input.metadata} \
+            --location-field=ncbiGeoLocation --id-column "genbankAccession" \
+            --output-metadata {output.pre_curated_metadata}
+        augur curate apply-geolocation-rules \
+            --metadata {output.pre_curated_metadata} \
+            --output-metadata {output.curated_metadata}  \
+            --geolocation-rules={input.geolocation_rules} \
+            --region-field='ncbiGeoRegion' --id-column "genbankAccession"
+        """
 
 if CHECK_ENA_DEPOSITION:
 
@@ -118,7 +129,7 @@ if CHECK_ENA_DEPOSITION:
 
     rule filter_out_loculus_depositions:
         input:
-            ncbi_dataset_tsv="results/metadata_post_rename.tsv",
+            ncbi_dataset_tsv="results/curated_metadata.tsv",
             exclude_biosample_accessions="results/biosample_accessions_to_exclude.tsv",
             exclude_insdc_accessions="results/insdc_accessions_to_exclude.tsv",
             script="scripts/filter_out_depositions.py",
@@ -257,7 +268,7 @@ rule prepare_metadata:
         metadata=(
             "results/filtered_metadata.tsv"
             if CHECK_ENA_DEPOSITION
-            else "results/metadata_post_rename.tsv"
+            else "results/curated_metadata.tsv"
         ),
         segments="results/nextclade_merged.tsv" if SEGMENTED else "results/config.yaml",  # else is just a dummy
         sequence_hashes="results/sequence_hashes.ndjson",

diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
@@ -8,6 +8,7 @@ fasta_id_field: genbankAccession
 keep:
   - division
   - country
+  - location
   - submissionId
   - insdcAccessionBase
   - insdcVersion
@@ -24,52 +25,45 @@ keep:
   - sequence_md5
   - genbankAccession
   - jointAccession
-column_mapping:
-  Accession: genbankAccession
-  BioProjects: bioprojects
-  BioSample accession: biosampleAccession
-  Gene count: ncbi_gene_count
-  Geographic Location: ncbiGeoLocation
-  Geographic Region: ncbiGeoRegion
-  Host Common Name: ncbiHostCommonName
-  Host Infraspecific Names Breed: ncbiHostBreed
-  Host Infraspecific Names Cultivar: ncbiHostCultivar
-  Host Infraspecific Names Ecotype: ncbiHostEcotype
-  Host Infraspecific Names Isolate: ncbiHostIsolate
-  Host Infraspecific Names Sex: ncbiHostSex
-  Host Infraspecific Names Strain: ncbiHostStrain
-  Host Name: ncbiHostName
-  Host Pangolin Classification: ncbiHostPangolin
-  Host Taxonomic ID: ncbiHostTaxId
-  Is Annotated: ncbiIsAnnotated
-  Is Complete: ncbiIsComplete
-  Is Lab Host: ncbiIsLabHost
-  Is Vaccine Strain: ncbiIsVaccineStrain
-  Isolate Collection date: ncbiCollectionDate
-  Isolate Lineage: ncbiIsolateName
-  Isolate Lineage source: ncbiIsolateSource
-  Lab Host: ncbiLabHost
-  Mature peptide count: ncbiMaturePeptideCount
-  Molecule type: ncbiMolType
-  Protein count: ncbiProteinCount
-  Purpose of Sampling: ncbiPurposeOfSampling
-  Release date: ncbiReleaseDate
-  Source database: ncbiSourceDb
-  SRA Accessions: ncbiSraAccessions
-  Submitter Affiliation: ncbiSubmitterAffiliation
-  Submitter Country: ncbiSubmitterCountry
-  Submitter Names: ncbiSubmitterNames
-  Update date: ncbiUpdateDate
-  Virus Common Name: ncbiVirusCommonName
-  Virus Infraspecific Names Breed: ncbiVirusBreed
-  Virus Infraspecific Names Cultivar: ncbiVirusCultivar
-  Virus Infraspecific Names Ecotype: ncbiVirusEcotype
-  Virus Infraspecific Names Isolate: ncbiVirusIsolate
-  Virus Infraspecific Names Sex: ncbi_virus
-  Virus Infraspecific Names Strain: ncbiVirusStrain
-  Virus Name: ncbiVirusName
-  Virus Pangolin Classification: ncbiVirusPangolin
-  Virus Taxonomic ID: ncbiVirusTaxId
+simple_mappings:
+  "ncbiReleaseDate": "releaseDate"
+  "ncbiIsAnnotated": "isAnnotated"
+  "ncbiIsLabHost": "isLabHost"
+  "ncbiProteinCount": "proteinCount"
+  "ncbiSourceDb": "sourceDatabase"
+  "ncbiIsComplete": "completeness"
+  "ncbiLabHost": "labHost"
+  "ncbiUpdateDate": "updateDate"
+  "genbankAccession": "accession"
+  "biosampleAccession": "biosample"
+  "ncbi_gene_count": "geneCount"
+  "bioprojects": "bioprojects"
+  "ncbiSraAccessions": "sraAccessions"
+location_mappings:
+  "ncbiGeoLocation": "geographicLocation"
+  "ncbiGeoRegion": "geographicRegion"
+submitter_mappings:
+  "ncbiSubmitterAffiliation": "affiliation"
+  "ncbiSubmitterNames": "names"
+  "ncbiSubmitterCountry": "country"
+isolate_mappings:
+  "ncbiIsolateName": "name"
+  "ncbiIsolateSource": "source"
+  "ncbiCollectionDate": "collectionDate"
+virus_mappings:
+  "ncbiVirusName": "organismName"
+  "ncbiVirusTaxId": "taxId"
+host_mappings:
+  "ncbiHostTaxId": "taxId"
+  "ncbiHostName": "organismName"
+parse_list:
+  - bioprojects
+  - ncbiSraAccessions
+unknown_mappings: # I don't know yet where these fields come from
+  - ncbiHostCommonName
+  - ncbiPurposeOfSampling
+  - ncbiHostSex
+# Loculus group and account settings used by ingest
 group_name: insdc_ingest_group  # Used only to set the group name, never read
 username: insdc_ingest_user
 password: insdc_ingest_user
@@ -79,3 +73,4 @@ approve_timeout_min: "25" # Cronjobs run every 30min, make approve stop before i
 db_username: postgres
 db_password: unsecure
 db_url: "jdbc:postgresql://127.0.0.1:5432/loculus"
+geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
diff --git a/ingest/environment.yml b/ingest/environment.yml
@@ -4,14 +4,15 @@ channels:
   - bioconda
 dependencies:
   # Core Python dependencies
-  - python =3.12
+  - python
   # Extra dependencies
   - biopython
   - click
   - ijson
   - jsonlines
   - ncbi-datasets-cli >=16.29.0
   - nextclade >=3.7.0
+  - augur >=26.0.0
   - orjsonl
   - pandas
   - psycopg2

diff --git a/ingest/scripts/call_loculus.py b/ingest/scripts/call_loculus.py
@@ -199,7 +199,7 @@ def submit_or_revise(
     url = f"{organism_url(config)}/{endpoint}"
 
     metadata_lines = len(Path(metadata).read_text(encoding="utf-8").splitlines()) - 1
-    logger.info(f"{logging_strings["gerund"]} {metadata_lines} sequence(s) to Loculus")
+    logger.info(f"{logging_strings['gerund']} {metadata_lines} sequence(s) to Loculus")
 
     params = {
         "groupId": group_id,
@@ -213,7 +213,7 @@ def submit_or_revise(
             "sequenceFile": sequences_file,
         }
         response = make_request(HTTPMethod.POST, url, config, params=params, files=files)
-    logger.debug(f"{logging_strings["noun"]} response: {response.json()}")
+    logger.debug(f"{logging_strings['noun']} response: {response.json()}")
 
     return response.json()