loculus-project · anna-parker · Oct 8, 2024 · Oct 18, 2024 · Oct 18, 2024 · Oct 18, 2024
diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py
@@ -23,6 +23,7 @@
     dataclass_to_xml,
     get_chromsome_accessions,
     get_ena_analysis_process,
+    reformat_authors_from_loculus_to_embl_style,
 )
 from ena_deposition.ena_types import default_project_type, default_sample_type
 
@@ -188,6 +189,12 @@ def setUp(self):
         }
         self.seq_key = {"accession": "test_accession", "version": "test_version"}
 
+    def test_format_authors(self):
+        # A Loculus authors string mixing tight, spaced and trailing separators
+        # must come out as a single EMBL-style author line.
+        loculus_authors = "Xi,L.;Smith, Anna Maria; Perez Gonzalez, Anthony J.;"
+        expected = "Xi L., Smith A.M., Perez Gonzalez A.J.;"
+        self.assertEqual(
+            reformat_authors_from_loculus_to_embl_style(loculus_authors), expected
+        )
+
     def test_create_chromosome_list_multi_segment(self):
         chromosome_list = create_chromosome_list_object(
             self.unaligned_sequences_multi, self.seq_key

diff --git a/ena-submission/src/ena_deposition/config.py b/ena-submission/src/ena_deposition/config.py
@@ -50,7 +50,7 @@ def secure_ena_connection(config: Config):
         config.test = True
         logging.info("Submitting to ENA dev environment")
         config.ena_submission_url = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit"
-        config.github_url = "https://raw.githubusercontent.com/pathoplexus/ena-submission/main/test/approved_ena_submission_list.json"
+        config.github_url = "https://raw.githubusercontent.com/pathoplexus/ena-submission/loculus_test/test/approved_ena_submission_list.json"
         config.ena_reports_service_url = "https://wwwdev.ebi.ac.uk/ena/submit/report"
 
     if submit_to_ena_prod:

diff --git a/ena-submission/src/ena_deposition/create_assembly.py b/ena-submission/src/ena_deposition/create_assembly.py
@@ -17,6 +17,7 @@
     create_manifest,
     get_ena_analysis_process,
     get_ena_config,
+    reformat_authors_from_loculus_to_embl_style,
 )
 from .ena_types import (
     AssemblyChromosomeListFile,
@@ -129,6 +130,12 @@ def create_manifest_object(
     authors = (
         metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
     )
+    try:
+        authors = reformat_authors_from_loculus_to_embl_style(authors)
+    except ValueError as err:
+        msg = f"Was unable to format authors: {authors} as ENA expects"
+        logger.error(msg)
+        raise ValueError(msg) from err
     collection_date = metadata.get("sampleCollectionDate", "Unknown")
     country = metadata.get("geoLocCountry", "Unknown")
     admin1 = metadata.get("geoLocAdmin1", "")

diff --git a/ena-submission/src/ena_deposition/ena_submission_helper.py b/ena-submission/src/ena_deposition/ena_submission_helper.py
@@ -127,6 +127,28 @@ def get_project_xml(project_set):
     }
 
 
+def reformat_authors_from_loculus_to_embl_style(authors: str) -> str:
+    """Reformat a Loculus authors string into the author format expected by ENA.
+
+    Loculus format: `Doe, John A.; Roe, Jane B. C.`
+    EMBL expected: `Doe J.A., Roe J.B.C.;`
+
+    EMBL spec: "The names are listed surname first followed by a blank
+      followed by initial(s) with stops.
+      Occasionally the initials may not be known,
+      in which case the surname alone will be listed.
+      The author names are separated by commas
+      and terminated by a semicolon; they are not split between lines."
+    See section "3.4.10.6: The RA Line" here: https://raw.githubusercontent.com/enasequence/read_docs/c4bd306c82710844128cdf43003a0167837dc442/submit/fileprep/flatfile_user_manual.txt
+
+    Args:
+        authors: Semicolon-separated authors, each written as `Surname, Given Names`.
+            An entry without a comma (or with nothing after the comma) is treated
+            as a surname whose initials are not known.
+
+    Returns:
+        The authors joined by ", " and terminated by ";". An input that contains
+        no authors yields ";".
+
+    Raises:
+        ValueError: If an entry has no surname before its comma (e.g. `, John`).
+    """
+    ena_authors = []
+    for raw_author in authors.split(";"):
+        author = raw_author.strip()
+        if not author:
+            # Skip empty and whitespace-only entries, e.g. after a trailing "; ".
+            continue
+        # Only the first two comma-separated fields are used; extras are ignored.
+        parts = author.split(",")
+        last_name = parts[0].strip()
+        if not last_name:
+            msg = f"Author entry {author!r} has no surname"
+            raise ValueError(msg)
+        given_names = parts[1].split() if len(parts) > 1 else []
+        initials = "".join(f"{name[0]}." for name in given_names)
+        # Per the EMBL spec the surname stands alone when no initials are known,
+        # so avoid emitting a dangling blank after it.
+        ena_authors.append(f"{last_name} {initials}" if initials else last_name)
+    return ", ".join(ena_authors) + ";"
+
+
 def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationResult:
     """
     The project creation request should be equivalent to 

diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -21,24 +21,13 @@ with open("results/config.yaml", "w") as f:
 
 TAXON_ID = config["taxon_id"]
 SEGMENTED = config["segmented"]
-COLUMN_MAPPING = config["column_mapping"]
 LOG_LEVEL = config.get("log_level", "INFO")
 NCBI_API_KEY = os.getenv("NCBI_API_KEY")
 FILTER_FASTA_HEADERS = config.get("filter_fasta_headers", None)
 APPROVE_TIMEOUT_MIN = config.get("approve_timeout_min")  # time in minutes
 CHECK_ENA_DEPOSITION = config.get("check_ena_deposition", False)
 
 
-def rename_columns(input_file, output_file, mapping=COLUMN_MAPPING):
-    with open(input_file, "r") as f:
-        header = f.readline().strip().split("\t")
-        header = [mapping.get(h, h) for h in header]
-        with open(output_file, "w") as g:
-            g.write("\t".join(header) + "\n")
-            for line in f:
-                g.write(line)
-
-
 rule all:
     params:
         config=lambda wildcards: str(config),
@@ -72,30 +61,21 @@ rule fetch_ncbi_dataset_package:
 
 rule format_ncbi_dataset_report:
+    # Unzip the NCBI dataset package into results/ and run
+    # scripts/format_ncbi_metadata.py on the extracted data_report.jsonl,
+    # writing the TSV to results/metadata_post_rename.tsv.
+    # The script is listed as an input so the rule re-runs when it changes.
+    # NOTE(review): presumably the script also applies the column renaming
+    # configured in results/config.yaml - confirm against the script.
     input:
+        script="scripts/format_ncbi_metadata.py",
         dataset_package="results/ncbi_dataset.zip",
+        config="results/config.yaml",
     output:
-        ncbi_dataset_tsv="results/metadata_post_extract.tsv",
+        ncbi_dataset_tsv="results/metadata_post_rename.tsv",
     shell:
         """
-        dataformat tsv virus-genome \
-            --package {input.dataset_package} \
-            > {output.ncbi_dataset_tsv}
+        unzip -o {input.dataset_package} -d results
+        python {input.script} \
+            --config-file {input.config} \
+            --input results/ncbi_dataset/data/data_report.jsonl \
+            --output {output.ncbi_dataset_tsv}
         """
 
 
-rule rename_columns:
-    input:
-        ncbi_dataset_tsv="results/metadata_post_extract.tsv",
-    output:
-        ncbi_dataset_tsv="results/metadata_post_rename.tsv",
-    params:
-        mapping=COLUMN_MAPPING,
-    run:
-        rename_columns(
-            input.ncbi_dataset_tsv, output.ncbi_dataset_tsv, mapping=params.mapping
-        )
-
-
 if CHECK_ENA_DEPOSITION:
 
     rule get_loculus_depositions: