Skip to content

Commit

Permalink
Create a python package for ena-deposition
Browse files Browse the repository at this point in the history
More clean up

Move checks from snakefile to config

fix config

update deployment

update tests ci

Add trigger from db option

Fix cronjob

Fix link to config-file

fix deployment

install package in dockerfile

install at correct location

Remove snakemake as no longer needed

Add missing dependency

try to debug

Create an XmlNone dataclass - this is required since package update

test threads stop

revert exception test

test upload to ena dev still works on preview

Make sure test is set correctly!!!

remove debug print statements

Improve logs

Fix merge errors

Update ena-submission/README.md

Co-authored-by: Cornelius Roemer <[email protected]>

Apply suggestions from code review

Co-authored-by: Cornelius Roemer <[email protected]>

Cronjob: create results directory before writing to it

format authors in prepro

Fix ingest

try to fix pattern

simplify regex

fix check

Add tests

# Conflicts:
#	preprocessing/nextclade/tests/test.py

Add to ena submission

fix

fix other edge case

Update ena-submission/scripts/ena_submission_helper.py

Co-authored-by: Cornelius Roemer <[email protected]>

Update ena-submission/scripts/ena_submission_helper.py

Update ena-submission/scripts/ena_submission_helper.py

Co-authored-by: Cornelius Roemer <[email protected]>

Update ena-submission/scripts/ena_submission_helper.py

Update ingest/scripts/prepare_metadata.py

Update preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py

rename

Update reformat_authors_from_genbank_to_loculus

Additionally format authors with correct white space

Improve error message

add tests

fix missing pattern

improve error logs

fix error

Update preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py

improve logging more

feat(ingest): Do not use processed tsv but raw jsonl when ingesting data from NCBI Virus (#2990)

* Use raw jsonl instead of generated tsv when ingesting data from NCBI virus

* Do not require authors list to end in ';', capitalize names correctly.

* Add tests for capitalization

* Add a warning if author list might be in wrong format

* Add ascii specific warning

* Add tests for warnings and errors

* Only capitalize if full authors string is upper case

* Properly capitalize initial

* Move titlecase option to ingest only - add ingest tests

Move author formatting functions to format_ncbi_metadata as this is a more logical location

Remove duplicate group name

# Conflicts:
#	ena-submission/scripts/get_ena_submission_list.py
#	ena-submission/src/ena_deposition/config.py
  • Loading branch information
anna-parker authored and corneliusroemer committed Oct 18, 2024
1 parent b022f4d commit f480d67
Show file tree
Hide file tree
Showing 17 changed files with 627 additions and 102 deletions.
7 changes: 7 additions & 0 deletions ena-submission/scripts/test_ena_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
dataclass_to_xml,
get_chromsome_accessions,
get_ena_analysis_process,
reformat_authors_from_loculus_to_embl_style,
)
from ena_deposition.ena_types import default_project_type, default_sample_type

Expand Down Expand Up @@ -188,6 +189,12 @@ def setUp(self):
}
self.seq_key = {"accession": "test_accession", "version": "test_version"}

def test_format_authors(self):
    """Check that a Loculus author string is converted to the EMBL author format."""
    loculus_authors = "Xi,L.;Smith, Anna Maria; Perez Gonzalez, Anthony J.;"
    expected = "Xi L., Smith A.M., Perez Gonzalez A.J.;"
    actual = reformat_authors_from_loculus_to_embl_style(loculus_authors)
    self.assertEqual(actual, expected)

def test_create_chromosome_list_multi_segment(self):
chromosome_list = create_chromosome_list_object(
self.unaligned_sequences_multi, self.seq_key
Expand Down
2 changes: 1 addition & 1 deletion ena-submission/src/ena_deposition/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def secure_ena_connection(config: Config):
config.test = True
logging.info("Submitting to ENA dev environment")
config.ena_submission_url = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit"
config.github_url = "https://raw.githubusercontent.com/pathoplexus/ena-submission/main/test/approved_ena_submission_list.json"
config.github_url = "https://raw.githubusercontent.com/pathoplexus/ena-submission/loculus_test/test/approved_ena_submission_list.json"
config.ena_reports_service_url = "https://wwwdev.ebi.ac.uk/ena/submit/report"

if submit_to_ena_prod:
Expand Down
7 changes: 7 additions & 0 deletions ena-submission/src/ena_deposition/create_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
create_manifest,
get_ena_analysis_process,
get_ena_config,
reformat_authors_from_loculus_to_embl_style,
)
from .ena_types import (
AssemblyChromosomeListFile,
Expand Down Expand Up @@ -129,6 +130,12 @@ def create_manifest_object(
authors = (
metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
)
try:
authors = reformat_authors_from_loculus_to_embl_style(authors)
except ValueError as err:
msg = f"Was unable to format authors: {authors} as ENA expects"
logger.error(msg)
raise ValueError(msg) from err
collection_date = metadata.get("sampleCollectionDate", "Unknown")
country = metadata.get("geoLocCountry", "Unknown")
admin1 = metadata.get("geoLocAdmin1", "")
Expand Down
22 changes: 22 additions & 0 deletions ena-submission/src/ena_deposition/ena_submission_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,28 @@ def get_project_xml(project_set):
}


def reformat_authors_from_loculus_to_embl_style(authors: str) -> str:
    """Reformat a Loculus authors string to the format expected by ENA.

    Loculus format: `Doe, John A.; Roe, Jane B. C.`
    EMBL expected: `Doe J.A., Roe J.B.C.;`

    EMBL spec: "The names are listed surname first followed by a blank
    followed by initial(s) with stops.
    Occasionally the initials may not be known,
    in which case the surname alone will be listed.
    The author names are separated by commas
    and terminated by a semicolon; they are not split between lines."
    See section "3.4.10.6: The RA Line" here: https://raw.githubusercontent.com/enasequence/read_docs/c4bd306c82710844128cdf43003a0167837dc442/submit/fileprep/flatfile_user_manual.txt

    Args:
        authors: Semicolon-separated authors, each written as
            `Last name, First names`. A trailing semicolon is optional.

    Returns:
        The authors as `Surname I.N.` entries joined by ", " and terminated
        by ";". An entry without a comma (or without first names) is emitted
        as the surname alone. Empty and whitespace-only entries are skipped,
        so an empty input yields ";".
    """
    ena_authors = []
    for author in authors.split(";"):
        # Skip empty pieces, e.g. the one produced by a trailing "; ".
        if not author.strip():
            continue
        name_parts = author.split(",")
        last_name = name_parts[0].strip()
        # No comma means the initials are unknown: list the surname alone
        # instead of failing with an IndexError.
        first_names = name_parts[1] if len(name_parts) > 1 else ""
        # split() without arguments tolerates repeated or non-space whitespace.
        initials = "".join(f"{name[0]}." for name in first_names.split())
        if not last_name and not initials:
            continue
        # Avoid a dangling blank after the surname when there are no initials.
        ena_authors.append(f"{last_name} {initials}" if initials else last_name)
    return ", ".join(ena_authors) + ";"


def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationResult:
"""
The project creation request should be equivalent to
Expand Down
36 changes: 8 additions & 28 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,13 @@ with open("results/config.yaml", "w") as f:

TAXON_ID = config["taxon_id"]
SEGMENTED = config["segmented"]
COLUMN_MAPPING = config["column_mapping"]
LOG_LEVEL = config.get("log_level", "INFO")
NCBI_API_KEY = os.getenv("NCBI_API_KEY")
FILTER_FASTA_HEADERS = config.get("filter_fasta_headers", None)
APPROVE_TIMEOUT_MIN = config.get("approve_timeout_min") # time in minutes
CHECK_ENA_DEPOSITION = config.get("check_ena_deposition", False)


def rename_columns(input_file, output_file, mapping=COLUMN_MAPPING):
    """Copy a TSV file, renaming header columns according to `mapping`.

    The first line of `input_file` is stripped and split on tabs; each column
    name found in `mapping` is replaced by its mapped value, others are kept.
    All remaining lines are written to `output_file` unchanged.
    """
    with open(input_file, "r") as source:
        columns = source.readline().strip().split("\t")
        renamed = [mapping.get(column, column) for column in columns]
        with open(output_file, "w") as target:
            target.write("\t".join(renamed) + "\n")
            # The file iterator continues after the header line already read.
            target.writelines(source)


rule all:
params:
config=lambda wildcards: str(config),
Expand Down Expand Up @@ -72,30 +61,21 @@ rule fetch_ncbi_dataset_package:

rule format_ncbi_dataset_report:
input:
script="scripts/format_ncbi_metadata.py",
dataset_package="results/ncbi_dataset.zip",
config="results/config.yaml",
output:
ncbi_dataset_tsv="results/metadata_post_extract.tsv",
ncbi_dataset_tsv="results/metadata_post_rename.tsv",
shell:
"""
dataformat tsv virus-genome \
--package {input.dataset_package} \
> {output.ncbi_dataset_tsv}
unzip -o {input.dataset_package} -d results
python {input.script} \
--config-file {input.config} \
--input results/ncbi_dataset/data/data_report.jsonl \
--output {output.ncbi_dataset_tsv}
"""


rule rename_columns:
input:
ncbi_dataset_tsv="results/metadata_post_extract.tsv",
output:
ncbi_dataset_tsv="results/metadata_post_rename.tsv",
params:
mapping=COLUMN_MAPPING,
run:
rename_columns(
input.ncbi_dataset_tsv, output.ncbi_dataset_tsv, mapping=params.mapping
)


if CHECK_ENA_DEPOSITION:

rule get_loculus_depositions:
Expand Down
84 changes: 38 additions & 46 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,52 +24,44 @@ keep:
- sequence_md5
- genbankAccession
- jointAccession
column_mapping:
Accession: genbankAccession
BioProjects: bioprojects
BioSample accession: biosampleAccession
Gene count: ncbi_gene_count
Geographic Location: ncbiGeoLocation
Geographic Region: ncbiGeoRegion
Host Common Name: ncbiHostCommonName
Host Infraspecific Names Breed: ncbiHostBreed
Host Infraspecific Names Cultivar: ncbiHostCultivar
Host Infraspecific Names Ecotype: ncbiHostEcotype
Host Infraspecific Names Isolate: ncbiHostIsolate
Host Infraspecific Names Sex: ncbiHostSex
Host Infraspecific Names Strain: ncbiHostStrain
Host Name: ncbiHostName
Host Pangolin Classification: ncbiHostPangolin
Host Taxonomic ID: ncbiHostTaxId
Is Annotated: ncbiIsAnnotated
Is Complete: ncbiIsComplete
Is Lab Host: ncbiIsLabHost
Is Vaccine Strain: ncbiIsVaccineStrain
Isolate Collection date: ncbiCollectionDate
Isolate Lineage: ncbiIsolateName
Isolate Lineage source: ncbiIsolateSource
Lab Host: ncbiLabHost
Mature peptide count: ncbiMaturePeptideCount
Molecule type: ncbiMolType
Protein count: ncbiProteinCount
Purpose of Sampling: ncbiPurposeOfSampling
Release date: ncbiReleaseDate
Source database: ncbiSourceDb
SRA Accessions: ncbiSraAccessions
Submitter Affiliation: ncbiSubmitterAffiliation
Submitter Country: ncbiSubmitterCountry
Submitter Names: ncbiSubmitterNames
Update date: ncbiUpdateDate
Virus Common Name: ncbiVirusCommonName
Virus Infraspecific Names Breed: ncbiVirusBreed
Virus Infraspecific Names Cultivar: ncbiVirusCultivar
Virus Infraspecific Names Ecotype: ncbiVirusEcotype
Virus Infraspecific Names Isolate: ncbiVirusIsolate
Virus Infraspecific Names Sex: ncbi_virus
Virus Infraspecific Names Strain: ncbiVirusStrain
Virus Name: ncbiVirusName
Virus Pangolin Classification: ncbiVirusPangolin
Virus Taxonomic ID: ncbiVirusTaxId
simple_mappings:
"ncbiReleaseDate": "releaseDate"
"ncbiIsAnnotated": "isAnnotated"
"ncbiIsLabHost": "isLabHost"
"ncbiProteinCount": "proteinCount"
"ncbiSourceDb": "sourceDatabase"
"ncbiIsComplete": "completeness"
"ncbiLabHost": "labHost"
"ncbiUpdateDate": "updateDate"
"genbankAccession": "accession"
"biosampleAccession": "biosample"
"ncbi_gene_count": "geneCount"
"bioprojects": "bioprojects"
"ncbiSraAccessions": "sraAccessions"
location_mappings:
"ncbiGeoLocation": "geographicLocation"
"ncbiGeoRegion": "geographicRegion"
submitter_mappings:
"ncbiSubmitterAffiliation": "affiliation"
"ncbiSubmitterNames": "names"
"ncbiSubmitterCountry": "country"
isolate_mappings:
"ncbiIsolateName": "name"
"ncbiIsolateSource": "source"
"ncbiCollectionDate": "collectionDate"
virus_mappings:
"ncbiVirusName": "organismName"
"ncbiVirusTaxId": "taxId"
host_mappings:
"ncbiHostTaxId": "taxId"
"ncbiHostName": "organismName"
parse_list:
- bioprojects
- ncbiSraAccessions
unknown_mappings: # I don't know yet where these fields come from
- ncbiHostCommonName
- ncbiPurposeOfSampling
- ncbiHostSex
group_name: insdc_ingest_group # Used only to set the group name, never read
username: insdc_ingest_user
password: insdc_ingest_user
Expand Down
Loading

0 comments on commit f480d67

Please sign in to comment.