Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ingest): Use augur curate to curate country metadata #3015

Draft
wants to merge 36 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
b44dc19
format authors in prepro
anna-parker Oct 10, 2024
8c8912d
Fix ingest
anna-parker Oct 10, 2024
c6ba486
try to fix pattern
anna-parker Oct 10, 2024
4aba133
simplify regex
anna-parker Oct 10, 2024
77828b6
fix check
anna-parker Oct 10, 2024
77a5d02
Add tests
anna-parker Oct 10, 2024
410a645
Add to ena submission
anna-parker Oct 10, 2024
814dab2
fix
anna-parker Oct 10, 2024
255337d
fix other edge case
anna-parker Oct 10, 2024
3b00534
Update ena-submission/scripts/ena_submission_helper.py
anna-parker Oct 11, 2024
315baf3
Update ena-submission/scripts/ena_submission_helper.py
anna-parker Oct 11, 2024
e780275
Update ena-submission/scripts/ena_submission_helper.py
anna-parker Oct 11, 2024
fe9543f
Update ena-submission/scripts/ena_submission_helper.py
anna-parker Oct 11, 2024
9a13990
Update ingest/scripts/prepare_metadata.py
anna-parker Oct 11, 2024
c71e6a6
Update preprocessing/nextclade/src/loculus_preprocessing/processing_f…
anna-parker Oct 11, 2024
9886d4d
rename
anna-parker Oct 11, 2024
1af54f5
Update reformat_authors_from_genbank_to_loculus
anna-parker Oct 11, 2024
7eed322
Additionally format authors with correct white space
anna-parker Oct 11, 2024
b3ab701
Improve error message
anna-parker Oct 11, 2024
d94e60a
add tests
anna-parker Oct 11, 2024
4a27aaf
fix missing pattern
anna-parker Oct 11, 2024
356b70e
improve error logs
anna-parker Oct 11, 2024
497d09f
fix error
anna-parker Oct 11, 2024
5940cbc
Update preprocessing/nextclade/src/loculus_preprocessing/processing_f…
anna-parker Oct 11, 2024
a44e05a
improve logging more
anna-parker Oct 11, 2024
c5f9966
feat(ingest): Do not use processed tsv but raw jsonl when ingesting d…
anna-parker Oct 17, 2024
b019e96
Merge branch 'main' into format_authors
anna-parker Oct 17, 2024
f07a92f
Add geoloc metadata parser
anna-parker Oct 17, 2024
9433ab6
fix loculus call
anna-parker Oct 17, 2024
1a0718f
Fix tests
anna-parker Oct 17, 2024
42ec318
more updates
anna-parker Oct 17, 2024
da2e140
try again
anna-parker Oct 17, 2024
25bb835
add curl to dockerfile
anna-parker Oct 17, 2024
0c4ba63
second try
anna-parker Oct 17, 2024
7fc4c1b
fix quoting issues with escape characters
anna-parker Oct 17, 2024
fa938bc
Do not convert ints to floats
anna-parker Oct 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ena-submission/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ assembly/
project/
sample/
__pycache__
config/config.yaml
config/config.yaml
webin-cli.jar
13 changes: 11 additions & 2 deletions ena-submission/scripts/create_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
create_manifest,
get_ena_analysis_process,
get_ena_config,
reformat_authors_from_loculus_to_embl_style,
)
from ena_types import (
AssemblyChromosomeListFile,
Expand Down Expand Up @@ -140,9 +141,11 @@ def create_manifest_object(
try:
group_info = get_group_info(config, project_table_entry["group_id"])[0]["group"]
address = group_info["address"]
address_string = (f'{address.get("line1", "")}, {address.get("line2", "")}, '
address_string = (
f'{address.get("line1", "")}, {address.get("line2", "")}, '
f'{address.get("city", "")}, {address.get("state", "")}, '
f'{address.get("postalCode", "")}, {address.get("country")}')
f'{address.get("postalCode", "")}, {address.get("country")}'
)
except Exception as e:
logger.error(f"Was unable to create address, setting address to center_name due to {e}")

Expand All @@ -154,6 +157,12 @@ def create_manifest_object(
authors = (
metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
)
try:
authors = reformat_authors_from_loculus_to_embl_style(authors)
except ValueError as err:
msg = f"Was unable to format authors: {authors} as ENA expects"
logger.error(msg)
raise ValueError(msg) from err
collection_date = metadata.get("sampleCollectionDate", "Unknown")
country = metadata.get("geoLocCountry", "Unknown")
admin1 = metadata.get("geoLocAdmin1", "")
Expand Down
22 changes: 22 additions & 0 deletions ena-submission/scripts/ena_submission_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,28 @@ def get_project_xml(project_set):
}


def reformat_authors_from_loculus_to_embl_style(authors: str) -> str:
    """Reformat a Loculus authors string into the format expected by ENA.

    Loculus format: `Doe, John A.; Roe, Jane B. C.`
    EMBL expected: `Doe J.A., Roe J.B.C.;`

    EMBL spec: "The names are listed surname first followed by a blank
      followed by initial(s) with stops.
      Occasionally the initials may not be known,
      in which case the surname alone will be listed.
      The author names are separated by commas
      and terminated by a semicolon; they are not split between lines."
    See section "3.4.10.6: The RA Line" here: https://raw.githubusercontent.com/enasequence/read_docs/c4bd306c82710844128cdf43003a0167837dc442/submit/fileprep/flatfile_user_manual.txt

    Args:
        authors: Semicolon-separated authors, each written as
            `Surname, Given Names`. Empty or whitespace-only entries
            (e.g. after a trailing semicolon) are ignored. An entry without
            a comma is treated as a surname whose initials are unknown.

    Returns:
        The authors as `Surname I.N.` entries joined by `, ` and terminated
        by `;`. An input without any author yields `;`.

    Raises:
        ValueError: If an entry has no surname before the comma.
    """
    ena_authors = []
    for entry in authors.split(";"):
        author = entry.strip()
        if not author:
            # Skip empty segments such as the one after a trailing ";".
            continue
        parts = author.split(",")
        last_name = parts[0].strip()
        if not last_name:
            msg = f"Author entry {author!r} has no surname"
            raise ValueError(msg)
        # The spec allows a surname alone when the initials are unknown, so a
        # missing comma must not be an error.
        first_name = parts[1] if len(parts) > 1 else ""
        initials = "".join(f"{name[0]}." for name in first_name.split())
        # Avoid a trailing blank when there are no initials.
        ena_authors.append(f"{last_name} {initials}" if initials else last_name)
    return ", ".join(ena_authors) + ";"


def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationResult:
"""
The project creation request should be equivalent to
Expand Down
7 changes: 7 additions & 0 deletions ena-submission/scripts/test_ena_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
dataclass_to_xml,
get_chromsome_accessions,
get_ena_analysis_process,
reformat_authors_from_loculus_to_embl_style,
)
from ena_types import default_project_type, default_sample_type

Expand Down Expand Up @@ -184,6 +185,12 @@ def setUp(self):
}
self.seq_key = {"accession": "test_accession", "version": "test_version"}

def test_format_authors(self):
    # Covers an entry without a space after the comma, multiple given names,
    # a multi-word surname and a trailing semicolon.
    loculus_authors = "Xi,L.;Smith, Anna Maria; Perez Gonzalez, Anthony J.;"
    expected = "Xi L., Smith A.M., Perez Gonzalez A.J.;"
    self.assertEqual(
        reformat_authors_from_loculus_to_embl_style(loculus_authors), expected
    )

def test_create_chromosome_list_multi_segment(self):
chromosome_list = create_chromosome_list_object(
self.unaligned_sequences_multi, self.seq_key
Expand Down
3 changes: 2 additions & 1 deletion ingest/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ data/
results/
.DS_Store
.ruff_cache
config/config.yaml
config/config.yaml
*.ipynb
5 changes: 5 additions & 0 deletions ingest/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
FROM mambaorg/micromamba:1.5.8

USER root
RUN apt-get update && apt-get install -y curl
RUN mkdir -p /package && chown -R $MAMBA_USER:$MAMBA_USER /package
USER $MAMBA_USER

COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/env.yaml
COPY --chown=$MAMBA_USER:$MAMBA_USER .mambarc /tmp/.mambarc

Expand Down
75 changes: 43 additions & 32 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,12 @@ with open("results/config.yaml", "w") as f:

TAXON_ID = config["taxon_id"]
SEGMENTED = config["segmented"]
COLUMN_MAPPING = config["column_mapping"]
LOG_LEVEL = config.get("log_level", "INFO")
NCBI_API_KEY = os.getenv("NCBI_API_KEY")
FILTER_FASTA_HEADERS = config.get("filter_fasta_headers", None)
APPROVE_TIMEOUT_MIN = config.get("approve_timeout_min") # time in minutes
CHECK_ENA_DEPOSITION = config.get("check_ena_deposition", False)


def rename_columns(input_file, output_file, mapping=COLUMN_MAPPING):
with open(input_file, "r") as f:
header = f.readline().strip().split("\t")
header = [mapping.get(h, h) for h in header]
with open(output_file, "w") as g:
g.write("\t".join(header) + "\n")
for line in f:
g.write(line)
GEOLOC_RULES_URL = config.get("geolocation_rules_url", None)


rule all:
Expand All @@ -55,46 +45,67 @@ rule clean:


rule fetch_ncbi_dataset_package:
# TODO: #1844 Set API key through secret
output:
dataset_package="results/ncbi_dataset.zip",
dataset_package_zip="results/ncbi_dataset.zip",
dataset_package_dir=directory("results/ncbi_dataset"),
params:
taxon_id=TAXON_ID,
api_key=NCBI_API_KEY,
shell:
"""
datasets download virus genome taxon {params.taxon_id} \
--no-progressbar \
--filename {output.dataset_package} \
--api-key {params.api_key} \
--filename {output.dataset_package_zip} \
--api-key {params.api_key}
unzip -o {output.dataset_package_zip} -d results/
"""


rule format_ncbi_dataset_report:
input:
dataset_package="results/ncbi_dataset.zip",
script="scripts/rename_fields.py",
dataset_package=directory("results/ncbi_dataset"),
config="results/config.yaml",
output:
ncbi_dataset_tsv="results/metadata_post_extract.tsv",
ncbi_dataset_tsv="results/metadata_post_rename.tsv",
params:
pre_ncbi_dataset_tsv="results/ncbi_dataset/data/data_report.jsonl",
shell:
"""
dataformat tsv virus-genome \
--package {input.dataset_package} \
> {output.ncbi_dataset_tsv}
python {input.script} \
--config-file {input.config} \
--input {params.pre_ncbi_dataset_tsv} \
--output {output.ncbi_dataset_tsv}
"""


rule rename_columns:
input:
ncbi_dataset_tsv="results/metadata_post_extract.tsv",
rule fetch_general_geolocation_rules:
output:
ncbi_dataset_tsv="results/metadata_post_rename.tsv",
geolocation_rules="config/geolocation-rules.tsv",
params:
mapping=COLUMN_MAPPING,
run:
rename_columns(
input.ncbi_dataset_tsv, output.ncbi_dataset_tsv, mapping=params.mapping
)
geolocation_rules_url=GEOLOC_RULES_URL,
shell:
"""
curl {params.geolocation_rules_url} > {output.geolocation_rules}
"""

# Curate geographic metadata in two passes with `augur curate`:
#   1. `parse-genbank-location` reads the `ncbiGeoLocation` field of the renamed
#      metadata and writes the intermediate `pre_curated_metadata` file.
#   2. `apply-geolocation-rules` applies the rules TSV given as input to that
#      intermediate file (region field `ncbiGeoRegion`) and writes
#      `curated_metadata`.
# Both passes identify records by the `genbankAccession` column.
rule curate_geoloc_metadata:
input:
metadata="results/metadata_post_rename.tsv",
geolocation_rules="config/geolocation-rules.tsv",
output:
pre_curated_metadata="results/pre_curated_metadata.tsv",
curated_metadata="results/curated_metadata.tsv",
shell:
"""
augur curate parse-genbank-location --metadata {input.metadata} \
--location-field=ncbiGeoLocation --id-column "genbankAccession" \
--output-metadata {output.pre_curated_metadata}
augur curate apply-geolocation-rules \
--metadata {output.pre_curated_metadata} \
--output-metadata {output.curated_metadata} \
--geolocation-rules={input.geolocation_rules} \
--region-field='ncbiGeoRegion' --id-column "genbankAccession"
"""

if CHECK_ENA_DEPOSITION:

Expand All @@ -118,7 +129,7 @@ if CHECK_ENA_DEPOSITION:

rule filter_out_loculus_depositions:
input:
ncbi_dataset_tsv="results/metadata_post_rename.tsv",
ncbi_dataset_tsv="results/curated_metadata.tsv",
exclude_biosample_accessions="results/biosample_accessions_to_exclude.tsv",
exclude_insdc_accessions="results/insdc_accessions_to_exclude.tsv",
script="scripts/filter_out_depositions.py",
Expand Down Expand Up @@ -257,7 +268,7 @@ rule prepare_metadata:
metadata=(
"results/filtered_metadata.tsv"
if CHECK_ENA_DEPOSITION
else "results/metadata_post_rename.tsv"
else "results/curated_metadata.tsv"
),
segments="results/nextclade_merged.tsv" if SEGMENTED else "results/config.yaml", # else is just a dummy
sequence_hashes="results/sequence_hashes.ndjson",
Expand Down
87 changes: 41 additions & 46 deletions ingest/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ fasta_id_field: genbankAccession
keep:
- division
- country
- location
- submissionId
- insdcAccessionBase
- insdcVersion
Expand All @@ -24,52 +25,45 @@ keep:
- sequence_md5
- genbankAccession
- jointAccession
column_mapping:
Accession: genbankAccession
BioProjects: bioprojects
BioSample accession: biosampleAccession
Gene count: ncbi_gene_count
Geographic Location: ncbiGeoLocation
Geographic Region: ncbiGeoRegion
Host Common Name: ncbiHostCommonName
Host Infraspecific Names Breed: ncbiHostBreed
Host Infraspecific Names Cultivar: ncbiHostCultivar
Host Infraspecific Names Ecotype: ncbiHostEcotype
Host Infraspecific Names Isolate: ncbiHostIsolate
Host Infraspecific Names Sex: ncbiHostSex
Host Infraspecific Names Strain: ncbiHostStrain
Host Name: ncbiHostName
Host Pangolin Classification: ncbiHostPangolin
Host Taxonomic ID: ncbiHostTaxId
Is Annotated: ncbiIsAnnotated
Is Complete: ncbiIsComplete
Is Lab Host: ncbiIsLabHost
Is Vaccine Strain: ncbiIsVaccineStrain
Isolate Collection date: ncbiCollectionDate
Isolate Lineage: ncbiIsolateName
Isolate Lineage source: ncbiIsolateSource
Lab Host: ncbiLabHost
Mature peptide count: ncbiMaturePeptideCount
Molecule type: ncbiMolType
Protein count: ncbiProteinCount
Purpose of Sampling: ncbiPurposeOfSampling
Release date: ncbiReleaseDate
Source database: ncbiSourceDb
SRA Accessions: ncbiSraAccessions
Submitter Affiliation: ncbiSubmitterAffiliation
Submitter Country: ncbiSubmitterCountry
Submitter Names: ncbiSubmitterNames
Update date: ncbiUpdateDate
Virus Common Name: ncbiVirusCommonName
Virus Infraspecific Names Breed: ncbiVirusBreed
Virus Infraspecific Names Cultivar: ncbiVirusCultivar
Virus Infraspecific Names Ecotype: ncbiVirusEcotype
Virus Infraspecific Names Isolate: ncbiVirusIsolate
Virus Infraspecific Names Sex: ncbi_virus
Virus Infraspecific Names Strain: ncbiVirusStrain
Virus Name: ncbiVirusName
Virus Pangolin Classification: ncbiVirusPangolin
Virus Taxonomic ID: ncbiVirusTaxId
simple_mappings:
"ncbiReleaseDate": "releaseDate"
"ncbiIsAnnotated": "isAnnotated"
"ncbiIsLabHost": "isLabHost"
"ncbiProteinCount": "proteinCount"
"ncbiSourceDb": "sourceDatabase"
"ncbiIsComplete": "completeness"
"ncbiLabHost": "labHost"
"ncbiUpdateDate": "updateDate"
"genbankAccession": "accession"
"biosampleAccession": "biosample"
"ncbi_gene_count": "geneCount"
"bioprojects": "bioprojects"
"ncbiSraAccessions": "sraAccessions"
location_mappings:
"ncbiGeoLocation": "geographicLocation"
"ncbiGeoRegion": "geographicRegion"
submitter_mappings:
"ncbiSubmitterAffiliation": "affiliation"
"ncbiSubmitterNames": "names"
"ncbiSubmitterCountry": "country"
isolate_mappings:
"ncbiIsolateName": "name"
"ncbiIsolateSource": "source"
"ncbiCollectionDate": "collectionDate"
virus_mappings:
"ncbiVirusName": "organismName"
"ncbiVirusTaxId": "taxId"
host_mappings:
"ncbiHostTaxId": "taxId"
"ncbiHostName": "organismName"
parse_list:
- bioprojects
- ncbiSraAccessions
unknown_mappings: # TODO: it is not yet known where these fields come from
- ncbiHostCommonName
- ncbiPurposeOfSampling
- ncbiHostSex
group_name: insdc_ingest_group
group_name: insdc_ingest_group # Used only to set the group name, never read
username: insdc_ingest_user
password: insdc_ingest_user
Expand All @@ -79,3 +73,4 @@ approve_timeout_min: "25" # Cronjobs run every 30min, make approve stop before i
db_username: postgres
db_password: unsecure
db_url: "jdbc:postgresql://127.0.0.1:5432/loculus"
geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/master/source-data/gisaid_geoLocationRules.tsv"
3 changes: 2 additions & 1 deletion ingest/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,15 @@ channels:
- bioconda
dependencies:
# Core Python dependencies
- python =3.12
- python
# Extra dependencies
- biopython
- click
- ijson
- jsonlines
- ncbi-datasets-cli >=16.29.0
- nextclade >=3.7.0
- augur >=26.0.0
- orjsonl
- pandas
- psycopg2
Expand Down
4 changes: 2 additions & 2 deletions ingest/scripts/call_loculus.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def submit_or_revise(
url = f"{organism_url(config)}/{endpoint}"

metadata_lines = len(Path(metadata).read_text(encoding="utf-8").splitlines()) - 1
logger.info(f"{logging_strings["gerund"]} {metadata_lines} sequence(s) to Loculus")
logger.info(f"{logging_strings['gerund']} {metadata_lines} sequence(s) to Loculus")

params = {
"groupId": group_id,
Expand All @@ -213,7 +213,7 @@ def submit_or_revise(
"sequenceFile": sequences_file,
}
response = make_request(HTTPMethod.POST, url, config, params=params, files=files)
logger.debug(f"{logging_strings["noun"]} response: {response.json()}")
logger.debug(f"{logging_strings['noun']} response: {response.json()}")

return response.json()

Expand Down
Loading
Loading