Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ingest): Use pycountry and fuzzywuzzy to format countries #3026

Draft
wants to merge 24 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions ena-submission/scripts/test_ena_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
dataclass_to_xml,
get_chromsome_accessions,
get_ena_analysis_process,
reformat_authors_from_loculus_to_embl_style,
)
from ena_deposition.ena_types import default_project_type, default_sample_type

Expand Down Expand Up @@ -188,6 +189,12 @@ def setUp(self):
}
self.seq_key = {"accession": "test_accession", "version": "test_version"}

def test_format_authors(self):
    """Check that a Loculus authors string is converted to the EMBL author style.

    The input mixes entries with and without a space after the comma,
    multiple given names, a two-word surname and a trailing semicolon.
    """
    loculus_authors = "Xi,L.;Smith, Anna Maria; Perez Gonzalez, Anthony J.;"
    expected = "Xi L., Smith A.M., Perez Gonzalez A.J.;"
    self.assertEqual(
        reformat_authors_from_loculus_to_embl_style(loculus_authors), expected
    )

def test_create_chromosome_list_multi_segment(self):
chromosome_list = create_chromosome_list_object(
self.unaligned_sequences_multi, self.seq_key
Expand Down
2 changes: 1 addition & 1 deletion ena-submission/src/ena_deposition/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def secure_ena_connection(config: Config):
config.test = True
logging.info("Submitting to ENA dev environment")
config.ena_submission_url = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit"
config.github_url = "https://raw.githubusercontent.com/pathoplexus/ena-submission/main/test/approved_ena_submission_list.json"
config.github_url = "https://raw.githubusercontent.com/pathoplexus/ena-submission/loculus_test/test/approved_ena_submission_list.json"
config.ena_reports_service_url = "https://wwwdev.ebi.ac.uk/ena/submit/report"

if submit_to_ena_prod:
Expand Down
7 changes: 7 additions & 0 deletions ena-submission/src/ena_deposition/create_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
create_manifest,
get_ena_analysis_process,
get_ena_config,
reformat_authors_from_loculus_to_embl_style,
)
from .ena_types import (
AssemblyChromosomeListFile,
Expand Down Expand Up @@ -129,6 +130,12 @@ def create_manifest_object(
authors = (
metadata["authors"] if metadata.get("authors") else metadata.get("submitter", "Unknown")
)
try:
authors = reformat_authors_from_loculus_to_embl_style(authors)
except ValueError as err:
msg = f"Was unable to format authors: {authors} as ENA expects"
logger.error(msg)
raise ValueError(msg) from err
collection_date = metadata.get("sampleCollectionDate", "Unknown")
country = metadata.get("geoLocCountry", "Unknown")
admin1 = metadata.get("geoLocAdmin1", "")
Expand Down
22 changes: 22 additions & 0 deletions ena-submission/src/ena_deposition/ena_submission_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,28 @@ def get_project_xml(project_set):
}


def reformat_authors_from_loculus_to_embl_style(authors: str) -> str:
    """Reformat a Loculus authors string into the author format expected by ENA.

    Loculus format: `Doe, John A.; Roe, Jane B. C.`
    EMBL expected: `Doe J.A., Roe J.B.C.;`

    EMBL spec: "The names are listed surname first followed by a blank
      followed by initial(s) with stops.
      Occasionally the initials may not be known,
      in which case the surname alone will be listed.
      The author names are separated by commas
      and terminated by a semicolon; they are not split between lines."
    See section "3.4.10.6: The RA Line" here: https://raw.githubusercontent.com/enasequence/read_docs/c4bd306c82710844128cdf43003a0167837dc442/submit/fileprep/flatfile_user_manual.txt

    Args:
        authors: Authors separated by ";", each written as
            "<surname>, <given names>". An entry without a comma is treated
            as a surname without initials. Empty or whitespace-only entries
            (e.g. after a trailing ";") are ignored.

    Returns:
        The authors as "<surname> <initials>" joined by ", " and terminated
        by ";". An input without any author yields ";".

    Raises:
        ValueError: If an entry has no surname before its first comma.
    """
    ena_authors = []
    for raw_author in authors.split(";"):
        author = raw_author.strip()
        if not author:
            # Skip empty entries, e.g. produced by a trailing ";" or "; ".
            continue
        name_parts = author.split(",")
        last_name = name_parts[0].strip()
        if not last_name:
            msg = f"Author entry {author!r} has no surname"
            raise ValueError(msg)
        # Only the text between the first and second comma is used as given
        # names; a missing comma means the initials are unknown.
        first_names = name_parts[1] if len(name_parts) > 1 else ""
        # Treat "." like a separator so that "J.A." yields "J.A." and not "J.".
        initials = "".join(
            f"{name[0]}." for name in first_names.replace(".", " ").split()
        )
        # Surname alone when no initials are known (no trailing blank).
        ena_authors.append(f"{last_name} {initials}" if initials else last_name)
    return ", ".join(ena_authors) + ";"


def create_ena_project(config: ENAConfig, project_set: ProjectSet) -> CreationResult:
"""
The project creation request should be equivalent to
Expand Down
36 changes: 8 additions & 28 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -21,24 +21,13 @@ with open("results/config.yaml", "w") as f:

TAXON_ID = config["taxon_id"]
SEGMENTED = config["segmented"]
COLUMN_MAPPING = config["column_mapping"]
LOG_LEVEL = config.get("log_level", "INFO")
NCBI_API_KEY = os.getenv("NCBI_API_KEY")
FILTER_FASTA_HEADERS = config.get("filter_fasta_headers", None)
APPROVE_TIMEOUT_MIN = config.get("approve_timeout_min") # time in minutes
CHECK_ENA_DEPOSITION = config.get("check_ena_deposition", False)


def rename_columns(input_file, output_file, mapping=COLUMN_MAPPING):
    """Copy a TSV file, renaming the columns in its header line.

    The first line of `input_file` is stripped and split on tabs; every
    column name found in `mapping` is replaced by its mapped value, other
    names are kept. The renamed header is written to `output_file`,
    followed by all remaining lines of the input unchanged.
    """
    with open(input_file, "r") as source:
        original_names = source.readline().strip().split("\t")
        renamed_header = "\t".join(mapping.get(name, name) for name in original_names)
        with open(output_file, "w") as sink:
            sink.write(renamed_header + "\n")
            # Data rows are passed through untouched.
            sink.writelines(source)


rule all:
params:
config=lambda wildcards: str(config),
Expand Down Expand Up @@ -72,30 +61,21 @@ rule fetch_ncbi_dataset_package:

rule format_ncbi_dataset_report:
input:
script="scripts/format_ncbi_metadata.py",
dataset_package="results/ncbi_dataset.zip",
config="results/config.yaml",
output:
ncbi_dataset_tsv="results/metadata_post_extract.tsv",
ncbi_dataset_tsv="results/metadata_post_rename.tsv",
shell:
"""
dataformat tsv virus-genome \
--package {input.dataset_package} \
> {output.ncbi_dataset_tsv}
unzip -o {input.dataset_package} -d results
python {input.script} \
--config-file {input.config} \
--input results/ncbi_dataset/data/data_report.jsonl \
--output {output.ncbi_dataset_tsv}
"""


rule rename_columns:
input:
ncbi_dataset_tsv="results/metadata_post_extract.tsv",
output:
ncbi_dataset_tsv="results/metadata_post_rename.tsv",
params:
mapping=COLUMN_MAPPING,
run:
rename_columns(
input.ncbi_dataset_tsv, output.ncbi_dataset_tsv, mapping=params.mapping
)


if CHECK_ENA_DEPOSITION:

rule get_loculus_depositions:
Expand Down
Loading
Loading