diff --git a/ena-submission/scripts/test_ena_submission.py b/ena-submission/scripts/test_ena_submission.py index 9da08925c..fb08bbd30 100644 --- a/ena-submission/scripts/test_ena_submission.py +++ b/ena-submission/scripts/test_ena_submission.py @@ -23,6 +23,7 @@ dataclass_to_xml, get_chromsome_accessions, get_ena_analysis_process, + reformat_authors_from_loculus_to_embl_style, ) from ena_deposition.ena_types import default_project_type, default_sample_type @@ -188,6 +189,12 @@ def setUp(self): } self.seq_key = {"accession": "test_accession", "version": "test_version"} + def test_format_authors(self): + authors = "Xi,L.;Smith, Anna Maria; Perez Gonzalez, Anthony J.;" + result = reformat_authors_from_loculus_to_embl_style(authors) + desired_result = "Xi L., Smith A.M., Perez Gonzalez A.J.;" + self.assertEqual(result, desired_result) + def test_create_chromosome_list_multi_segment(self): chromosome_list = create_chromosome_list_object( self.unaligned_sequences_multi, self.seq_key diff --git a/ena-submission/src/ena_deposition/config.py b/ena-submission/src/ena_deposition/config.py index 092c59802..0fefc3dca 100644 --- a/ena-submission/src/ena_deposition/config.py +++ b/ena-submission/src/ena_deposition/config.py @@ -50,7 +50,7 @@ def secure_ena_connection(config: Config): config.test = True logging.info("Submitting to ENA dev environment") config.ena_submission_url = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit" - config.github_url = "https://raw.githubusercontent.com/pathoplexus/ena-submission/main/test/approved_ena_submission_list.json" + config.github_url = "https://raw.githubusercontent.com/pathoplexus/ena-submission/loculus_test/test/approved_ena_submission_list.json" config.ena_reports_service_url = "https://wwwdev.ebi.ac.uk/ena/submit/report" if submit_to_ena_prod: diff --git a/ena-submission/src/ena_deposition/create_assembly.py b/ena-submission/src/ena_deposition/create_assembly.py index a6c51d38c..c12a3aa57 100644 --- 
def reformat_authors_from_loculus_to_embl_style(authors: str) -> str:
    """Reformat a Loculus author string into the format ENA/EMBL expects.

    Loculus format: ``Doe, John A.; Roe, Jane B. C.``
    EMBL expected:  ``Doe J.A., Roe J.B.C.;``

    EMBL spec: "The names are listed surname first followed by a blank
    followed by initial(s) with stops. Occasionally the initials may not
    be known, in which case the surname alone will be listed. The author
    names are separated by commas and terminated by a semicolon; they are
    not split between lines."
    See section "3.4.10.6: The RA Line" here:
    https://raw.githubusercontent.com/enasequence/read_docs/c4bd306c82710844128cdf43003a0167837dc442/submit/fileprep/flatfile_user_manual.txt

    Raises:
        ValueError: if an author entry has no surname (e.g. ``", John"``).
            Previously a comma-less entry raised IndexError instead, which
            the caller's ``except ValueError`` never caught.
    """
    ena_authors = []
    for author in authors.split(";"):
        if not author.strip():
            # Skip empties produced by a trailing or doubled semicolon.
            continue
        # Loculus separates surname from given names at the first comma;
        # an entry without a comma is a surname alone (allowed by the spec).
        last_name, _, first_names = author.partition(",")
        last_name = last_name.strip()
        if not last_name:
            msg = f"Author entry without a surname in: {authors!r}"
            raise ValueError(msg)
        # One initial (each followed by a stop) per given name.
        initials = ".".join(name[0] for name in first_names.split(" ") if name)
        # Surname alone when no initials — no trailing blank before the comma.
        ena_authors.append(f"{last_name} {initials}." if initials else last_name)
    return ", ".join(ena_authors) + ";"
ncbi_dataset_tsv="results/metadata_post_rename.tsv", shell: """ - dataformat tsv virus-genome \ - --package {input.dataset_package} \ - > {output.ncbi_dataset_tsv} + unzip -o {input.dataset_package} -d results + python {input.script} \ + --config-file {input.config} \ + --input results/ncbi_dataset/data/data_report.jsonl \ + --output {output.ncbi_dataset_tsv} """ -rule rename_columns: - input: - ncbi_dataset_tsv="results/metadata_post_extract.tsv", - output: - ncbi_dataset_tsv="results/metadata_post_rename.tsv", - params: - mapping=COLUMN_MAPPING, - run: - rename_columns( - input.ncbi_dataset_tsv, output.ncbi_dataset_tsv, mapping=params.mapping - ) - - if CHECK_ENA_DEPOSITION: rule get_loculus_depositions: diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml index 71354e67f..4a4538cc5 100644 --- a/ingest/config/defaults.yaml +++ b/ingest/config/defaults.yaml @@ -6,8 +6,8 @@ log_level: DEBUG compound_country_field: ncbiGeoLocation fasta_id_field: genbankAccession keep: - - division - - country + - geoLocAdmin1 + - geoLocAdmin2 - submissionId - insdcAccessionBase - insdcVersion @@ -24,52 +24,44 @@ keep: - sequence_md5 - genbankAccession - jointAccession -column_mapping: - Accession: genbankAccession - BioProjects: bioprojects - BioSample accession: biosampleAccession - Gene count: ncbi_gene_count - Geographic Location: ncbiGeoLocation - Geographic Region: ncbiGeoRegion - Host Common Name: ncbiHostCommonName - Host Infraspecific Names Breed: ncbiHostBreed - Host Infraspecific Names Cultivar: ncbiHostCultivar - Host Infraspecific Names Ecotype: ncbiHostEcotype - Host Infraspecific Names Isolate: ncbiHostIsolate - Host Infraspecific Names Sex: ncbiHostSex - Host Infraspecific Names Strain: ncbiHostStrain - Host Name: ncbiHostName - Host Pangolin Classification: ncbiHostPangolin - Host Taxonomic ID: ncbiHostTaxId - Is Annotated: ncbiIsAnnotated - Is Complete: ncbiIsComplete - Is Lab Host: ncbiIsLabHost - Is Vaccine Strain: ncbiIsVaccineStrain - Isolate 
Collection date: ncbiCollectionDate - Isolate Lineage: ncbiIsolateName - Isolate Lineage source: ncbiIsolateSource - Lab Host: ncbiLabHost - Mature peptide count: ncbiMaturePeptideCount - Molecule type: ncbiMolType - Protein count: ncbiProteinCount - Purpose of Sampling: ncbiPurposeOfSampling - Release date: ncbiReleaseDate - Source database: ncbiSourceDb - SRA Accessions: ncbiSraAccessions - Submitter Affiliation: ncbiSubmitterAffiliation - Submitter Country: ncbiSubmitterCountry - Submitter Names: ncbiSubmitterNames - Update date: ncbiUpdateDate - Virus Common Name: ncbiVirusCommonName - Virus Infraspecific Names Breed: ncbiVirusBreed - Virus Infraspecific Names Cultivar: ncbiVirusCultivar - Virus Infraspecific Names Ecotype: ncbiVirusEcotype - Virus Infraspecific Names Isolate: ncbiVirusIsolate - Virus Infraspecific Names Sex: ncbi_virus - Virus Infraspecific Names Strain: ncbiVirusStrain - Virus Name: ncbiVirusName - Virus Pangolin Classification: ncbiVirusPangolin - Virus Taxonomic ID: ncbiVirusTaxId +simple_mappings: + "ncbiReleaseDate": "releaseDate" + "ncbiIsAnnotated": "isAnnotated" + "ncbiIsLabHost": "isLabHost" + "ncbiProteinCount": "proteinCount" + "ncbiSourceDb": "sourceDatabase" + "ncbiIsComplete": "completeness" + "ncbiLabHost": "labHost" + "ncbiUpdateDate": "updateDate" + "genbankAccession": "accession" + "biosampleAccession": "biosample" + "ncbi_gene_count": "geneCount" + "bioprojects": "bioprojects" + "ncbiSraAccessions": "sraAccessions" +location_mappings: + "ncbiGeoLocation": "geographicLocation" + "ncbiGeoRegion": "geographicRegion" +submitter_mappings: + "ncbiSubmitterAffiliation": "affiliation" + "ncbiSubmitterNames": "names" + "ncbiSubmitterCountry": "country" +isolate_mappings: + "ncbiIsolateName": "name" + "ncbiIsolateSource": "source" + "ncbiCollectionDate": "collectionDate" +virus_mappings: + "ncbiVirusName": "organismName" + "ncbiVirusTaxId": "taxId" +host_mappings: + "ncbiHostTaxId": "taxId" + "ncbiHostName": "organismName" 
+parse_list: + - bioprojects + - ncbiSraAccessions +unknown_mappings: # I don't know yet where these fields come from + - ncbiHostCommonName + - ncbiPurposeOfSampling + - ncbiHostSex group_name: insdc_ingest_group # Used only to set the group name, never read username: insdc_ingest_user password: insdc_ingest_user @@ -79,3 +71,335 @@ approve_timeout_min: "25" # Cronjobs run every 30min, make approve stop before i db_username: postgres db_password: unsecure db_url: "jdbc:postgresql://127.0.0.1:5432/loculus" +min_score: 89 +country_codes: + Afghanistan: AF + Albania: AL + Algeria: DZ + American Samoa: AS + Andorra: AD + Angola: AO + Anguilla: AI + Antarctica: AQ + Antigua and Barbuda: AG + Arctic Ocean: null + Argentina: AR + Armenia: AM + Aruba: AW + Ashmore and Cartier Islands: null + Atlantic Ocean: null + Australia: AU + Austria: AT + Azerbaijan: AZ + Bahamas: BS + Bahrain: BH + Baltic Sea: null + Baker Island: null + Bangladesh: BD + Barbados: BB + Bassas da India: null + Belarus: BY + Belgium: BE + Belize: BZ + Benin: BJ + Bermuda: BM + Bhutan: BT + Bolivia: BO + Borneo: null + Bosnia and Herzegovina: BA + Botswana: BW + Bouvet Island: BV + Brazil: BR + British Virgin Islands: VG + Brunei: BN + Bulgaria: BG + Burkina Faso: BF + Burundi: BI + Cambodia: KH + Cameroon: CM + Canada: CA + Cape Verde: CV + Cayman Islands: KY + Central African Republic: CF + Chad: TD + Chile: CL + China: CN + Christmas Island: CX + Clipperton Island: null + Cocos Islands: CC + Colombia: CO + Comoros: KM + Cook Islands: CK + Coral Sea Islands: null + Costa Rica: CR + Cote d'Ivoire: CI + Croatia: HR + Cuba: CU + Curacao: CW + Cyprus: CY + Czechia: CZ + Democratic Republic of the Congo: CD + Denmark: DK + Djibouti: DJ + Dominica: DM + Dominican Republic: DO + Ecuador: EC + Egypt: EG + El Salvador: SV + Equatorial Guinea: GQ + Eritrea: ER + Estonia: EE + Eswatini: SZ + Ethiopia: ET + Europa Island: null + Falkland Islands (Islas Malvinas): FK + Faroe Islands: FO + Fiji: FJ + Finland: FI + 
France: FR + French Guiana: GF + French Polynesia: PF + French Southern and Antarctic Lands: TF + Gabon: GA + Gambia: GM + Gaza Strip: null + Georgia: GE + Germany: DE + Ghana: GH + Gibraltar: GI + Glorioso Islands: null + Greece: GR + Greenland: GL + Grenada: GD + Guadeloupe: GP + Guam: GU + Guatemala: GT + Guernsey: GG + Guinea: GN + Guinea-Bissau: GW + Guyana: GY + Haiti: HT + Heard Island and McDonald Islands: HM + Honduras: HN + Hong Kong: HK + Howland Island: null + Hungary: HU + Iceland: IS + India: IN + Indian Ocean: null + Indonesia: ID + Iran: IR + Iraq: IQ + Ireland: IE + Isle of Man: IM + Israel: IL + Italy: IT + Jamaica: JM + Jan Mayen: null + Japan: JP + Jarvis Island: null + Jersey: JE + Johnston Atoll: null + Jordan: JO + Juan de Nova Island: null + Kazakhstan: KZ + Kenya: KE + Kerguelen Archipelago: null + Kingman Reef: null + Kiribati: KI + Kosovo: null # XK is not recognized by the ISO + Kuwait: KW + Kyrgyzstan: KG + Laos: LA + Latvia: LV + Lebanon: LB + Lesotho: LS + Liberia: LR + Libya: LY + Liechtenstein: LI + Line Islands: null + Lithuania: LT + Luxembourg: LU + Macau: MO + Madagascar: MG + Malawi: MW + Malaysia: MY + Maldives: MV + Mali: ML + Malta: MT + Marshall Islands: MH + Martinique: MQ + Mauritania: MR + Mauritius: MU + Mayotte: YT + Mediterranean Sea: null + Mexico: MX + Micronesia, Federated States of: FM + Midway Islands: null + Moldova: MD + Monaco: MC + Mongolia: MN + Montenegro: ME + Montserrat: MS + Morocco: MA + Mozambique: MZ + Myanmar: MM + Namibia: NA + Nauru: NR + Navassa Island: null + Nepal: NP + Netherlands: NL + New Caledonia: NC + New Zealand: NZ + Nicaragua: NI + Niger: NE + Nigeria: NG + Niue: NU + Norfolk Island: NF + North Korea: KP + North Macedonia: MK + North Sea: null + Northern Mariana Islands: MP + Norway: NO + Oman: OM + Pacific Ocean: null + Pakistan: PK + Palau: PW + Palmyra Atoll: null + Panama: PA + Papua New Guinea: PG + Paracel Islands: null + Paraguay: PY + Peru: PE + Philippines: PH + Pitcairn 
Islands: PN + Poland: PL + Portugal: PT + Puerto Rico: PR + Qatar: QA + Republic of the Congo: CG + Reunion: RE + Romania: RO + Ross Sea: null + Russia: RU + Rwanda: RW + Saint Barthelemy: BL + Saint Helena: SH + Saint Kitts and Nevis: KN + Saint Lucia: LC + Saint Martin: MF + Saint Pierre and Miquelon: PM + Saint Vincent and the Grenadines: VC + Samoa: WS + San Marino: SM + Sao Tome and Principe: ST + Saudi Arabia: SA + Senegal: SN + Serbia: RS + Seychelles: SC + Sierra Leone: SL + Singapore: SG + Sint Maarten: SX + Slovakia: SK + Slovenia: SI + Solomon Islands: SB + Somalia: SO + South Africa: ZA + South Georgia and the South Sandwich Islands: GS + South Korea: KR + South Sudan: SS + Southern Ocean: null + Spain: ES + Spratly Islands: null + Sri Lanka: LK + State of Palestine: PS + Sudan: SD + Suriname: SR + Svalbard: SJ + Sweden: SE + Switzerland: CH + Syria: SY + Taiwan: TW + Tajikistan: TJ + Tanzania: TZ + Tasman Sea: null + Thailand: TH + Timor-Leste: TL + Togo: TG + Tokelau: TK + Tonga: TO + Trinidad and Tobago: TT + Tromelin Island: null + Tunisia: TN + Turkey: TR + Turkmenistan: TM + Turks and Caicos Islands: TC + Tuvalu: TV + Uganda: UG + Ukraine: UA + United Arab Emirates: AE + United Kingdom: GB + Uruguay: UY + USA: US + Uzbekistan: UZ + Vanuatu: VU + Venezuela: VE + Viet Nam: VN + Virgin Islands: VI + Wake Island: null + Wallis and Futuna: WF + West Bank: null + Western Sahara: EH + Yemen: YE + Zambia: ZM + Zimbabwe: ZW + Belgian Congo: CGO + British Guiana: BG + Burma: BU + Czechoslovakia: CS + Czech Republic: CZ + East Timor: TP + Korea: null + Macedonia: MK + Micronesia: FM + Netherlands Antilles: AN + Serbia and Montenegro: CS + Siam: null + Swaziland: SZ + The former Yugoslav Republic of Macedonia: MK + USSR: SU + Yugoslavia: YU + Zaire: ZR +administrative_divisions: + - "autonomous region" + - region + - state + - province + - oblast + - krug + - territory + - zone + - subdistrict + - district + - canton + - subprefecture + - prefecture + - 
def convert_to_title_case(name: str) -> str:
    """Title-case *name* while keeping common name particles lowercase."""
    # Particles/prepositions that conventionally stay lowercase in names.
    particles = {"de", "la", "van", "den", "der", "le", "du", "von", "del"}
    titled_words = name.title().split()
    return " ".join(
        word.lower() if word.lower() in particles else word for word in titled_words
    )


def reformat_authors_from_genbank_to_loculus(
    authors_list: list[str], insdc_accession_base: str
) -> str:
    """Reformat GenBank author entries into the Loculus author string.

    GenBank gives one ``"Surname,Initials"`` entry per author, e.g.
    ``['Xi,L.', 'Yu,X.']``; the result is ``"Xi, L.; Yu, X."``.
    Entries with an unexpected number of commas are logged and dropped.
    """
    if not authors_list:
        return ""

    reformatted = []
    for author in authors_list:
        collapsed = re.sub(r"\s\s+", " ", author)  # squeeze runs of whitespace
        parts = [piece for piece in collapsed.split(",") if piece]
        if len(parts) == 2:
            reformatted.append(f"{parts[0].strip()}, {parts[1].strip()}")
        elif len(parts) == 1:
            # Surname only — keep the trailing ", " the Loculus format uses.
            reformatted.append(f"{parts[0].strip()}, ")
        else:
            msg = (
                f"{insdc_accession_base}: Unexpected number of commas in author {author} "
                f"not adding author to authors list"
            )
            logger.error(msg)
            continue

    joined = "; ".join(reformatted)
    # Some journals submit all-caps author lists; normalise those.
    if joined.isupper():
        joined = convert_to_title_case(joined)
    return joined
def sort_authors(authors: str) -> str:
    """Sort authors alphabetically.

    *authors* is a "; "-separated string (Loculus style). Entries are
    trimmed and empty entries — e.g. produced by a trailing semicolon —
    are dropped so they cannot sort to the front and yield a result with
    a leading "; ".
    """
    trimmed = (author.strip() for author in authors.split(";"))
    return "; ".join(sorted(author for author in trimmed if author))
def format_geo_loc_admin2(division: str, matched_geo_loc_admin1: str) -> str:
    """Remove the matched geo_loc_admin1 from the division string and
    return whatever remains, comma-joined and whitespace-normalised."""
    leftover = division.replace(matched_geo_loc_admin1, "").strip().rstrip(",")
    pieces = []
    for chunk in leftover.split(","):
        chunk = chunk.strip()
        if chunk:
            pieces.append(chunk)
    collapsed = ", ".join(pieces)
    # Collapse any whitespace runs left behind by the removal.
    return re.sub(r"\s+", " ", collapsed).strip()
query.lower().replace(admin_region.lower(), "") + break + match, score = process.extractOne(query, geo_loc_admin1_list, scorer=fuzz.partial_ratio) + if score >= config.min_score: + return match + return "" + + +def get_geo_loc_admin1_options(country: str, config: Config) -> tuple[list[str], dict[str, str]]: + country_code = config.country_codes.get(country) + if not country_code: + return [], {} + try: + geolocadmin1_options = [ + unidecode.unidecode(division.name) # pycountry returns non-ASCII characters + for division in pycountry.subdivisions.get(country_code=country_code) + if division.parent_code is None # Only get the top level subdivisions + ] + geolocadmin1_abbreviations = { + division.code: unidecode.unidecode(division.name) + for division in pycountry.subdivisions.get(country_code=country_code) + } + geolocadmin1_abbreviations = { + abbrev.split("-")[1]: name for abbrev, name in geolocadmin1_abbreviations.items() + } + except Exception as e: + try: + # Try to get the historic subdivisions if the current ones don't work + geolocadmin1_options = [ + unidecode.unidecode(division.name) # pycountry returns non-ASCII characters + for division in pycountry.historic_countries.get(country_code=country_code) + if division.parent_code is None # Only get the top level subdivisions + ] + geolocadmin1_abbreviations = { + division.code: unidecode.unidecode(division.name) + for division in pycountry.historic_countries.get(country_code=country_code) + } + geolocadmin1_abbreviations = { + abbrev.split("-")[1]: name for abbrev, name in geolocadmin1_abbreviations.items() + } + except Exception: + logger.error(f"Error getting subdivisions for {country}: {e}") + return [], {} + return geolocadmin1_options, geolocadmin1_abbreviations + + +def get_geoloc(input_string: str, config: Config) -> tuple[str, str, str]: + """ + Takes INSDC geolocation string in format `country: division` + Returns country and attempts to split division into geoLocAdmin1 and geoLocAdmin2. + 1. 
Use pycountry for official list of geoLocAdmin1 options and abbreviations + 2. Attempt exact match of division substring (split by ",") to geoLocAdmin1 + 3. Attempt exact match of division substring (split by "\s" or ",") to geoLocAdmin1 abbr + 4. Attempt fuzzy match of division substring (after removing common administrative_divisions + substrings) to geoLocAdmin1 + 5. If no match, return division as geoLocAdmin2 + """ + country = input_string.split(":", 1)[0].strip() + division = input_string.split(":", 1)[1].strip() if len(input_string.split(":", 1)) == 2 else "" + + geolocadmin1_options, geolocadmin1_abbreviations = get_geo_loc_admin1_options(country, config) + if not geolocadmin1_options: + return country, division, "" + + # Try to find an exact substring match for subdivision + for option in geolocadmin1_options: + division_words = [word.strip() for word in division.lower().split(",")] + if option.lower() in division_words: + return country, option, format_geo_loc_admin2(division, option) + + # Try to find an exact substring match subdivision abbreviation + for option, name in geolocadmin1_abbreviations.items(): + division_words = re.split(r"[,\s]+", division) + if option in division_words: + return country, name, format_geo_loc_admin2(division, option) + + # Try to find a fuzzy match for subdivision + division_words = [name.strip() for name in division.split(",") if name] + for division_word in division_words: + fuzzy_match = fuzzy_match_geo_loc_admin1(division_word, geolocadmin1_options, config) + if fuzzy_match: + logger.info(f"Fuzzy matched {division_word} to {fuzzy_match}") + return country, fuzzy_match, format_geo_loc_admin2(division, division_word) + return country, "", division @click.command() @@ -98,15 +188,12 @@ def main( for record in metadata: # Transform the metadata - try: - record["division"] = record[config.compound_country_field].split(":", 1)[1].strip() - except IndexError: - record["division"] = "" - record["country"] = 
record[config.compound_country_field].split(":", 1)[0].strip() + record["country"], record["geoLocAdmin1"], record["geoLocAdmin2"] = get_geoloc( + record[config.compound_country_field], config + ) record["submissionId"] = record[config.fasta_id_field] record["insdcAccessionBase"] = record[config.fasta_id_field].split(".", 1)[0] record["insdcVersion"] = record[config.fasta_id_field].split(".", 1)[1] - record["ncbiSubmitterNames"] = split_authors(record["ncbiSubmitterNames"]) if config.segmented: record["segment"] = segments_dict.get(record[config.fasta_id_field], "") diff --git a/ingest/tests/config_cchf/config.yaml b/ingest/tests/config_cchf/config.yaml index 2ae38b880..65995b7b5 100644 --- a/ingest/tests/config_cchf/config.yaml +++ b/ingest/tests/config_cchf/config.yaml @@ -16,7 +16,6 @@ organism: cchf rename: bioprojects: bioprojectAccession country: geoLocCountry - division: geoLocAdmin1 genbankAccession: insdcAccessionFull ncbiCollectionDate: sampleCollectionDate ncbiHostName: hostNameScientific diff --git a/ingest/tests/expected_output_cchf/metadata_post_prepare.json b/ingest/tests/expected_output_cchf/metadata_post_prepare.json new file mode 100644 index 000000000..f20987c9a --- /dev/null +++ b/ingest/tests/expected_output_cchf/metadata_post_prepare.json @@ -0,0 +1,158 @@ +{ + "KX013462.1": { + "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", + "authors": "Lukashev, A.N.; Klimentov, A.S.; Smirnova, S.E.; Dzagurova, T.K.; Drexler, J.F.; Gmyl, A.P.", + "bioprojectAccession": "", + "biosampleAccession": "", + "geoLocAdmin1": "Astrahanskaja oblast'", + "geoLocAdmin2": "", + "geoLocCountry": "Russia", + "hash": "7d43b0538a13b718babb885c5a985fd8", + "hostNameScientific": "Ixodoidea", + "hostTaxonId": "297308", + "insdcAccessionBase": "KX013462", + "insdcAccessionFull": "KX013462.1", + "insdcVersion": "1", + "isLabHost": "", + "ncbiReleaseDate": "2016-12-07T00:00:00Z", + "ncbiSourceDb": "GenBank", + "ncbiUpdateDate": 
"2016-12-07T00:00:00Z", + "ncbiVirusName": "Orthonairovirus haemorrhagiae", + "ncbiVirusTaxId": "3052518", + "sampleCollectionDate": "1989", + "segment": "L", + "specimenCollectorSampleId": "K229_194", + "sraRunAccession": "", + "submissionId": "KX013462.1" + }, + "KX013463.1": { + "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", + "authors": "Lukashev, A.N.; Klimentov, A.S.; Smirnova, S.E.; Dzagurova, T.K.; Drexler, J.F.; Gmyl, A.P.", + "bioprojectAccession": "", + "biosampleAccession": "", + "geoLocAdmin1": "Astrahanskaja oblast'", + "geoLocAdmin2": "", + "geoLocCountry": "Russia", + "hash": "d8ee414037323ed4c3a07ca5d3e6c71e", + "hostNameScientific": "Ixodoidea", + "hostTaxonId": "297308", + "insdcAccessionBase": "KX013463", + "insdcAccessionFull": "KX013463.1", + "insdcVersion": "1", + "isLabHost": "", + "ncbiReleaseDate": "2016-12-07T00:00:00Z", + "ncbiSourceDb": "GenBank", + "ncbiUpdateDate": "2016-12-07T00:00:00Z", + "ncbiVirusName": "Orthonairovirus haemorrhagiae", + "ncbiVirusTaxId": "3052518", + "sampleCollectionDate": "1989", + "segment": "M", + "specimenCollectorSampleId": "K229_194", + "sraRunAccession": "", + "submissionId": "KX013463.1" + }, + "KX013464.1": { + "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", + "authors": "Lukashev, A.N.; Klimentov, A.S.; Smirnova, S.E.; Dzagurova, T.K.; Drexler, J.F.; Gmyl, A.P.", + "bioprojectAccession": "", + "biosampleAccession": "", + "geoLocAdmin1": "Astrahanskaja oblast'", + "geoLocAdmin2": "", + "geoLocCountry": "Russia", + "hash": "0f73aa76d08ffbe5a4dd96eb8b8de95b", + "hostNameScientific": "Ixodoidea", + "hostTaxonId": "297308", + "insdcAccessionBase": "KX013464", + "insdcAccessionFull": "KX013464.1", + "insdcVersion": "1", + "isLabHost": "", + "ncbiReleaseDate": "2016-12-07T00:00:00Z", + "ncbiSourceDb": "GenBank", + "ncbiUpdateDate": "2016-12-07T00:00:00Z", + "ncbiVirusName": "Orthonairovirus haemorrhagiae", + "ncbiVirusTaxId": 
"3052518", + "sampleCollectionDate": "1989", + "segment": "S", + "specimenCollectorSampleId": "K229_194", + "sraRunAccession": "", + "submissionId": "KX013464.1" + }, + "KX013483.1": { + "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", + "authors": "Lukashev, A.N.; Klimentov, A.S.; Smirnova, S.E.; Dzagurova, T.K.; Drexler, J.F.; Gmyl, A.P.", + "bioprojectAccession": "", + "biosampleAccession": "", + "geoLocAdmin1": "", + "geoLocAdmin2": "", + "geoLocCountry": "Uganda", + "hash": "f1875fdd32cb34bf7792f4175604bc3d", + "hostNameScientific": "Homo sapiens", + "hostTaxonId": "9606", + "insdcAccessionBase": "KX013483", + "insdcAccessionFull": "KX013483.1", + "insdcVersion": "1", + "isLabHost": "", + "ncbiReleaseDate": "2016-12-07T00:00:00Z", + "ncbiSourceDb": "GenBank", + "ncbiUpdateDate": "2016-12-07T00:00:00Z", + "ncbiVirusName": "Orthonairovirus haemorrhagiae", + "ncbiVirusTaxId": "3052518", + "sampleCollectionDate": "1958", + "segment": "L", + "specimenCollectorSampleId": "Nakiwogo", + "sraRunAccession": "", + "submissionId": "KX013483.1" + }, + "KX013485.1": { + "authorAffiliations": "Chumakov Institute of Poliomyelitis and Viral Encephalitides", + "authors": "Lukashev, A.N.; Klimentov, A.S.; Smirnova, S.E.; Dzagurova, T.K.; Drexler, J.F.; Gmyl, A.P.", + "bioprojectAccession": "", + "biosampleAccession": "", + "geoLocAdmin1": "", + "geoLocAdmin2": "", + "geoLocCountry": "Uganda", + "hash": "831e58883a345a466977c13af3492ee9", + "hostNameScientific": "Homo sapiens", + "hostTaxonId": "9606", + "insdcAccessionBase": "KX013485", + "insdcAccessionFull": "KX013485.1", + "insdcVersion": "1", + "isLabHost": "", + "ncbiReleaseDate": "2016-12-07T00:00:00Z", + "ncbiSourceDb": "GenBank", + "ncbiUpdateDate": "2016-12-07T00:00:00Z", + "ncbiVirusName": "Orthonairovirus haemorrhagiae", + "ncbiVirusTaxId": "3052518", + "sampleCollectionDate": "1958", + "segment": "S", + "specimenCollectorSampleId": "Nakiwogo", + "sraRunAccession": "", + 
"submissionId": "KX013485.1" + }, + "KX096703.1": { + "authorAffiliations": "Public Health England, Research", + "authors": "Deryabin, ; Atshabar, B.; Sansyzbaev, Y.; Berezin, V.; Nurmakhanov, T.; Yeskhojayev, O.; Vilkova, A.; Shevtsov, A.; Hewson, R.; Atkinson, B.", + "bioprojectAccession": "", + "biosampleAccession": "", + "geoLocAdmin1": "", + "geoLocAdmin2": "Sairam district", + "geoLocCountry": "Kazakhstan", + "hash": "77de1141bacb559ea14c89c351df8372", + "hostNameScientific": "Hyalomma anatolicum", + "hostTaxonId": "176092", + "insdcAccessionBase": "KX096703", + "insdcAccessionFull": "KX096703.1", + "insdcVersion": "1", + "isLabHost": "", + "ncbiReleaseDate": "2016-04-30T00:00:00Z", + "ncbiSourceDb": "GenBank", + "ncbiUpdateDate": "2016-04-30T00:00:00Z", + "ncbiVirusName": "Orthonairovirus haemorrhagiae", + "ncbiVirusTaxId": "3052518", + "sampleCollectionDate": "2015", + "segment": "S", + "specimenCollectorSampleId": "tick pool #134", + "sraRunAccession": "", + "submissionId": "KX096703.1" + } +} \ No newline at end of file diff --git a/ingest/tests/test_data_cchf/metadata_post_extract.tsv b/ingest/tests/test_data_cchf/metadata_post_extract.tsv deleted file mode 100644 index 5681090d4..000000000 --- a/ingest/tests/test_data_cchf/metadata_post_extract.tsv +++ /dev/null @@ -1,8 +0,0 @@ -Accession BioProjects BioSample accession Completeness Gene count Geographic Location Geographic Region Host Common Name Host Infraspecific Names Breed Host Infraspecific Names Cultivar Host Infraspecific Names Ecotype Host Infraspecific Names Isolate Host Infraspecific Names Sex Host Infraspecific Names Strain Host Name Host Pangolin Classification Host Taxonomic ID Is Annotated Is Complete Is Lab Host Is Vaccine Strain Isolate Collection date Isolate Lineage Isolate Lineage source Lab Host Length Mature peptide count Molecule type Nucleotide completeness Protein count Purpose of Sampling Release date Source database SRA Accessions Submitter Affiliation Submitter Country 
Submitter Names Update date Virus Common Name Virus Infraspecific Names Breed Virus Infraspecific Names Cultivar Virus Infraspecific Names Ecotype Virus Infraspecific Names Isolate Virus Infraspecific Names Sex Virus Infraspecific Names Strain Virus Name Virus Pangolin Classification Virus Taxonomic ID -KX013462.1 PARTIAL 1 Russia: Astrakhan Europe Ixodoidea 297308 true 1989 K229_194 12109 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 -KX013463.1 PARTIAL 1 Russia: Astrakhan Europe Ixodoidea 297308 true 1989 K229_194 5343 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 -KX013464.1 PARTIAL 1 Russia: Astrakhan Europe Ixodoidea 297308 true 1989 K229_194 1641 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 -KX096703.1 PARTIAL 1 Kazakhstan: Sairam district Asia Hyalomma anatolicum 176092 true 2015 tick pool #134 513 1 2016-04-30T00:00:00Z GenBank Public Health England, Research Deryabin,P.,Atshabar,B.,Sansyzbaev,Y.,Berezin,V.,Nurmakhanov,T.,Yeskhojayev,O.,Vilkova,A.,Shevtsov,A.,Hewson,R.,Atkinson,B. 2016-04-30T00:00:00Z Orthonairovirus haemorrhagiae 3052518 -KX013483.1 PARTIAL 1 Uganda Africa Homo sapiens 9606 true 1958 Nakiwogo blood 12098 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 
2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 -KX013485.1 PARTIAL 1 Uganda Africa Homo sapiens 9606 true 1958 Nakiwogo blood 1571 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 -INSDC001.1 PARTIAL 1 Uganda Africa Homo sapiens 9606 true 1958 Nakiwogo blood 1571 1 2016-12-07T00:00:00Z GenBank Chumakov Institute of Poliomyelitis and Viral Encephalitides Russia Lukashev,A.N.,Klimentov,A.S.,Smirnova,S.E.,Dzagurova,T.K.,Drexler,J.F.,Gmyl,A.P. 2016-12-07T00:00:00Z Orthonairovirus haemorrhagiae 3052518 diff --git a/ingest/tests/test_data_cchf/ncbi_dataset.zip b/ingest/tests/test_data_cchf/ncbi_dataset.zip new file mode 100644 index 000000000..a94f72b44 Binary files /dev/null and b/ingest/tests/test_data_cchf/ncbi_dataset.zip differ diff --git a/ingest/tests/test_ingest.py b/ingest/tests/test_ingest.py index d469efb79..543f66d4a 100644 --- a/ingest/tests/test_ingest.py +++ b/ingest/tests/test_ingest.py @@ -62,6 +62,7 @@ def test_snakemake(): destination_directory = CONFIG_DIR source_directory = TEST_DATA_DIR / "config_cchf" copy_files(source_directory, destination_directory) + run_snakemake("extract_ncbi_dataset_sequences", touch=True) # Ignore sequences for now run_snakemake("get_loculus_depositions", touch=True) # Do not call_loculus run_snakemake("group_segments") run_snakemake("get_previous_submissions", touch=True) # Do not call_loculus diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml index 9829b3355..f366acf56 100644 --- a/kubernetes/loculus/values.yaml +++ b/kubernetes/loculus/values.yaml @@ -424,7 +424,6 @@ defaultOrganismConfig: &defaultOrganismConfig autocomplete: true initiallyVisible: true header: Sample details - ingest: division - name: geoLocAdmin2 displayName: Collection subdivision level 2 generateIndex: true @@ -452,6 +451,10 
def valid_authors(authors: str) -> bool:
    """Return True if *authors* matches the Loculus author-list format.

    Expected shape: one or more `last name, first name(s);` entries, e.g.
    `Smith, Anna; Perez, Tom J.;`. The comma is mandatory per author; the
    first-name part may be empty (`Xu,;`). Only ASCII letters, whitespace,
    dots, hyphens and apostrophes are allowed; the final semicolon is optional.
    """
    pattern = r"^([a-zA-Z\s\.\-\']+,[a-zA-Z\s\.\-\']*;)*([a-zA-Z\s\.\-\']+,[a-zA-Z\s\.\-\']*;*)$"
    return re.match(pattern, authors) is not None


def warn_potentially_invalid_authors(authors: str) -> bool:
    """Heuristic: flag strings that parse as a single author but look like several.

    A semicolon-free string with more than three comma/whitespace-separated
    tokens (e.g. `Anna Smith, Cameron Tucker`) is probably a mis-delimited
    list rather than one author with many names.
    """
    tokens = re.split(r"[,\s]+", authors)
    return bool(";" not in authors and len(tokens) > 3)


def format_authors(authors: str) -> str:
    """Normalize a valid authors string to canonical Loculus style.

    Input entries are `last,first` separated by `;`; output is
    `last, first; last, first` with single spacing and dotted, upper-cased
    single-letter initials (`Xi,L;Yu,x.` -> `Xi, L.; Yu, X.`).

    Call only after `valid_authors` — an entry without a comma raises
    IndexError.
    """
    formatted_entries = []
    for entry in (a for a in authors.split(";") if a):
        # Collapse runs of whitespace so names keep exactly one space.
        normalized = re.sub(r"\s\s+", " ", entry)
        parts = normalized.split(",")
        last_name = parts[0].strip()
        first_name = parts[1].strip()
        # Upper-case and dot single-letter initials; leave full names alone.
        initials = []
        for name in first_name.split():
            if len(name) == 1:
                initials.append(f"{name.upper()}.")
            elif len(name) == 2 and name.endswith("."):
                initials.append(name.upper())
            else:
                initials.append(name)
        formatted_entries.append(f"{last_name}, {' '.join(initials)}")
    return "; ".join(formatted_entries).strip()
@classmethod def call_function( @@ -398,6 +432,91 @@ def concatenate( warnings=warnings, ) + @staticmethod + def check_authors( + input_data: InputMetadata, output_field: str, args: FunctionArgs = None + ) -> ProcessingResult: + authors = input_data["authors"] + + author_format_description = ( + "Please ensure that " + "authors are separated by semi-colons. Each author's name should be in the format " + "'last name, first name;'. Last name(s) is mandatory, a comma is mandatory to " + "separate first names/initials from last name. Only ASCII alphabetical characters A-Z " + "are allowed. For example: 'Smith, Anna; Perez, Tom J.; Xu, X.L.;' " + "or 'Xu,;' if the first name is unknown." + ) + warnings: list[ProcessingAnnotation] = [] + errors: list[ProcessingAnnotation] = [] + + if not authors: + return ProcessingResult( + datum=None, + warnings=warnings, + errors=errors, + ) + try: + authors.encode("ascii") + except UnicodeEncodeError: + error_message = ( + f"The authors list '{authors}' contains non-ASCII characters. " + + author_format_description + ) + return ProcessingResult( + datum=None, + errors=[ + ProcessingAnnotation( + source=[ + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) + ], + message=error_message, + ) + ], + warnings=warnings, + ) + if valid_authors(authors): + formatted_authors = format_authors(authors) + if warn_potentially_invalid_authors(authors): + warning_message = ( + f"The authors list '{authors}' might not be using the Loculus format. " + + author_format_description + ) + warnings = [ + ProcessingAnnotation( + source=[ + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) + ], + message=warning_message, + ) + ] + return ProcessingResult( + datum=formatted_authors, + warnings=warnings, + errors=errors, + ) + return ProcessingResult( + datum=formatted_authors, + warnings=warnings, + errors=errors, + ) + + error_message = ( + f"The authors list '{authors}' is not in a recognized format. 
" + + author_format_description + ) + return ProcessingResult( + datum=None, + errors=[ + ProcessingAnnotation( + source=[ + AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA) + ], + message=error_message, + ) + ], + warnings=warnings, + ) + @staticmethod def identity( input_data: InputMetadata, output_field: str, args: FunctionArgs = None diff --git a/preprocessing/nextclade/tests/test.py b/preprocessing/nextclade/tests/test.py index 1b77028aa..a2f9e6da4 100644 --- a/preprocessing/nextclade/tests/test.py +++ b/preprocessing/nextclade/tests/test.py @@ -9,6 +9,7 @@ ) from loculus_preprocessing.prepro import process_all from loculus_preprocessing.processing_functions import format_frameshift, format_stop_codon +from loculus_preprocessing.processing_functions import valid_authors, format_authors test_config_file = "tests/test_config.yaml" @@ -296,6 +297,55 @@ def get_test_cases(config: Config) -> list[TestCase]: ], ), ), + TestCase( + name="warn_potential_author_error", + input=UnprocessedEntryFactory.create_unprocessed_entry( + metadata_dict={ + "submissionId": "warn_potential_author_error", + "name_required": "name", + "required_collection_date": "2022-11-01", + "authors": "Anna Smith, Cameron Tucker", + } + ), + expected_output=factory_custom.create_processed_entry( + metadata_dict={ + "name_required": "name", + "required_collection_date": "2022-11-01", + "concatenated_string": "LOC_12.1/2022-11-01", + "authors": "Anna Smith, Cameron Tucker", + }, + metadata_warnings=[ + ( + "authors", + "The authors list 'Anna Smith, Cameron Tucker' might not be using the Loculus format. Please ensure that authors are separated by semi-colons. Each author's name should be in the format 'last name, first name;'. Last name(s) is mandatory, a comma is mandatory to separate first names/initials from last name. Only ASCII alphabetical characters A-Z are allowed. For example: 'Smith, Anna; Perez, Tom J.; Xu, X.L.;' or 'Xu,;' if the first name is unknown." 
+ ), + ], + ), + ), + TestCase( + name="non_ascii_authors", + input=UnprocessedEntryFactory.create_unprocessed_entry( + metadata_dict={ + "submissionId": "non_ascii_authors", + "name_required": "name", + "required_collection_date": "2022-11-01", + "authors": "Møller, Anäis; Pérez, José", + } + ), + expected_output=factory_custom.create_processed_entry( + metadata_dict={ + "name_required": "name", + "required_collection_date": "2022-11-01", + "concatenated_string": "LOC_13.1/2022-11-01", + }, + metadata_errors=[ + ( + "authors", + "The authors list 'Møller, Anäis; Pérez, José' contains non-ASCII characters. Please ensure that authors are separated by semi-colons. Each author's name should be in the format 'last name, first name;'. Last name(s) is mandatory, a comma is mandatory to separate first names/initials from last name. Only ASCII alphabetical characters A-Z are allowed. For example: 'Smith, Anna; Perez, Tom J.; Xu, X.L.;' or 'Xu,;' if the first name is unknown." + ), + ], + ), + ), ] @@ -303,7 +353,53 @@ def sort_annotations(annotations: list[ProcessingAnnotation]): return sorted(annotations, key=lambda x: (x.source[0].name, x.message)) +accepted_authors = { + "Xi, L.; Yu, X.;": "Xi, L.; Yu, X.", + "Xi,L;Yu,X.;": "Xi, L.; Yu, X.", + "Xi,;Yu,X.;": "Xi, ; Yu, X.", + "Xi, ;Yu,X.;": "Xi, ; Yu, X.", + "Xi, ;Yu,X.": "Xi, ; Yu, X.", + "Xi,;": "Xi,", + "Xi,": "Xi,", + "Smith, Anna Maria; Perez, Jose X.;": "Smith, Anna Maria; Perez, Jose X.", + "Smith,Anna Maria;Perez,Jose X;": "Smith, Anna Maria; Perez, Jose X.", + "de souza, a.": "de souza, A.", + "McGregor, Ewan": "McGregor, Ewan", +} +not_accepted_authors = [ + ";", + ",;", + ",X.;Yu,X.", + ",;Yu,X.", + "Anna Maria Smith; Jose X. 
Perez", + "Anna Maria Smith;", + "Anna Maria Smith", + "Smith9, Anna;", + "Anna Smith, Cameron Tucker, and Jose Perez", +] + + class PreprocessingTests(unittest.TestCase): + def test_valid_authors(self) -> None: + for author in accepted_authors: + if valid_authors(author) is not True: + msg = f"{author} should be accepted but is not." + raise AssertionError(msg) + for author in not_accepted_authors: + if valid_authors(author) is not False: + msg = f"{author} should not be accepted but is." + raise AssertionError(msg) + + def test_format_authors(self) -> None: + for author, formatted_author in accepted_authors.items(): + if format_authors(author) != formatted_author: + print(format_authors(author)) + msg = ( + f"{author} is not formatted: '{format_authors(author)}' " + f"as expected: '{formatted_author}'" + ) + raise AssertionError(msg) + def test_process_all(self) -> None: config: Config = get_config(test_config_file) test_cases = get_test_cases(config=config) diff --git a/preprocessing/nextclade/tests/test_config.yaml b/preprocessing/nextclade/tests/test_config.yaml index ba4327c08..526d2337d 100644 --- a/preprocessing/nextclade/tests/test_config.yaml +++ b/preprocessing/nextclade/tests/test_config.yaml @@ -73,4 +73,8 @@ processing_spec: type: [string, string, date] inputs: continent: continent - required_collection_date: required_collection_date \ No newline at end of file + required_collection_date: required_collection_date + authors: + function: check_authors + inputs: + authors: authors \ No newline at end of file