Format authors cleanup (#3068)
* Clean up defaults for author-formatting

* Make dictionary, not list of dictionaries

* Fix tests
anna-parker authored Oct 25, 2024
1 parent 9385fbd commit 8fc0a47
Showing 3 changed files with 61 additions and 80 deletions.
76 changes: 38 additions & 38 deletions ingest/config/defaults.yaml
@@ -24,44 +24,44 @@ keep:
- sequence_md5
- genbankAccession
- jointAccession
simple_mappings:
"ncbiReleaseDate": "releaseDate"
"ncbiIsAnnotated": "isAnnotated"
"ncbiIsLabHost": "isLabHost"
"ncbiProteinCount": "proteinCount"
"ncbiSourceDb": "sourceDatabase"
"ncbiIsComplete": "completeness"
"ncbiLabHost": "labHost"
"ncbiUpdateDate": "updateDate"
"genbankAccession": "accession"
"biosampleAccession": "biosample"
"ncbi_gene_count": "geneCount"
"bioprojects": "bioprojects"
"ncbiSraAccessions": "sraAccessions"
location_mappings:
"ncbiGeoLocation": "geographicLocation"
"ncbiGeoRegion": "geographicRegion"
submitter_mappings:
"ncbiSubmitterAffiliation": "affiliation"
"ncbiSubmitterNames": "names"
"ncbiSubmitterCountry": "country"
isolate_mappings:
"ncbiIsolateName": "name"
"ncbiIsolateSource": "source"
"ncbiCollectionDate": "collectionDate"
virus_mappings:
"ncbiVirusName": "organismName"
"ncbiVirusTaxId": "taxId"
host_mappings:
"ncbiHostTaxId": "taxId"
"ncbiHostName": "organismName"
parse_list:
- bioprojects
- ncbiSraAccessions
unknown_mappings: # I don't know yet where these fields come from
- ncbiHostCommonName
- ncbiPurposeOfSampling
- ncbiHostSex
ncbi_mappings:
string_to_string_mappings:
"ncbiReleaseDate": "releaseDate"
"ncbiIsAnnotated": "isAnnotated"
"ncbiIsLabHost": "isLabHost"
"ncbiProteinCount": "proteinCount"
"ncbiSourceDb": "sourceDatabase"
"ncbiIsComplete": "completeness"
"ncbiLabHost": "labHost"
"ncbiUpdateDate": "updateDate"
"genbankAccession": "accession"
"biosampleAccession": "biosample"
"ncbi_gene_count": "geneCount"
string_to_list_mappings:
"bioprojects": "bioprojects"
"ncbiSraAccessions": "sraAccessions"
string_to_dict_mappings:
location:
"ncbiGeoLocation": "geographicLocation"
"ncbiGeoRegion": "geographicRegion"
submitter:
"ncbiSubmitterAffiliation": "affiliation"
"ncbiSubmitterNames": "names"
"ncbiSubmitterCountry": "country"
isolate:
"ncbiIsolateName": "name"
"ncbiIsolateSource": "source"
"ncbiCollectionDate": "collectionDate"
virus:
"ncbiVirusName": "organismName"
"ncbiVirusTaxId": "taxId"
host:
"ncbiHostTaxId": "taxId"
"ncbiHostName": "organismName"
unknown_mappings: # Keep for backward compatibility with old ingest pipeline
- ncbiHostCommonName
- ncbiPurposeOfSampling
- ncbiHostSex
group_name: insdc_ingest_group # Used only to set the group name, never read
username: insdc_ingest_user
password: insdc_ingest_user
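
The restructured `ncbi_mappings` block groups the mappings by the shape of the source value: `string_to_string_mappings` for scalar fields, `string_to_list_mappings` for list fields that get comma-joined, and `string_to_dict_mappings` for nested objects whose sub-keys are lifted into flat columns. A minimal sketch of how one record is flattened under this scheme — the record values and variable names are invented for illustration; only the mapping keys come from the config above:

```python
# Hypothetical NCBI Datasets record; the data values are made up,
# the key names follow the mapping values in defaults.yaml above.
record = {
    "releaseDate": "2024-01-15",
    "accession": "OQ123456.1",
    "sraAccessions": ["SRR0000001", "SRR0000002"],
    "submitter": {"affiliation": "Example Institute", "names": ["Doe,J."], "country": "Switzerland"},
}

string_to_string = {"ncbiReleaseDate": "releaseDate", "genbankAccession": "accession"}
string_to_list = {"ncbiSraAccessions": "sraAccessions"}
string_to_dict = {"submitter": {"ncbiSubmitterAffiliation": "affiliation", "ncbiSubmitterNames": "names"}}

flat: dict = {}
# Scalars are copied across under their Loculus-side names.
flat.update({key: record.get(value) for key, value in string_to_string.items()})
# List-valued fields are comma-joined into a single string column.
for key, value in string_to_list.items():
    flat[key] = ",".join(record.get(value) or [])
# Nested objects are flattened by pulling named sub-keys up to the top level.
for source_key, sub_mapping in string_to_dict.items():
    nested = record.get(source_key, {})
    flat.update({key: nested.get(value) for key, value in sub_mapping.items()})

print(flat)
# {'ncbiReleaseDate': '2024-01-15', 'genbankAccession': 'OQ123456.1',
#  'ncbiSraAccessions': 'SRR0000001,SRR0000002',
#  'ncbiSubmitterAffiliation': 'Example Institute', 'ncbiSubmitterNames': ['Doe,J.']}
```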
63 changes: 23 additions & 40 deletions ingest/scripts/format_ncbi_metadata.py
@@ -18,15 +18,11 @@


@dataclass
class Config:
simple_mappings: dict[str, str]
location_mappings: dict[str, str]
submitter_mappings: dict[str, str]
isolate_mappings: dict[str, str]
virus_mappings: dict[str, str]
host_mappings: dict[str, str]
class NCBIMappings:
string_to_string_mappings: dict[str, str]
string_to_list_mappings: dict[str, str]
string_to_dict_mappings: dict[str, dict[str, str]]
unknown_mappings: list[str]
parse_list: list[str]


def convert_to_title_case(name: str) -> str:
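
The old `Config` dataclass kept one attribute per mapping group; `NCBIMappings` folds them into three attributes keyed by value shape, plus the `unknown_mappings` list. A small sketch of constructing it directly, with the dictionaries trimmed down from defaults.yaml above:

```python
# Trimmed-down construction for illustration; in the script the full dictionaries
# come from the ncbi_mappings section of defaults.yaml (see main() further down).
# Assumes NCBIMappings from format_ncbi_metadata.py is in scope.
ncbi_mappings = NCBIMappings(
    string_to_string_mappings={"ncbiReleaseDate": "releaseDate", "genbankAccession": "accession"},
    string_to_list_mappings={"ncbiSraAccessions": "sraAccessions"},
    string_to_dict_mappings={
        "submitter": {"ncbiSubmitterNames": "names", "ncbiSubmitterCountry": "country"},
    },
    unknown_mappings=["ncbiHostCommonName"],
)
```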
@@ -113,57 +109,42 @@ def reformat_authors_from_genbank_to_loculus(
return formatted_authors


def extract_fields(row, config: Config) -> dict:
def extract_fields(row, ncbi_mappings: NCBIMappings) -> dict:
try:
extracted = {}
extracted.update({key: row.get(value) for key, value in config.simple_mappings.items()})
location = row.get("location", {})
extracted.update(
{key: location.get(value) for key, value in config.location_mappings.items()}
{key: row.get(value) for key, value in ncbi_mappings.string_to_string_mappings.items()}
)
submitter = row.get("submitter", {})
extracted.update(
{key: submitter.get(value) for key, value in config.submitter_mappings.items()}
)
isolate = row.get("isolate", {})
extracted.update(
{key: isolate.get(value) for key, value in config.isolate_mappings.items()}
)

host_lineage = row.get("host", {})
extracted.update(
{key: host_lineage.get(value) for key, value in config.host_mappings.items()}
{key: row.get(value) for key, value in ncbi_mappings.string_to_list_mappings.items()}
)

virus_lineage = row.get("virus", {})
extracted.update(
{key: virus_lineage.get(value) for key, value in config.virus_mappings.items()}
)

extracted.update(dict.fromkeys(config.unknown_mappings))
for field in ncbi_mappings.string_to_list_mappings:
if extracted[field]:
extracted[field] = ",".join(extracted[field])
else:
extracted[field] = ""
for field, sub_dict in ncbi_mappings.string_to_dict_mappings.items():
dict_as_string = row.get(field, {})
extracted.update({key: dict_as_string.get(value) for key, value in sub_dict.items()})
extracted.update(dict.fromkeys(ncbi_mappings.unknown_mappings))

except KeyError as e:
print(f"Missing key: {e}")
extracted = {}
return extracted


def jsonl_to_tsv(jsonl_file: str, tsv_file: str, config: Config) -> None:
def jsonl_to_tsv(jsonl_file: str, tsv_file: str, ncbi_mappings: NCBIMappings) -> None:
extracted_rows: list[dict[str, str]] = []
with (
open(jsonl_file, encoding="utf-8") as infile,
):
for line in infile:
row = json.loads(line.strip())
extracted = extract_fields(row, config)
extracted = extract_fields(row, ncbi_mappings)
extracted["ncbiSubmitterNames"] = reformat_authors_from_genbank_to_loculus(
extracted["ncbiSubmitterNames"], extracted["genbankAccession"]
)
for field in config.parse_list:
if extracted[field]:
extracted[field] = ",".join(extracted[field])
else:
extracted[field] = ""
extracted_rows.append(extracted)
df = pd.DataFrame(extracted_rows)
df.to_csv(
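
The body of `reformat_authors_from_genbank_to_loculus`, called above on `extracted["ncbiSubmitterNames"]`, is not part of this diff. As a rough, assumed illustration of the kind of normalization the name suggests — GenBank-style "Lastname,I." entries turned into a "Lastname, I.; ..." string — here is an independent sketch, not the project's implementation:

```python
def reformat_authors_sketch(genbank_authors: list[str], accession: str) -> str:
    """Assumed illustration only: turn GenBank-style names such as 'Smith,J.A.'
    into 'Smith, J. A.' and join them with '; '. Not the function used above."""
    # `accession` is accepted only to mirror the call signature above; the real
    # function presumably uses it for log messages.
    formatted = []
    for author in genbank_authors or []:
        last_name, _, initials = author.partition(",")
        # Space out run-together initials: 'J.A.' -> 'J. A.'
        spaced_initials = " ".join(part + "." for part in initials.split(".") if part)
        formatted.append(f"{last_name.strip()}, {spaced_initials}".strip().rstrip(","))
    return "; ".join(formatted)

# reformat_authors_sketch(["Smith,J.A.", "Doe,R."], "OQ123456.1")
# -> 'Smith, J. A.; Doe, R.'
```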
@@ -190,9 +171,11 @@ def main(config_file: str, input: str, output: str, log_level: str) -> None:

with open(config_file, encoding="utf-8") as file:
full_config = yaml.safe_load(file)
relevant_config = {key: full_config[key] for key in Config.__annotations__}
config = Config(**relevant_config)
jsonl_to_tsv(input, output, config=config)
ncbi_mappings_data = full_config["ncbi_mappings"]
relevant_config = {key: ncbi_mappings_data[key] for key in NCBIMappings.__annotations__}
ncbi_mappings = NCBIMappings(**relevant_config)

jsonl_to_tsv(input, output, ncbi_mappings=ncbi_mappings)


if __name__ == "__main__":
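
Putting the pieces together, `main()` now loads defaults.yaml, pulls out the `ncbi_mappings` section, builds an `NCBIMappings`, and hands it to `jsonl_to_tsv`. A usage sketch of that flow with placeholder file paths:

```python
import yaml

# Assumes NCBIMappings and jsonl_to_tsv from format_ncbi_metadata.py are in scope;
# the input/output paths are placeholders.
with open("ingest/config/defaults.yaml", encoding="utf-8") as file:
    full_config = yaml.safe_load(file)

ncbi_mappings_data = full_config["ncbi_mappings"]
ncbi_mappings = NCBIMappings(
    **{key: ncbi_mappings_data[key] for key in NCBIMappings.__annotations__}
)

jsonl_to_tsv("ncbi_metadata.ndjson", "ncbi_metadata.tsv", ncbi_mappings=ncbi_mappings)
```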
2 changes: 0 additions & 2 deletions kubernetes/loculus/values.yaml
@@ -1248,7 +1248,6 @@ defaultOrganisms:
header: "Collection Details"
- name: pangoLineage
initiallyVisible: true
type: pango_lineage
autocomplete: true
required: true
website:
@@ -1261,7 +1260,6 @@ defaultOrganisms:
defaultOrderBy: date
silo:
dateToSortBy: date
partitionBy: pangoLineage
preprocessing:
- version: 1
image: ghcr.io/loculus-project/preprocessing-dummy
