Skip to content

Commit

Permalink
feat(preprocessing): parse dates into date ranges to correctly repres…
Browse files Browse the repository at this point in the history
…ent incomplete dates (#3263)

---------

Co-authored-by: Felix Hennig <[email protected]>
Co-authored-by: Anna (Anya) Parker <[email protected]>
  • Loading branch information
3 people authored Dec 2, 2024
1 parent fd59cd9 commit 0fdac51
Show file tree
Hide file tree
Showing 5 changed files with 296 additions and 34 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ In the default configuration the pipeline performs:

The pipeline also formats metadata fields:

- **process date**: Takes a date string and returns a date field in the "%Y-%m-%d" format.
- **parse timestamp**: Takes a timestamp e.g. 2022-11-01T00:00:00Z and returns that field in the "%Y-%m-%d" format.
- **parse timestamp**: Takes an ISO timestamp e.g. `2022-11-01T00:00:00Z` and returns that field in the `%Y-%m-%d` format.

The code is available on [GitHub](https://github.com/loculus-project/loculus/tree/main/preprocessing/nextclade) under the [AGPL-3.0 license](https://github.com/loculus-project/loculus/blob/main/LICENSE).
73 changes: 51 additions & 22 deletions kubernetes/loculus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ logo:
defaultOrganismConfig: &defaultOrganismConfig
schema: &schema
loadSequencesAutomatically: true
organismName: "Ebola Zaire"
image: "/images/organisms/ebolazaire_small.jpg"
organismName: "Ebola Sudan"
image: "/images/organisms/ebolasudan_small.jpg"
### Field list
## General fields
# name: Key used across app to refer to this field (required)
Expand Down Expand Up @@ -64,15 +64,44 @@ defaultOrganismConfig: &defaultOrganismConfig
metadata:
- name: sampleCollectionDate
displayName: Collection date
type: date
initiallyVisible: true
header: Sample details
ingest: ncbiCollectionDate
preprocessing:
function: parse_and_assert_past_date
function: parse_date_into_range
inputs:
date: sampleCollectionDate
releaseDate: ncbiReleaseDate
args:
fieldType: dateRangeString
required: true
enableSubstringSearch: true
- name: sampleCollectionDateRangeLower
displayName: Collection date (lower bound)
type: date
initiallyVisible: true
header: Sample details
preprocessing:
function: parse_date_into_range
inputs:
date: sampleCollectionDate
releaseDate: ncbiReleaseDate
args:
fieldType: dateRangeLower
noInput: true
- name: sampleCollectionDateRangeUpper
displayName: Collection date (upper bound)
type: date
initiallyVisible: true
header: Sample details
preprocessing:
function: parse_date_into_range
inputs:
date: sampleCollectionDate
releaseDate: ncbiReleaseDate
args:
fieldType: dateRangeUpper
noInput: true
- name: displayName
preprocessing:
function: concatenate
Expand All @@ -81,7 +110,7 @@ defaultOrganismConfig: &defaultOrganismConfig
sampleCollectionDate: sampleCollectionDate
args:
order: [geoLocCountry, accession_version, sampleCollectionDate]
type: [string, string, date]
type: [string, string, string]
noInput: true
- name: ncbiReleaseDate
displayName: NCBI release date
Expand Down Expand Up @@ -1088,7 +1117,7 @@ defaultOrganismConfig: &defaultOrganismConfig
defaultOrderBy: sampleCollectionDate
defaultOrder: descending
silo:
dateToSortBy: sampleCollectionDate
dateToSortBy: sampleCollectionDateRangeUpper
extraInputFields: []
preprocessing:
- &preprocessing
Expand All @@ -1099,51 +1128,51 @@ defaultOrganismConfig: &defaultOrganismConfig
replicas: 2
configFile: &preprocessingConfigFile
log_level: DEBUG
nextclade_dataset_name: nextstrain/ebola/zaire
nextclade_dataset_name: nextstrain/ebola/sudan
genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L]
batch_size: 100
ingest: &ingest
image: ghcr.io/loculus-project/ingest
configFile: &ingestConfigFile
taxon_id: 3052462
taxon_id: 186540
enaDeposition:
configFile:
taxon_id: 186538
scientific_name: "Zaire ebolavirus"
taxon_id: 186540
scientific_name: "Sudan ebolavirus"
molecule_type: "genomic RNA"
referenceGenomes:
nucleotideSequences:
- name: "main"
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/reference.fasta]]"
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/reference.fasta]]"
insdcAccessionFull: NC_002549.1 # NOTE(review): unchanged while the surrounding reference URLs/taxon switched from ebola-zaire to ebola-sudan - confirm this accession matches the new reference
genes:
- name: NP
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/NP.fasta]]"
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/NP.fasta]]"
- name: VP35
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/VP35.fasta]]"
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/VP35.fasta]]"
- name: VP40
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/VP40.fasta]]"
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/VP40.fasta]]"
- name: GP
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/GP.fasta]]"
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/GP.fasta]]"
- name: ssGP
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/ssGP.fasta]]"
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/ssGP.fasta]]"
- name: sGP
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/sGP.fasta]]"
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/sGP.fasta]]"
- name: VP30
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/VP30.fasta]]"
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/VP30.fasta]]"
- name: VP24
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/VP24.fasta]]"
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/VP24.fasta]]"
- name: L
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/L.fasta]]"
sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/L.fasta]]"
defaultOrganisms:
ebola-zaire:
ebola-sudan:
<<: *defaultOrganismConfig
preprocessing:
- <<: *preprocessing
configFile:
<<: *preprocessingConfigFile
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output
nextclade_dataset_name: nextstrain/ebola/zaire
nextclade_dataset_name: nextstrain/ebola/sudan
west-nile:
<<: *defaultOrganismConfig
schema:
Expand Down
12 changes: 6 additions & 6 deletions preprocessing/nextclade/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ channels:
- nodefaults
dependencies:
- python=3.12
- biopython=1.83
- dpath=2.1
- nextclade=3.8
- pip=24.0
- biopython=1.84
- dpath=2.2
- nextclade=3.9
- pip=24.3
- PyYAML=6.0
- pyjwt=2.8
- pyjwt=2.10
- python-dateutil=2.9
- pytz=2024.1
- pytz=2024.2
- requests=2.32
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
This makes it easy to test and reason about the code
"""

import calendar
import json
import logging
import re
from dataclasses import dataclass
from datetime import datetime

import dateutil.parser as dateutil
Expand Down Expand Up @@ -198,6 +200,162 @@ def check_date(
],
)

@staticmethod
def parse_date_into_range(
input_data: InputMetadata,
output_field: str,
args: FunctionArgs = None, # args is essential - even if Pylance says it's not used
) -> ProcessingResult:
"""Parse date string (`input.date`) formatted as one of YYYY | YYYY-MM | YYYY-MM-DD into a range using upper bound (`input.releaseDate`)
Return value determined FunctionArgs:
fieldType: "dateRangeString" | "dateRangeLower" | "dateRangeUpper"
Default fieldType is "dateRangeString"
"""
if args is None:
args = {"fieldType": "dateRangeString"}

logger.debug(f"input_data: {input_data}")

input_date_str = input_data["date"]

release_date_str = input_data.get("releaseDate", "") or ""
try:
release_date = dateutil.parse(release_date_str).astimezone(pytz.utc)
except Exception:
release_date = None

now = datetime.now(tz=pytz.utc)
max_upper_limit = min(now, release_date) if release_date else now

if not input_date_str:
return ProcessingResult(
datum=max_upper_limit.strftime("%Y-%m-%d")
if args["fieldType"] == "dateRangeUpper"
else None,
warnings=[],
errors=[],
)

formats_to_messages = {
"%Y-%m-%d": None,
"%Y-%m": "Day is missing. Assuming date is some time in the month.",
"%Y": "Month and day are missing. Assuming date is some time in the year.",
}

warnings = []
errors = []

@dataclass
class DateRange:
date_range_string: str | None
date_range_lower: datetime | None
date_range_upper: datetime

for format, message in formats_to_messages.items():
try:
parsed_date = datetime.strptime(input_date_str, format).replace(tzinfo=pytz.utc)
except ValueError:
continue
match format:
case "%Y-%m-%d":
datum = DateRange(
date_range_string=parsed_date.strftime(format),
date_range_lower=parsed_date,
date_range_upper=parsed_date,
)
case "%Y-%m":
datum = DateRange(
date_range_string=parsed_date.strftime(format),
date_range_lower=parsed_date.replace(day=1),
date_range_upper=(
parsed_date.replace(
day=calendar.monthrange(parsed_date.year, parsed_date.month)[1]
)
),
)
case "%Y":
datum = DateRange(
date_range_string=parsed_date.strftime(format),
date_range_lower=parsed_date.replace(month=1, day=1),
date_range_upper=parsed_date.replace(month=12, day=31),
)

logger.debug(f"parsed_date: {datum}")

if datum.date_range_upper > max_upper_limit:
logger.debug(
"Tightening upper limit due to release date or current date. "
f"Original upper limit: {datum.date_range_upper},"
f"new upper limit: {max_upper_limit}"
)
datum.date_range_upper = max_upper_limit

if message:
warnings.append(
ProcessingAnnotation(
source=[
AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
],
message=f"Metadata field {output_field}:'{input_date_str}' - " + message,
)
)

if datum.date_range_lower > datetime.now(tz=pytz.utc):
logger.debug(
f"Lower range of date: {datum.date_range_lower} > {datetime.now(tz=pytz.utc)}"
)
errors.append(
ProcessingAnnotation(
source=[
AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
],
message=f"Metadata field {output_field}:'{input_date_str}' is in the future.",
)
)

if release_date and (datum.date_range_lower > release_date):
logger.debug(f"Lower range of date: {parsed_date} > release_date: {release_date}")
errors.append(
ProcessingAnnotation(
source=[
AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
],
message=(
f"Metadata field {output_field}:'{input_date_str}'"
"is after release date."
),
)
)

match args["fieldType"]:
case "dateRangeString":
return_value = datum.date_range_string
case "dateRangeLower":
return_value = datum.date_range_lower.strftime("%Y-%m-%d")
warnings = errors = []
case "dateRangeUpper":
return_value = datum.date_range_upper.strftime("%Y-%m-%d")
warnings = errors = []
case _:
msg = f"Config error: Unknown fieldType: {args['fieldType']}"
raise ValueError(msg)

return ProcessingResult(datum=return_value, warnings=warnings, errors=errors)

# If all parsing attempts fail, it's an unrecognized format
return ProcessingResult(
datum=None,
warnings=[],
errors=[
ProcessingAnnotation(
source=[
AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
],
message=f"Metadata field {output_field}: Date {input_date_str} could not be parsed.",
)
],
)

@staticmethod
def parse_and_assert_past_date( # noqa: C901
input_data: InputMetadata,
Expand Down Expand Up @@ -693,7 +851,9 @@ def range_string(_start: str | int, _end: str | int) -> str:
return f"{start}-{end}"
return str(start)

frame_shifts = json.loads(input.replace("'", '"')) # Required for json.loads to recognize input as json string and convert to dict
frame_shifts = json.loads(
input.replace("'", '"')
) # Required for json.loads to recognize input as json string and convert to dict
frame_shift_strings = []
for frame_shift in frame_shifts:
nuc_range_list = [range_string(nuc["begin"], nuc["end"]) for nuc in frame_shift["nucAbs"]]
Expand Down
Loading

0 comments on commit 0fdac51

Please sign in to comment.