feat(preprocessing): parse dates into date ranges to correctly represent incomplete dates (#3263)

Co-authored-by: Felix Hennig <[email protected]>
Co-authored-by: Anna (Anya) Parker <[email protected]>
loculus-project · Dec 2, 2024 · 0fdac51 · 0fdac51
1 parent fd59cd9
commit 0fdac51
Show file tree

Hide file tree

Showing 5 changed files with 296 additions and 34 deletions.
diff --git a/docs/src/content/docs/for-administrators/existing-preprocessing-pipelines.md b/docs/src/content/docs/for-administrators/existing-preprocessing-pipelines.md
@@ -34,7 +34,6 @@ In the default configuration the pipeline performs:
 
 The pipeline also formats metadata fields:
 
--   **process date**: Takes a date string and returns a date field in the "%Y-%m-%d" format.
--   **parse timestamp**: Takes a timestamp e.g. 2022-11-01T00:00:00Z and returns that field in the "%Y-%m-%d" format.
+-   **parse timestamp**: Takes an ISO timestamp e.g. `2022-11-01T00:00:00Z` and returns that field in the `%Y-%m-%d` format.
 
 The code is available on [GitHub](https://github.com/loculus-project/loculus/tree/main/preprocessing/nextclade) under the [AGPL-3.0 license](https://github.com/loculus-project/loculus/blob/main/LICENSE).
diff --git a/kubernetes/loculus/values.yaml b/kubernetes/loculus/values.yaml
@@ -31,8 +31,8 @@ logo:
 defaultOrganismConfig: &defaultOrganismConfig
   schema: &schema
     loadSequencesAutomatically: true
-    organismName: "Ebola Zaire"
-    image: "/images/organisms/ebolazaire_small.jpg"
+    organismName: "Ebola Sudan"
+    image: "/images/organisms/ebolasudan_small.jpg"
     ### Field list
     ## General fields
     # name: Key used across app to refer to this field (required)
@@ -64,15 +64,44 @@ defaultOrganismConfig: &defaultOrganismConfig
     metadata:
       - name: sampleCollectionDate
         displayName: Collection date
-        type: date
         initiallyVisible: true
         header: Sample details
         ingest: ncbiCollectionDate
         preprocessing:
-          function: parse_and_assert_past_date
+          function: parse_date_into_range
           inputs:
             date: sampleCollectionDate
+            releaseDate: ncbiReleaseDate
+          args:
+            fieldType: dateRangeString
         required: true
+        enableSubstringSearch: true
+      - name: sampleCollectionDateRangeLower
+        displayName: Collection date (lower bound)
+        type: date
+        initiallyVisible: true
+        header: Sample details
+        preprocessing:
+          function: parse_date_into_range
+          inputs:
+            date: sampleCollectionDate
+            releaseDate: ncbiReleaseDate
+          args:
+            fieldType: dateRangeLower
+        noInput: true
+      - name: sampleCollectionDateRangeUpper
+        displayName: Collection date (upper bound)
+        type: date
+        initiallyVisible: true
+        header: Sample details
+        preprocessing:
+          function: parse_date_into_range
+          inputs:
+            date: sampleCollectionDate
+            releaseDate: ncbiReleaseDate
+          args:
+            fieldType: dateRangeUpper
+        noInput: true
       - name: displayName
         preprocessing:
           function: concatenate
@@ -81,7 +110,7 @@ defaultOrganismConfig: &defaultOrganismConfig
             sampleCollectionDate: sampleCollectionDate
           args:
             order: [geoLocCountry, accession_version, sampleCollectionDate]
-            type: [string, string, date]
+            type: [string, string, string]
         noInput: true
       - name: ncbiReleaseDate
         displayName: NCBI release date
@@ -1088,7 +1117,7 @@ defaultOrganismConfig: &defaultOrganismConfig
       defaultOrderBy: sampleCollectionDate
       defaultOrder: descending
     silo:
-      dateToSortBy: sampleCollectionDate
+      dateToSortBy: sampleCollectionDateRangeUpper
     extraInputFields: []
   preprocessing:
     - &preprocessing
@@ -1099,51 +1128,51 @@ defaultOrganismConfig: &defaultOrganismConfig
       replicas: 2
       configFile: &preprocessingConfigFile
         log_level: DEBUG
-        nextclade_dataset_name: nextstrain/ebola/zaire
+        nextclade_dataset_name: nextstrain/ebola/sudan
         genes: [NP, VP35, VP40, GP, sGP, ssGP, VP30, VP24, L]
         batch_size: 100
   ingest: &ingest
     image: ghcr.io/loculus-project/ingest
     configFile: &ingestConfigFile
-      taxon_id: 3052462
+      taxon_id: 186540
   enaDeposition:
     configFile:
-      taxon_id: 186538
-      scientific_name: "Zaire ebolavirus"
+      taxon_id: 186540
+      scientific_name: "Sudan ebolavirus"
       molecule_type: "genomic RNA"
   referenceGenomes:
     nucleotideSequences:
       - name: "main"
-        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/reference.fasta]]"
+        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/reference.fasta]]"
         insdcAccessionFull: NC_002549.1
     genes:
       - name: NP
-        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/NP.fasta]]"
+        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/NP.fasta]]"
       - name: VP35
-        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/VP35.fasta]]"
+        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/VP35.fasta]]"
       - name: VP40
-        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/VP40.fasta]]"
+        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/VP40.fasta]]"
       - name: GP
-        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/GP.fasta]]"
+        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/GP.fasta]]"
       - name: ssGP
-        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/ssGP.fasta]]"
+        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/ssGP.fasta]]"
       - name: sGP
-        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/sGP.fasta]]"
+        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/sGP.fasta]]"
       - name: VP30
-        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/VP30.fasta]]"
+        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/VP30.fasta]]"
       - name: VP24
-        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/VP24.fasta]]"
+        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/VP24.fasta]]"
       - name: L
-        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-zaire/L.fasta]]"
+        sequence: "[[URL:https://corneliusroemer.github.io/seqs/artefacts/ebola-sudan/L.fasta]]"
 defaultOrganisms:
-  ebola-zaire:
+  ebola-sudan:
     <<: *defaultOrganismConfig
     preprocessing:
       - <<: *preprocessing
         configFile:
           <<: *preprocessingConfigFile
           nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/ebola/data_output
-          nextclade_dataset_name: nextstrain/ebola/zaire
+          nextclade_dataset_name: nextstrain/ebola/sudan
   west-nile:
     <<: *defaultOrganismConfig
     schema:

diff --git a/preprocessing/nextclade/environment.yml b/preprocessing/nextclade/environment.yml
@@ -5,12 +5,12 @@ channels:
   - nodefaults
 dependencies:
   - python=3.12
-  - biopython=1.83
-  - dpath=2.1
-  - nextclade=3.8
-  - pip=24.0
+  - biopython=1.84
+  - dpath=2.2
+  - nextclade=3.9
+  - pip=24.3
   - PyYAML=6.0
-  - pyjwt=2.8
+  - pyjwt=2.10
   - python-dateutil=2.9
-  - pytz=2024.1
+  - pytz=2024.2
   - requests=2.32
diff --git a/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py b/preprocessing/nextclade/src/loculus_preprocessing/processing_functions.py
@@ -3,9 +3,11 @@
 This makes it easy to test and reason about the code
 """
 
+import calendar
 import json
 import logging
 import re
+from dataclasses import dataclass
 from datetime import datetime
 
 import dateutil.parser as dateutil
@@ -198,6 +200,162 @@ def check_date(
                 ],
             )
 
+    @staticmethod
+    def parse_date_into_range(
+        input_data: InputMetadata,
+        output_field: str,
+        args: FunctionArgs = None,  # args is essential - even if Pylance says it's not used
+    ) -> ProcessingResult:
+        """Parse a possibly incomplete date string into a date range.
+
+        `input_data["date"]` must be formatted as one of YYYY | YYYY-MM | YYYY-MM-DD.
+        The upper bound of the range is capped by the earlier of the current time
+        and the optional `input_data["releaseDate"]`.
+
+        The return value is selected by `args["fieldType"]`:
+            "dateRangeString": the parsed date re-formatted with the format it matched
+            "dateRangeLower": first day of the range, formatted as %Y-%m-%d
+            "dateRangeUpper": last day of the range after capping, formatted as %Y-%m-%d
+        If `args` is None or has no "fieldType" entry, "dateRangeString" is used.
+
+        For a date that parses, warnings and errors are only attached to the
+        "dateRangeString" result (presumably so the same problem is not reported
+        once per derived field). An unparseable date yields an error for every
+        fieldType. An empty date yields no datum, except for "dateRangeUpper"
+        which still receives the cap.
+
+        Raises:
+            ValueError: if the date parses but fieldType is not one of the three above.
+        """
+        # Tolerate both a missing args dict and a dict without "fieldType"
+        field_type = (args or {}).get("fieldType", "dateRangeString")
+
+        logger.debug(f"input_data: {input_data}")
+
+        input_date_str = input_data["date"]
+
+        release_date_str = input_data.get("releaseDate", "") or ""
+        try:
+            release_date = dateutil.parse(release_date_str)
+            # Treat a naive timestamp as UTC, like the sample date below.
+            # astimezone() on a naive datetime would assume the server's local timezone.
+            if release_date.tzinfo is None:
+                release_date = release_date.replace(tzinfo=pytz.utc)
+            release_date = release_date.astimezone(pytz.utc)
+        except Exception:
+            # Best effort: a missing or unparseable release date simply adds no cap
+            release_date = None
+
+        now = datetime.now(tz=pytz.utc)
+        max_upper_limit = min(now, release_date) if release_date else now
+
+        if not input_date_str:
+            # Without a date only the upper bound is known
+            return ProcessingResult(
+                datum=max_upper_limit.strftime("%Y-%m-%d")
+                if field_type == "dateRangeUpper"
+                else None,
+                warnings=[],
+                errors=[],
+            )
+
+        # Tried in order, most specific first; the message becomes a warning on a match
+        formats_to_messages = {
+            "%Y-%m-%d": None,
+            "%Y-%m": "Day is missing. Assuming date is some time in the month.",
+            "%Y": "Month and day are missing. Assuming date is some time in the year.",
+        }
+
+        warnings = []
+        errors = []
+
+        @dataclass
+        class DateRange:
+            date_range_string: str | None
+            date_range_lower: datetime | None
+            date_range_upper: datetime
+
+        for date_format, message in formats_to_messages.items():
+            try:
+                parsed_date = datetime.strptime(input_date_str, date_format).replace(
+                    tzinfo=pytz.utc
+                )
+            except ValueError:
+                continue
+            match date_format:
+                case "%Y-%m-%d":
+                    datum = DateRange(
+                        date_range_string=parsed_date.strftime(date_format),
+                        date_range_lower=parsed_date,
+                        date_range_upper=parsed_date,
+                    )
+                case "%Y-%m":
+                    # monthrange()[1] is the number of days in that month
+                    datum = DateRange(
+                        date_range_string=parsed_date.strftime(date_format),
+                        date_range_lower=parsed_date.replace(day=1),
+                        date_range_upper=(
+                            parsed_date.replace(
+                                day=calendar.monthrange(parsed_date.year, parsed_date.month)[1]
+                            )
+                        ),
+                    )
+                case "%Y":
+                    datum = DateRange(
+                        date_range_string=parsed_date.strftime(date_format),
+                        date_range_lower=parsed_date.replace(month=1, day=1),
+                        date_range_upper=parsed_date.replace(month=12, day=31),
+                    )
+
+            logger.debug(f"parsed_date: {datum}")
+
+            if datum.date_range_upper > max_upper_limit:
+                logger.debug(
+                    "Tightening upper limit due to release date or current date. "
+                    f"Original upper limit: {datum.date_range_upper}, "
+                    f"new upper limit: {max_upper_limit}"
+                )
+                datum.date_range_upper = max_upper_limit
+
+            if message:
+                warnings.append(
+                    ProcessingAnnotation(
+                        source=[
+                            AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
+                        ],
+                        message=f"Metadata field {output_field}:'{input_date_str}' - " + message,
+                    )
+                )
+
+            if datum.date_range_lower > now:
+                logger.debug(f"Lower range of date: {datum.date_range_lower} > {now}")
+                errors.append(
+                    ProcessingAnnotation(
+                        source=[
+                            AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
+                        ],
+                        message=f"Metadata field {output_field}:'{input_date_str}' is in the future.",
+                    )
+                )
+
+            if release_date and (datum.date_range_lower > release_date):
+                logger.debug(
+                    f"Lower range of date: {datum.date_range_lower} > release_date: {release_date}"
+                )
+                errors.append(
+                    ProcessingAnnotation(
+                        source=[
+                            AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
+                        ],
+                        message=(
+                            f"Metadata field {output_field}:'{input_date_str}' "
+                            "is after release date."
+                        ),
+                    )
+                )
+
+            match field_type:
+                case "dateRangeString":
+                    return_value = datum.date_range_string
+                case "dateRangeLower":
+                    return_value = datum.date_range_lower.strftime("%Y-%m-%d")
+                    # Annotations are only reported on the dateRangeString field
+                    warnings, errors = [], []
+                case "dateRangeUpper":
+                    return_value = datum.date_range_upper.strftime("%Y-%m-%d")
+                    # Annotations are only reported on the dateRangeString field
+                    warnings, errors = [], []
+                case _:
+                    msg = f"Config error: Unknown fieldType: {field_type}"
+                    raise ValueError(msg)
+
+            return ProcessingResult(datum=return_value, warnings=warnings, errors=errors)
+
+        # If all parsing attempts fail, it's an unrecognized format
+        return ProcessingResult(
+            datum=None,
+            warnings=[],
+            errors=[
+                ProcessingAnnotation(
+                    source=[
+                        AnnotationSource(name=output_field, type=AnnotationSourceType.METADATA)
+                    ],
+                    message=f"Metadata field {output_field}: Date {input_date_str} could not be parsed.",
+                )
+            ],
+        )
+
+
     @staticmethod
     def parse_and_assert_past_date(  # noqa: C901
         input_data: InputMetadata,
@@ -693,7 +851,9 @@ def range_string(_start: str | int, _end: str | int) -> str:
             return f"{start}-{end}"
         return str(start)
 
-    frame_shifts = json.loads(input.replace("'", '"'))  # Required for json.loads to recognize input as json string and convert to dict
+    frame_shifts = json.loads(
+        input.replace("'", '"')
+    )  # Required for json.loads to recognize input as json string and convert to dict
     frame_shift_strings = []
     for frame_shift in frame_shifts:
         nuc_range_list = [range_string(nuc["begin"], nuc["end"]) for nuc in frame_shift["nucAbs"]]