Skip to content

Commit

Permalink
debugged stage and upload unittest
Browse files Browse the repository at this point in the history
  • Loading branch information
tclose committed Sep 20, 2024
1 parent bb2b215 commit e566c81
Show file tree
Hide file tree
Showing 8 changed files with 156 additions and 93 deletions.
15 changes: 14 additions & 1 deletion conftest.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,20 @@
import os
from pathlib import Path
import logging
import typing as ty
import tempfile
from logging.handlers import SMTPHandler
import pytest
from click.testing import CliRunner
import xnat4tests
import xnat4tests # type: ignore[import-untyped]
from datetime import datetime
from xnat_ingest.utils import logger
from medimages4tests.dummy.raw.pet.siemens.biograph_vision.vr20b.pet_listmode import (
get_data as get_listmode_data,
)
from medimages4tests.dummy.raw.pet.siemens.biograph_vision.vr20b.pet_countrate import (
get_data as get_countrate_data,
)

# Set DEBUG logging for unittests

Expand Down Expand Up @@ -110,3 +117,9 @@ def emit(self, record):
# Capture the email message and append it to the list
msg = self.format(record)
self.emails.append(msg)


def get_raw_data_files(out_dir: ty.Optional[Path] = None, **kwargs) -> ty.List[Path]:
if out_dir is None:
out_dir = Path(tempfile.mkdtemp())
return get_listmode_data(out_dir, **kwargs) + get_countrate_data(out_dir, **kwargs)
2 changes: 1 addition & 1 deletion scripts/run_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from xnat_ingest.utils import show_cli_trace
from click.testing import CliRunner

PATTERN = "{PatientName.given_name}_{PatientName.family_name}_{SeriesDate}.*"
PATTERN = "{PatientName.family_name}_{PatientName.given_name}_{SeriesDate}.*"

runner = CliRunner()

Expand Down
10 changes: 5 additions & 5 deletions xnat_ingest/cli/stage.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@
@click.option(
"--resource-field",
type=str,
default="ImageType",
default="ImageType[-1]",
envvar="XNAT_INGEST_STAGE_RESOURCE",
help=(
"The keyword of the metadata field to extract the XNAT imaging resource ID from "
Expand All @@ -110,19 +110,19 @@
@click.option(
"--associated-files",
type=AssociatedFiles.cli_type,
nargs=2,
nargs=3,
default=None,
multiple=True,
envvar="XNAT_INGEST_STAGE_ASSOCIATED",
metavar="<glob> <id-pattern>",
metavar="<datatype> <glob> <id-pattern>",
help=(
'The "glob" arg is a glob pattern by which to detect associated files to be '
"attached to the DICOM sessions. Note that when this pattern corresponds to a "
"relative path it is considered to be relative to the parent directory containing "
"the DICOMs for the session NOT the current working directory Can contain string "
"templates corresponding to DICOM metadata fields, which are substituted before "
"the glob is called. For example, "
'"./associated/{PatientName.given_name}_{PatientName.family_name}/*)" '
'"./associated/{PatientName.family_name}_{PatientName.given_name}/*)" '
"will find all files under the subdirectory within '/path/to/dicoms/associated' that matches "
"<GIVEN-NAME>_<FAMILY-NAME>. Will be interpreted as being relative to `dicoms_dir` "
"if a relative path is provided.\n"
Expand Down Expand Up @@ -308,7 +308,7 @@ def stage(
# Identify theDeidentify files if necessary and save them to the staging directory
session.stage(
staging_dir,
associated_files=associated_files,
associated_file_groups=associated_files,
remove_original=delete,
deidentify=deidentify,
project_list=project_list,
Expand Down
1 change: 1 addition & 0 deletions xnat_ingest/cli/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,7 @@ def iter_staged_sessions():
image_type = scan.metadata.get("ImageType")
if image_type and image_type[:2] == ["DERIVED", "SECONDARY"]:
modality = "SC"
resource_name = "secondary"
else:
modality = scan.metadata.get(
"Modality", default_scan_modality
Expand Down
80 changes: 55 additions & 25 deletions xnat_ingest/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
from tqdm import tqdm
import yaml
import pydicom
from fileformats.generic import File
from fileformats.application import Dicom
from fileformats.medimage import DicomSeries
from fileformats.core import from_paths, FileSet, DataType, from_mime, to_mime
Expand All @@ -43,9 +42,20 @@ def scan_type_converter(scan_type: str) -> str:

@attrs.define
class ImagingScan:
"""Representation of a scan to be uploaded to XNAT
Parameters
----------
id: str
the ID of the scan on XNAT
type: str
the scan type/description
"""

id: str
type: str = attrs.field(converter=scan_type_converter)
resources: ty.Dict[str, FileSet] = attrs.field()
associated: bool = False

def __contains__(self, resource_name):
return resource_name in self.resources
Expand Down Expand Up @@ -115,13 +125,22 @@ def modalities(self) -> ty.Set[str]:
return modalities

@property
def dicoms(self):
return (scan["DICOM"] for scan in self.scans.values() if "DICOM" in scan)
def parent_dirs(self) -> ty.Set[Path]:
"Return parent directories for all resources in the session"
return set(r.parent for r in self.resources)

@property
def dicom_dirs(self) -> ty.List[Path]:
"A common parent directory for all the top-level paths in the file-set"
return [p.parent for p in self.dicoms] # type: ignore
def resources(self) -> ty.List[FileSet]:
return [r for p in self.scans.values() for r in p.resources.values()]

@property
def primary_resources(self) -> ty.List[FileSet]:
return [
r
for s in self.scans.values()
for r in s.resources.values()
if not s.associated
]

def select_resources(
self,
Expand Down Expand Up @@ -199,13 +218,13 @@ def select_resources(

@cached_property
def metadata(self):
all_dicoms = list(self.dicoms)
all_keys = [list(d.metadata.keys()) for d in all_dicoms if d.metadata]
primary_resources = self.primary_resources
all_keys = [list(d.metadata.keys()) for d in primary_resources if d.metadata]
common_keys = [
k for k in set(chain(*all_keys)) if all(k in keys for keys in all_keys)
]
collated = {k: all_dicoms[0].metadata[k] for k in common_keys}
for i, series in enumerate(all_dicoms[1:], start=1):
collated = {k: primary_resources[0].metadata[k] for k in common_keys}
for i, series in enumerate(primary_resources[1:], start=1):
for key in common_keys:
val = series.metadata[key]
if val != collated[key]:
Expand All @@ -232,7 +251,7 @@ def from_paths(
visit_field: str = "AccessionNumber",
scan_id_field: str = "SeriesNumber",
scan_desc_field: str = "SeriesDescription",
resource_field: str = "ImageType",
resource_field: str = "ImageType[-1]",
session_field: str | None = "StudyInstanceUID",
project_id: str | None = None,
) -> ty.List[Self]:
Expand Down Expand Up @@ -260,7 +279,7 @@ def from_paths(
by default "SeriesDescription"
resource_field: str
the metadata field that contains the XNAT resource ID for the imaging session,
by default "ImageType"
by default "ImageType[-1]"
session_field : str, optional
the name of the metadata field that uniquely identifies the session, used
to check that the values extracted from the IDs across the DICOM scans are
Expand Down Expand Up @@ -331,8 +350,13 @@ def from_paths(
session_uid = resource.metadata[session_field] if session_field else None

def get_id(field_type: str, field_name: str) -> str:
if match := re.match(r"(\w+)\[([\-\d]+)\]", field_name):
field_name, index = match.groups()
index = int(index)
else:
index = None
try:
value = resource.metadata[field_name]
value = str(resource.metadata[field_name])
except KeyError:
if session_uid and field_type in ("project", "subject", "visit"):
value = (
Expand All @@ -349,6 +373,8 @@ def get_id(field_type: str, field_name: str) -> str:
f"Did not find '{field_name}' field in {resource}, "
"cannot uniquely identify the resource"
)
if index is not None:
value = value[index]
return value

if not project_id:
Expand All @@ -357,7 +383,10 @@ def get_id(field_type: str, field_name: str) -> str:
visit_id = get_id("visit", visit_field)
scan_id = get_id("scan", scan_id_field)
scan_type = get_id("scan type", scan_desc_field)
resource_id = get_id("resource", resource_field)
if isinstance(resource, DicomSeries):
resource_id = "DICOM"
else:
resource_id = get_id("resource", resource_field)

if session_uid is None:
session_uid = (project_id, subject_id, visit_id)
Expand Down Expand Up @@ -592,7 +621,7 @@ def save(self, save_dir: Path, just_manifest: bool = False) -> "ImagingSession":
def stage(
self,
dest_dir: Path,
associated_files: ty.Optional[AssociatedFiles] = None,
associated_file_groups: ty.Collection[AssociatedFiles] = (),
remove_original: bool = False,
deidentify: bool = True,
project_list: ty.Optional[ty.List[str]] = None,
Expand All @@ -610,12 +639,12 @@ def stage(
work_dir : Path, optional
the directory the staged sessions are created in before they are moved into
the staging directory
associated_files : ty.Tuple[str, str], optional
associated_file_groups : Collection[AssociatedFiles], optional
Glob pattern used to select the non-dicom files to include in the session. Note
that the pattern is relative to the parent directory containing the DICOM files
NOT the current working directory.
The glob pattern can contain string template placeholders corresponding to DICOM
metadata (e.g. '{PatientName.given_name}_{PatientName.family_name}'), which
metadata (e.g. '{PatientName.family_name}_{PatientName.given_name}'), which
are substituted before the string is used to glob the non-DICOM files. In
order to deidentify the filenames, the pattern must explicitly reference all
identifiable fields in string template placeholders. By default, None
Expand Down Expand Up @@ -653,6 +682,7 @@ def stage(
project_dir = "INVALID-UNRECOGNISED-PROJECT-" + self.project_id
session_dir = dest_dir / project_dir / self.subject_id / self.visit_id
session_dir.mkdir(parents=True)
session_metadata = self.metadata
for scan in tqdm(
self.scans.values(), f"Staging DICOM sessions to {session_dir}"
):
Expand Down Expand Up @@ -686,14 +716,14 @@ def stage(
staged_scans.append(
ImagingScan(id=scan.id, type=scan.type, resources=staged_resources)
)
if associated_files:
for associated_files in associated_file_groups:
# substitute string templates int the glob template with values from the
# DICOM metadata to construct a glob pattern to select files associated
# with current session
associated_fspaths: ty.Set[Path] = set()
for dicom_dir in self.dicom_dirs:
for parent_dir in self.parent_dirs:
assoc_glob = str(
dicom_dir / associated_files.glob.format(**self.metadata)
parent_dir / associated_files.glob.format(**session_metadata)
)
if spaces_to_underscores:
assoc_glob = assoc_glob.replace(" ", "_")
Expand All @@ -705,7 +735,7 @@ def stage(
logger.info(
"Found %s associated file paths matching '%s'",
len(associated_fspaths),
assoc_glob,
associated_files.glob,
)

tmpdir = session_dir / ".tmp"
Expand All @@ -720,12 +750,12 @@ def stage(
assoc_glob_pattern = associated_files.glob
else:
assoc_glob_pattern = (
str(dicom_dir) + os.path.sep + associated_files.glob
str(parent_dir) + os.path.sep + associated_files.glob
)
transformed_fspaths = transform_paths(
list(associated_fspaths),
assoc_glob_pattern,
self.metadata,
session_metadata,
staged_metadata,
spaces_to_underscores=spaces_to_underscores,
)
Expand Down Expand Up @@ -804,13 +834,13 @@ def stage(
else:
shutil.copyfile(fspath, dest_path)
resource_fspaths.append(dest_path)
format_type = File if len(fspaths) == 1 else FileSet
scan_resources[resource_name] = format_type(resource_fspaths)
scan_resources[resource_name] = associated_files.datatype(resource_fspaths)
staged_scans.append(
ImagingScan(
id=scan_id,
type=scan_type,
resources=scan_resources,
associated=True,
)
)
os.rmdir(tmpdir) # Should be empty
Expand Down
Loading

0 comments on commit e566c81

Please sign in to comment.