Skip to content

Commit

Permalink
changes for production
Browse files Browse the repository at this point in the history
  • Loading branch information
Tomáš Houfek committed Dec 12, 2024
1 parent fdb8a19 commit b8b9fdc
Show file tree
Hide file tree
Showing 154 changed files with 387 additions and 33 deletions.
14 changes: 14 additions & 0 deletions Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Development image for the run organiser (used by compose.yaml).
FROM bitnami/python:3.10

# WORKDIR creates the directory when it does not exist, so no separate
# `RUN mkdir` layer is needed.
WORKDIR /organisation-app

# Install dependencies before copying the sources so this layer is served
# from the build cache when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# COPY (rather than ADD) for plain local files: no archive extraction or
# URL handling is wanted here.
COPY main.py .
COPY organiser/ organiser/
COPY tests/ tests/

# Run the application as the unprivileged UID 1001.
USER 1001
File renamed without changes.
8 changes: 6 additions & 2 deletions compose.yaml
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
version: '3.0'
services:
run:
build: .
build:
context: .
dockerfile: Dockerfile.dev
volumes:
- /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/muni-sc/PseudonymizedRunes:/PseudonymizedRuns
- /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/muni-sc/PseudonymizedRuns:/PseudonymizedRuns
- /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/muni-sc/NextSeqTemp/:/NextSeqTemp
- /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/muni-sc/OrganisedRuns/:/OrganisedRuns
- /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/muni-sc/Patients/:/Patients
command: bash -c "python main.py
-r /PseudonymizedRuns/
-n /NextSeqTemp/
-o /OrganisedRuns/
-p /Patients/"
9 changes: 0 additions & 9 deletions kubernetes/catalog-secret.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion kubernetes/organiser-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ spec:
secretKeyRef:
name: catalog-secret
key: httpsproxy
command: ["python", "main.py", "-r", "/data/PSEUDONYMIZED", "-o", "/data/OrganisedRuns/", "-p", "/data/Patients/"]
# All paths are absolute and live under the /data/ volume mount declared below.
command: ["python", "main.py", "-r", "/data/PSEUDONYMIZED", "-n", "/data/NextSeqTemp/", "-o", "/data/OrganisedRuns/", "-p", "/data/Patients/"]
volumeMounts:
- name: storage-catalogue-volume
mountPath: /data/
Expand Down
3 changes: 2 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@

# Command-line options; every path is mandatory.
parser.add_argument("-r", "--runs", type=str, required=True, help="Path to pseudonymized runs")
parser.add_argument("-o", "--output", type=str, required=True, help="Path to the organise file")
# argparse derives the attribute name from the long option, so it must be
# spelled "nextseq_output" to match the `args.nextseq_output` lookup that
# is passed to Processor.
parser.add_argument("-n", "--nextseq_output", type=str, required=True, help="Temporary nextseq folder")
parser.add_argument("-p", "--patients", type=str, required=True, help="Path to a patient folder")
args = parser.parse_args()

Processor(args.runs, args.output, args.patients).process_runs()
Processor(args.runs, args.output, args.nextseq_output, args.patients).process_runs()
24 changes: 20 additions & 4 deletions organiser/process/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,19 @@
import logging
import sys
from datetime import datetime
import pandas as pd
from organiser.run_organisers.nextseq_organise_run import NextSeqRunOrganiser
from organiser.run_organisers.old_miseq_organise_run import OldMiseqRunOrganiser
from organiser.run_organisers.new_miseq_organise_run import NewMiseqOrganiseRun
from organiser.run_organisers.new_miseq_organise_run import NewMiseqRunOrganiser
from organiser.run_organisers.organise_run import OrganiseRun
from organiser.helpers.file_helpers import create_dictionary_if_not_exist


class Processor:
def __init__(self, pseudnymized_runs_folder, folder_for_organised_files, patient_folder):
def __init__(self, pseudnymized_runs_folder, folder_for_organised_files, next_seq_temporary_folder, patient_folder):
self.psedunymized_runs_folder = pseudnymized_runs_folder
self.organised_files_folder = folder_for_organised_files
self.next_seq_temporary_folder = next_seq_temporary_folder
self.patient_folder = patient_folder

def process_runs(self):
Expand Down Expand Up @@ -60,9 +63,22 @@ def _get_correct_organiser(self, run_path) -> OrganiseRun:

if "Alignment_1" in os.listdir(full_run_path) or "SoftwareVersionsFile" in os.listdir(full_run_path):
logging.info(f"{run_path} processed as New Miseq")
return NewMiseqOrganiseRun(self.psedunymized_runs_folder, run_path,
self.organised_files_folder, self.patient_folder)
return NewMiseqRunOrganiser(self.psedunymized_runs_folder, run_path,
self.organised_files_folder, self.patient_folder)
elif self._is_run_nextseq(full_run_path):
logging.info(f"{run_path} processed as NextSeq")
return NextSeqRunOrganiser(self.psedunymized_runs_folder, run_path,
self.next_seq_temporary_folder, self.patient_folder)
else:
logging.info(f"{run_path} processed as Old Miseq")
return OldMiseqRunOrganiser(self.psedunymized_runs_folder, run_path,
self.organised_files_folder, self.patient_folder)

def _is_run_nextseq(self, full_run_path) -> bool:
    """Tell whether the run's sample sheet declares a NextSeq application.

    Reads ``SampleSheet.csv`` inside ``full_run_path`` and looks at the value
    next to the ``Application`` key in the first column.

    Args:
        full_run_path: Path to the folder of a single run.

    Returns:
        True when the ``Application`` value starts with ``"NextSeq"``.
        False when it does not, or when the sheet has no ``Application``
        row or that row has no textual value.

    Raises:
        FileNotFoundError: If the run folder has no ``SampleSheet.csv``.
    """
    sample_sheet_path = os.path.join(full_run_path, "SampleSheet.csv")
    # Explicit column names let rows with differing field counts (up to ten)
    # be loaded into a single frame; missing cells become NaN.
    df = pd.read_csv(sample_sheet_path, delimiter=",",
                     names=["[Header]", "Unnamed: 1", "Unnamed: 2", "Unnamed: 3", "Unnamed: 4",
                            "Unnamed: 5", "Unnamed: 6", "Unnamed: 7", "Unnamed: 8", "Unnamed: 9"])

    application_values = df.loc[df["[Header]"] == "Application", "Unnamed: 1"].tolist()
    if not application_values:
        # No "Application" row: this is not a NextSeq sheet, so let the
        # caller fall through to the other organisers instead of crashing.
        return False

    application_value = application_values[0]
    # An empty cell is read as NaN (a float), which has no startswith().
    return isinstance(application_value, str) and application_value.startswith("NextSeq")
2 changes: 1 addition & 1 deletion organiser/run_organisers/new_miseq_organise_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from .old_miseq_organise_run import OldMiseqRunOrganiser


class NewMiseqOrganiseRun(OldMiseqRunOrganiser):
class NewMiseqRunOrganiser(OldMiseqRunOrganiser):

def organise_run(self):
y = self._get_file_year()
Expand Down
50 changes: 50 additions & 0 deletions organiser/run_organisers/nextseq_organise_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from organiser.helpers.file_helpers import copy_if_exists, copy_folder_if_exists
from organiser.run_organisers.old_miseq_organise_run import OldMiseqRunOrganiser
import os
from pathlib import Path

class NextSeqRunOrganiser(OldMiseqRunOrganiser):
    """Organise one pseudonymized NextSeq run under ``<organised_runs>/<year>/NextSeq``.

    The class defines no constructor of its own: the ``organised_runs``,
    ``pseudo_run`` and ``file`` attributes, as well as the sample-directory
    and patient-file helpers, come from ``OldMiseqRunOrganiser``.
    """

    def organise_run(self):
        """Build the organised layout for this run.

        Returns:
            Path of the organised run folder,
            ``<organised_runs>/<year>/NextSeq/<run name>``.
        """
        y = self._get_file_year()
        machine = "NextSeq"
        folder_for_run_path = os.path.join(self.organised_runs, y, machine)
        Path(folder_for_run_path).mkdir(parents=True, exist_ok=True)
        self._create_sample_dirs(folder_for_run_path)
        self._create_general_file(folder_for_run_path)
        self._create_patient_files_if_clinical_data_exist()
        return os.path.join(folder_for_run_path, self.file)

    def _create_general_file(self, new_file_path):
        """Copy the run-level files and folders into ``<new_file_path>/<run name>``."""
        source_run = os.path.join(self.pseudo_run, self.file)
        target_run = os.path.join(new_file_path, self.file)
        self._copy_important_files(source_run, target_run)
        self._copy_important_folders(source_run, target_run)

    def _collect_data_for_pseudo_number(self, new_folder, pseudo_number):
        """Copy the FASTQ files whose name contains ``pseudo_number`` into ``new_folder/FASTQ``.

        Does nothing when the run has no ``FASTQ`` folder.
        NOTE(review): presumably invoked once per sample by the inherited
        ``_create_sample_dirs`` - confirm against the parent class.
        """
        fastq_files = os.path.join(self.pseudo_run, self.file, "FASTQ")
        if not os.path.exists(fastq_files):
            return
        sample_fastq_folder = os.path.join(new_folder, "FASTQ")
        # exist_ok keeps a repeated run over an already organised sample from
        # failing with FileExistsError, matching the mkdir in organise_run.
        os.makedirs(sample_fastq_folder, exist_ok=True)
        for file in os.listdir(fastq_files):
            if pseudo_number in file:
                copy_if_exists(os.path.join(fastq_files, file),
                               os.path.join(sample_fastq_folder, file))

    def _copy_important_files(self, old_path, new_path):
        """Copy the run-level metadata files from ``old_path`` to ``new_path`` when present."""
        files_to_move = ["RunInfo.xml", "RunParameters.xml", "RunCompletionStatus.xml", "SampleSheet.csv"]

        for file in files_to_move:
            old_file_path = os.path.join(old_path, file)
            new_file_path = os.path.join(new_path, file)
            copy_if_exists(old_file_path, new_file_path)

    def _copy_important_folders(self, old_path, new_path):
        """Copy the run-level folders from ``old_path`` to ``new_path`` when present."""
        folders_path = [
            "Data",
            "catalog_info_per_pred_number",
        ]
        for folder in folders_path:
            old_folder_path = os.path.join(old_path, folder)
            new_folder_path = os.path.join(new_path, folder)
            copy_folder_if_exists(old_folder_path, new_folder_path)
17 changes: 16 additions & 1 deletion organiser/run_organisers/old_miseq_organise_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ def __init__(self, path_to_pseudonymized_runs_folder, name_of_single_run,
def organise_run(self):
y = self._get_file_year()
machine = "MiSEQ"
folder_for_run_path = os.path.join(self.organised_runs, y, machine)
subtype = self._get_subtype()
folder_for_run_path = os.path.join(self.organised_runs, y, machine, subtype)
Path(folder_for_run_path).mkdir(parents=True, exist_ok=True)
self._create_sample_dirs(folder_for_run_path)
self._create_general_file(folder_for_run_path)
Expand All @@ -33,6 +34,20 @@ def _get_file_year(self):
year = splitted_filename[0][:2]
return f"20{year}"

def _get_subtype(self) -> str:
    """Return the sub-folder name used to group this run.

    Returns:
        ``"complete-runs"`` when the run folder contains an ``Analysis``
        entry. Otherwise ``"mamma-print"`` when the sample sheet's
        ``Experiment Name`` value starts with ``"MP"``, and
        ``"missing-analysis"`` in every other case, including a sheet
        with no usable ``Experiment Name`` row.

    Raises:
        FileNotFoundError: If there is no ``Analysis`` entry and the run
            folder has no ``SampleSheet.csv``.
    """
    run_folder = os.path.join(self.pseudo_run, self.file)
    if os.path.exists(os.path.join(run_folder, "Analysis")):
        return "complete-runs"

    sample_sheet_path = os.path.join(run_folder, "SampleSheet.csv")
    # Explicit column names let rows with differing field counts (up to ten)
    # be loaded into a single frame; missing cells become NaN.
    df = pd.read_csv(sample_sheet_path, delimiter=",",
                     names=["[Header]", "Unnamed: 1", "Unnamed: 2", "Unnamed: 3", "Unnamed: 4",
                            "Unnamed: 5", "Unnamed: 6", "Unnamed: 7", "Unnamed: 8", "Unnamed: 9"])
    experiment_names = df.loc[df["[Header]"] == "Experiment Name", "Unnamed: 1"].tolist()
    # A missing row yields an empty list and an empty cell yields NaN; both
    # are treated as "not a MammaPrint run" rather than raising.
    experiment_name = experiment_names[0] if experiment_names else None
    if isinstance(experiment_name, str) and experiment_name.startswith("MP"):
        return "mamma-print"
    return "missing-analysis"

def _create_sample_dirs(self, run_samples_path):
sample_sheet_path = os.path.join(self.pseudo_run, self.file, "SampleSheet.csv")
pseudo_numbers = self._get_pseudo_numbers(sample_sheet_path)
Expand Down
6 changes: 3 additions & 3 deletions tests/test_organise_new_miseq_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import shutil

from organiser.run_organisers.new_miseq_organise_run import NewMiseqOrganiseRun
from organiser.run_organisers.new_miseq_organise_run import NewMiseqRunOrganiser

FAKE_ALL_RUNS_FOR_TESTING = os.path.join(os.path.dirname(__file__), "FAKE_PSEUDONYMIZED_RUNS")
FAKE_RUN_FOR_COPY = os.path.join(os.path.dirname(__file__), "test_pseudonymized_runs",
Expand Down Expand Up @@ -44,8 +44,8 @@ def setup_and_teardown_organise_files(request):


def get_organiser():
return NewMiseqOrganiseRun(FAKE_ALL_RUNS_FOR_TESTING, "240101_M00000_0000_00000000-00000",
FAKE_DESTINATION_FILES, FAKE_PATIENT_FILES)
return NewMiseqRunOrganiser(FAKE_ALL_RUNS_FOR_TESTING, "240101_M00000_0000_00000000-00000",
FAKE_DESTINATION_FILES, FAKE_PATIENT_FILES)


def test_folder_structure_correct():
Expand Down
128 changes: 128 additions & 0 deletions tests/test_organise_nextseq_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
import pytest
import os
import shutil

from organiser.run_organisers.nextseq_organise_run import NextSeqRunOrganiser

FAKE_ALL_RUNS_FOR_TESTING = os.path.join(os.path.dirname(__file__), "FAKE_PSEUDONYMIZED_RUNS")
FAKE_RUN_FOR_COPY = os.path.join(os.path.dirname(__file__), "test_pseudonymized_runs",
"230101_N0000000_0000_0000000000")

FAKE_RUN_FOR_TESTING = os.path.join(FAKE_ALL_RUNS_FOR_TESTING, "230101_N0000000_0000_0000000000")
FAKE_DESTINATION_FILES = os.path.join(os.path.dirname(__file__), "test_destination")
FAKE_PATIENT_FILES = os.path.join(os.path.dirname(__file__), "test_patients")

def _copy_fake_run():
    """Copy the fixture run into the working area and create the output folders."""
    shutil.copytree(FAKE_RUN_FOR_COPY, FAKE_RUN_FOR_TESTING)
    # Destination first, then patients - same order as the teardown expects to find them.
    for empty_folder in (FAKE_DESTINATION_FILES, FAKE_PATIENT_FILES):
        os.mkdir(empty_folder)

def _remove_coppied_fake_run():
    """Delete the copied runs, the patient folder and the destination folder."""
    for leftover in (FAKE_ALL_RUNS_FOR_TESTING, FAKE_PATIENT_FILES, FAKE_DESTINATION_FILES):
        shutil.rmtree(leftover)

@pytest.fixture(autouse=True)
def setup_and_teardown_organise_files(request):
    """Prepare the fake run before every test and remove it afterwards."""
    _copy_fake_run()
    # Code after the yield is the teardown and only runs when setup succeeded.
    yield
    _remove_coppied_fake_run()


@pytest.fixture
def remove_fastq_folders():
    """Delete the ``FASTQ`` folder from the copied fake run."""
    fastq_folder = os.path.join(FAKE_RUN_FOR_TESTING, "FASTQ")
    shutil.rmtree(fastq_folder)


def test_run_is_in_correct_sturecture():
    """The organised run ends up in ``<destination>/2023/NextSeq/<run name>``."""
    run_name = "230101_N0000000_0000_0000000000"
    NextSeqRunOrganiser(FAKE_ALL_RUNS_FOR_TESTING, run_name,
                        FAKE_DESTINATION_FILES, FAKE_PATIENT_FILES).organise_run()

    organised_run = os.path.join(FAKE_DESTINATION_FILES, "2023", "NextSeq", run_name)
    assert os.path.exists(organised_run)

def test_data_folder_structred():
    """``Data/Intensities`` is copied with ``BaseCalls`` and lanes ``L001``-``L004``."""
    run_name = "230101_N0000000_0000_0000000000"
    NextSeqRunOrganiser(FAKE_ALL_RUNS_FOR_TESTING, run_name,
                        FAKE_DESTINATION_FILES, FAKE_PATIENT_FILES).organise_run()

    intensities = os.path.join(FAKE_DESTINATION_FILES, "2023", "NextSeq", run_name,
                               "Data", "Intensities")
    assert os.path.exists(os.path.join(intensities, "BaseCalls"))
    for lane in (1, 2, 3, 4):
        assert os.path.exists(os.path.join(intensities, f"L00{lane}"))

def test_samples_folder_contains_fastq_files():
    """Organising the run creates a ``Samples`` folder inside the organised run."""
    run_name = "230101_N0000000_0000_0000000000"
    NextSeqRunOrganiser(FAKE_ALL_RUNS_FOR_TESTING, run_name,
                        FAKE_DESTINATION_FILES, FAKE_PATIENT_FILES).organise_run()

    samples_dir = os.path.join(FAKE_DESTINATION_FILES, "2023", "NextSeq", run_name, "Samples")
    assert os.path.exists(samples_dir)

def test_individual_fastq_files():
    """Each DNA/RNA sample has a ``FASTQ`` folder; DNA folders hold 8 ``fastq.gz`` files."""
    run_name = "230101_N0000000_0000_0000000000"
    NextSeqRunOrganiser(FAKE_ALL_RUNS_FOR_TESTING, run_name,
                        FAKE_DESTINATION_FILES, FAKE_PATIENT_FILES).organise_run()

    samples_dir = os.path.join(FAKE_DESTINATION_FILES, "2023", "NextSeq", run_name, "Samples")
    for index in range(8):
        dna_fastq = os.path.join(samples_dir, f"2023_000{index}_DNA", "FASTQ")
        rna_fastq = os.path.join(samples_dir, f"2023_000{index}_RNA", "FASTQ")
        assert os.path.exists(dna_fastq)
        assert os.path.exists(rna_fastq)

        dna_files = os.listdir(dna_fastq)
        assert all(name.endswith("fastq.gz") for name in dna_files)
        assert len(dna_files) == 8

def test_missing_fastq_files(remove_fastq_folders):
    """Without a source ``FASTQ`` folder the sample folders still exist, minus ``FASTQ``."""
    run_name = "230101_N0000000_0000_0000000000"
    NextSeqRunOrganiser(FAKE_ALL_RUNS_FOR_TESTING, run_name,
                        FAKE_DESTINATION_FILES, FAKE_PATIENT_FILES).organise_run()

    samples_dir = os.path.join(FAKE_DESTINATION_FILES, "2023", "NextSeq", run_name, "Samples")
    assert not os.path.exists(os.path.join(samples_dir, "2023_0000_DNA", "FASTQ"))
    for index in range(8):
        for material in ("DNA", "RNA"):
            assert os.path.exists(os.path.join(samples_dir, f"2023_000{index}_{material}"))


@pytest.mark.parametrize("filename", ["SampleSheet.csv", "RunParameters.xml", "RunInfo.xml", "RunCompletionStatus.xml"])
def test_individual_nextseq_files(filename):
    """Each run-level metadata file is copied into the organised run folder."""
    run_name = "230101_N0000000_0000_0000000000"
    NextSeqRunOrganiser(FAKE_ALL_RUNS_FOR_TESTING, run_name,
                        FAKE_DESTINATION_FILES, FAKE_PATIENT_FILES).organise_run()

    copied_file = os.path.join(FAKE_DESTINATION_FILES, "2023", "NextSeq", run_name, filename)
    assert os.path.exists(copied_file)


def test_catalogue_info_pred_number():
    """The ``catalog_info_per_pred_number`` folder is copied into the organised run."""
    run_name = "230101_N0000000_0000_0000000000"
    NextSeqRunOrganiser(FAKE_ALL_RUNS_FOR_TESTING, run_name,
                        FAKE_DESTINATION_FILES, FAKE_PATIENT_FILES).organise_run()

    catalog_info = os.path.join(FAKE_DESTINATION_FILES, "2023", "NextSeq", run_name,
                                "catalog_info_per_pred_number")
    assert os.path.exists(catalog_info)

def test_catalog_info_missing_no_error(remove_catalog_info_per_pred_number):
    """A run without ``catalog_info_per_pred_number`` organises cleanly and copies none."""
    run_name = "230101_N0000000_0000_0000000000"
    NextSeqRunOrganiser(FAKE_ALL_RUNS_FOR_TESTING, run_name,
                        FAKE_DESTINATION_FILES, FAKE_PATIENT_FILES).organise_run()

    catalog_info = os.path.join(FAKE_DESTINATION_FILES, "2023", "NextSeq", run_name,
                                "catalog_info_per_pred_number")
    assert not os.path.exists(catalog_info)


def test_patient_correctly_created_in_tree():
    """Organising the run writes the patient's ``patient_metadata.json`` under the patient folder."""
    run_name = "230101_N0000000_0000_0000000000"
    NextSeqRunOrganiser(FAKE_ALL_RUNS_FOR_TESTING, run_name,
                        FAKE_DESTINATION_FILES, FAKE_PATIENT_FILES).organise_run()

    patient_metadata = os.path.join(FAKE_PATIENT_FILES, "2000",
                                    "mmci_patient_00000000-0000-0000-0000-000000000001",
                                    "patient_metadata.json")
    assert os.path.exists(patient_metadata)


@pytest.fixture
def remove_catalog_info_per_pred_number():
    """Delete the ``catalog_info_per_pred_number`` folder from the copied fake run."""
    catalog_info = os.path.join(FAKE_RUN_FOR_TESTING, "catalog_info_per_pred_number")
    shutil.rmtree(catalog_info)
Loading

0 comments on commit b8b9fdc

Please sign in to comment.