Skip to content

Commit

Permalink
Added nextseq pseudonymization
Browse files Browse the repository at this point in the history
  • Loading branch information
Tomáš Houfek committed Oct 16, 2024
1 parent 0845454 commit ff03c82
Show file tree
Hide file tree
Showing 454 changed files with 174 additions and 40 deletions.
30 changes: 0 additions & 30 deletions pseudonymization/helpers/file_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,33 +8,3 @@ def remove_path_if_exist(file_path):
shutil.rmtree(file_path)
else:
os.remove(file_path)


def remove_miseq_run_files(run_path: object) -> object:
    """Remove a fixed list of MiSeq run artefacts located under ``run_path``.

    Each entry below is joined onto ``run_path`` and handed to
    ``remove_path_if_exist`` in the listed order.  Nothing is returned
    explicitly.

    :param run_path: root folder of the MiSeq run.
    """
    relative_targets = (
        ("Data", "RTALogs"),
        ("Data", "Intensities", "L001"),
        ("Thumbnail_Images",),
        ("Recipe",),
        ("Data", "Intensities", "RTAConfiguration.xml"),
        ("Data", "Intensities", "BaseCalls", "SampleSheet.csv"),
        ("Data", "Intensities", "BaseCalls", "Alignment", "SampleSheetUsed.csv"),
        ("Data", "Intensities", "BaseCalls", "Alignment", "GenerateFASTQRunStatistics.xml"),
        ("Basecalling_Netcopy_complete_Read1.txt",),
        ("Basecalling_Netcopy_complete_Read2.txt",),
        ("Basecalling_Netcopy_complete_Read3.txt",),
        ("Basecalling_Netcopy_complete_Read4.txt",),
        ("ImageAnalysis_Netcopy_complete_Read1.txt",),
        ("ImageAnalysis_Netcopy_complete_Read2.txt",),
        ("ImageAnalysis_Netcopy_complete_Read3.txt",),
        ("ImageAnalysis_Netcopy_complete_Read4.txt",),
        ("QueuedForAnalysis.txt",),
        ("RTAComplete.txt",),
    )

    for parts in relative_targets:
        remove_path_if_exist(os.path.join(run_path, *parts))


def remove_nextseq_run_files(run_path):
    """Placeholder for NextSeq run clean-up; it currently does nothing.

    :param run_path: root folder of the NextSeq run (unused).
    """
    return None

# Move a directory tree from old_path to new_path: the tree is copied with
# shutil.copytree and the original is deleted with shutil.rmtree.  The copy is
# guarded by a check that new_path does not exist yet.
# NOTE(review): despite the function name, the existence check is made on the
# destination (new_path), not on the source.
# NOTE(review): indentation is not visible here, so it cannot be told whether
# the rmtree call is inside the guard or runs unconditionally - confirm.
def mv_if_source_not_exist(old_path, new_path):
if not os.path.exists(new_path):
shutil.copytree(old_path, new_path)
shutil.rmtree(old_path)
14 changes: 11 additions & 3 deletions pseudonymization/process/processor.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import os
import shutil

import xml.etree.ElementTree as ET
from pseudonymization.removers.remover import FileRemover
from pseudonymization.pseudonymizers.run_pseudonymizer import RunPseudonymizer
from pseudonymization.pseudonymizers.old_miseq_pseudonymizer import OldMiseqPseudonymizer
Expand Down Expand Up @@ -36,9 +36,17 @@ def _copy_pseudonymizer_run_to_sc(self, run_name):
def _initialize_based_on_record_type(self, full_run_path) -> RunPseudonymizer:
    """Choose the pseudonymizer that matches the contents of a run folder.

    Selection order:

    1. ``NewMiseqPseudonymizer`` when the folder contains
       ``SoftwareVersionsFile.csv`` or ``Alignment_1``.
    2. ``NextSeqPseudonymizer`` when
       ``_is_next_seq_based_on_run_parameters`` reports a NextSeq run.
    3. ``OldMiseqPseudonymizer`` otherwise.

    :param full_run_path: path of the sequencing run folder.
    :return: the pseudonymizer instance for that run.
    """
    # List the folder once instead of once per membership test.
    run_entries = os.listdir(full_run_path)

    if "SoftwareVersionsFile.csv" in run_entries or "Alignment_1" in run_entries:
        return NewMiseqPseudonymizer(full_run_path, self.pseudonymization_tables_folder)

    # The former '"Something NextSeqSpecific"' placeholder branch is dropped:
    # it built NextSeqPseudonymizer() without the run path and tables folder
    # that the constructor requires.
    if self._is_next_seq_based_on_run_parameters(full_run_path):
        return NextSeqPseudonymizer(full_run_path, self.pseudonymization_tables_folder)

    return OldMiseqPseudonymizer(full_run_path, self.pseudonymization_tables_folder)

def _is_next_seq_based_on_run_parameters(self, full_run_path):
    """Report whether ``RunParameters.xml`` marks the run as a NextSeq run.

    The first direct child of the XML root named ``RunParametersVersion`` is
    inspected; the run counts as NextSeq when its text contains ``nextseq``
    (case-insensitive).

    :param full_run_path: path of the sequencing run folder.
    :return: ``True`` for a NextSeq run; ``False`` when the file is missing,
        the tag is absent or empty, or the text does not mention NextSeq.
    :raises xml.etree.ElementTree.ParseError: if the file is not valid XML.
    """
    run_parameters = os.path.join(full_run_path, "RunParameters.xml")

    # A folder without this file is simply not a NextSeq run; returning False
    # lets the caller fall through to its remaining choices.
    if not os.path.isfile(run_parameters):
        return False

    version = ET.parse(run_parameters).getroot().find("RunParametersVersion")
    # Compare against None explicitly: an Element without children is falsy.
    if version is None or version.text is None:
        return False

    return "nextseq" in version.text.lower()
47 changes: 43 additions & 4 deletions pseudonymization/pseudonymizers/nextseq_pseudonymizer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,49 @@
from .run_pseudonymizer import RunPseudonymizer
import os
import pandas as pd

from .run_pseudonymizer import RunPseudonymizer
from pseudonymization.pseudonimization_api.pseudonimize_predictive import PseudonymizePredictive
from pseudonymization.removers.nextseq_remover import NextSeqRemover

class NextSeqPseudonymizer(RunPseudonymizer):
    """Pseudonymize the ``SampleSheet.csv`` of a NextSeq sequencing run.

    The sample identifiers listed below the ``Sample_ID`` row of the sheet are
    replaced by pseudonyms obtained from ``PseudonymizePredictive``.
    """

    def __init__(self, run_path, pseudo_tables_folder_path):
        """Store the run folder and the paths of the pseudonymization tables.

        :param run_path: folder of the sequencing run (holds ``SampleSheet.csv``).
        :param pseudo_tables_folder_path: folder with ``patients.json``,
            ``predictive.json`` and ``samples.json``.
        """
        self.patient_pseudo_table = os.path.join(pseudo_tables_folder_path, "patients.json")
        self.predictive_pseudo_table = os.path.join(pseudo_tables_folder_path, "predictive.json")
        self.sample_pseudo_table = os.path.join(pseudo_tables_folder_path, "samples.json")
        self.run_path = run_path

    def pseudonymize(self):
        """Pseudonymize the run's sample sheet.

        :return: list of ``(predictive_number, pseudonym)`` tuples.
        """
        # NOTE(review): removal of run files is disabled in the original code;
        # the call is kept here for reference until that is decided.
        # NextSeqRemover(self.run_path).remove_files()
        return self._get_all_predictive_numbers_pseudonymize_sample_sheet()

    def _get_all_predictive_numbers_pseudonymize_sample_sheet(self):
        """Rewrite ``SampleSheet.csv`` in place with pseudonymized sample IDs.

        Every non-empty first-column cell below the ``Sample_ID`` row is
        treated as a predictive number.  Its pseudonym is written to both the
        first and the ninth column of that row.

        :return: list of ``(predictive_number, pseudonym)`` tuples in sheet order.
        :raises ValueError: if the sheet has no ``Sample_ID`` row.
        """
        sample_sheet_path = os.path.join(self.run_path, "SampleSheet.csv")
        first_col, last_col = "[Header]", "Unnamed: 8"
        column_names = [first_col] + [f"Unnamed: {i}" for i in range(1, 9)]

        # Read every cell as text so that numeric-looking values are written
        # back exactly as they were read.
        df = pd.read_csv(sample_sheet_path, delimiter=",", names=column_names, dtype=str)

        first_cells = df[first_col].to_list()
        last_cells = df[last_col].to_list()

        try:
            data_start = first_cells.index("Sample_ID") + 1
        except ValueError as err:
            raise ValueError(
                f"No 'Sample_ID' row found in sample sheet {sample_sheet_path}"
            ) from err

        predictive_pseudonimizer = PseudonymizePredictive(self.predictive_pseudo_table)
        predictive_pseudo_tuples = []
        for row, predictive_number in enumerate(first_cells[data_start:], start=data_start):
            # Blank cells (e.g. trailing ",,,," lines) are not sample IDs.
            if pd.isna(predictive_number):
                continue
            pseudo_number = predictive_pseudonimizer.pseudonymize(predictive_number)
            first_cells[row] = pseudo_number
            last_cells[row] = pseudo_number
            predictive_pseudo_tuples.append((predictive_number, pseudo_number))

        # Assigning to existing columns keeps the original column order.
        df[first_col] = first_cells
        df[last_col] = last_cells
        df.fillna("").to_csv(sample_sheet_path, header=False, index=False)

        return predictive_pseudo_tuples
2 changes: 0 additions & 2 deletions pseudonymization/removers/new_miseq_remover.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import os.path
from typing import Optional

from .remover import FileRemover
from pseudonymization.helpers.file_helpers import remove_path_if_exist

Expand Down
22 changes: 21 additions & 1 deletion pseudonymization/removers/nextseq_remover.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import os.path

from .remover import FileRemover
from pseudonymization.helpers.file_helpers import remove_path_if_exist


class NextSeqRemover(FileRemover):
Expand All @@ -7,4 +10,21 @@ def __init__(self, sequencing_folder_path):
self.folder_path = sequencing_folder_path

def remove_files(self) -> None:
    """Remove the listed NextSeq run artefacts from ``self.folder_path``.

    Each name is joined onto the sequencing folder and handed to
    ``remove_path_if_exist`` in the listed order.
    """
    run_root = self.folder_path
    for entry in (
        "Config",
        "Recipe",
        "Images",
        "Logs",
        "InstrumentAnalyticsLogs",
        "InterOp",
        "RTALogs",
        "CopyComplete.txt",
        "RTAComplete.txt",
        "RTARead1Complete.txt",
        "RTARead2Complete.txt",
        "RTARead3Complete.txt",
        "RTARead4Complete.txt",
    ):
        remove_path_if_exist(os.path.join(run_root, entry))
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Empty file.
Loading

0 comments on commit ff03c82

Please sign in to comment.