Skip to content

Commit

Permalink
newest changes
Browse files Browse the repository at this point in the history
  • Loading branch information
Tomáš Houfek committed Oct 22, 2024
1 parent 6bf26be commit 7397437
Show file tree
Hide file tree
Showing 17 changed files with 213 additions and 115 deletions.
Empty file added __init__.py
Empty file.
8 changes: 8 additions & 0 deletions api_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
"""Manual check of the Molgenis catalogue API.

Logs in to the catalogue, fetches the ``fair-genomes_Analysis`` table and
prints it to stdout.
"""
import os

import molgenis.client


# Address of the catalogue API queried by this check.
API_URL = "http://147.251.124.190:8080/api/"
# Entity (table) whose rows are fetched and printed.
ANALYSIS_TABLE = "fair-genomes_Analysis"

# CATALOG_LOGIN / CATALOG_PASSWORD override the credentials; the previous
# literal values stay as the fallback so the script still runs unconfigured.
login = os.environ.get("CATALOG_LOGIN", "admin")
password = os.environ.get("CATALOG_PASSWORD", "admin")

session = molgenis.client.Session(API_URL)
session.login(login, password)
try:
    table = session.get(ANALYSIS_TABLE)
    print(table)
finally:
    # Always close the session, even when the query above raises.
    session.logout()
17 changes: 10 additions & 7 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,27 +8,30 @@


def run(organised_files_foldes, wsi_folders, libraries_folders):
molgenis_login = os.environ["CATALOG_LOGIN"]
molgenis_password = os.environ["CATALOG_PASSWORD"]
molgenis_login = None # os.environ["CATALOG_LOGIN"]
molgenis_password = None # os.environ["CATALOG_PASSWORD"]

run_paths_for_catalogue_upload = get_all_runs_with_data_for_catalogue(organised_files_foldes)

for absolute_run_path in run_paths_for_catalogue_upload:
print(absolute_run_path)
logging.info("Collecting metadata...")
upload_to_catalog = CollectRunMetadata(absolute_run_path)()

catalog_info_folder = os.path.join(absolute_run_path, "catalog_info_per_pred_number")
for sample_id in os.listdir(catalog_info_folder):
sample_id = sample_id.replace(".json", "")
sample_stat_info = os.path.join(absolute_run_path, "Samples", sample_id, "Analysis", "Reports",
f"{sample_id}_StatInfo.txt")
CollectSampleMetadata(absolute_run_path, sample_stat_info, catalog_info_folder)()
sample_path = os.path.join(absolute_run_path, "Samples", sample_id)
CollectSampleMetadata(absolute_run_path, sample_path, catalog_info_folder)()

logging.info("Uploading data to catalog...")
importer = MolgenisImporter(absolute_run_path, wsi_folders, libraries_folders, molgenis_login,
importer = MolgenisImporter(absolute_run_path,
wsi_folders,
libraries_folders,
molgenis_login,
molgenis_password)
importer()
del importer
# del importer


# Press the green button in the gutter to run the script.
Expand Down
Binary file not shown.
5 changes: 2 additions & 3 deletions tests/test_miseq_sample_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@
"2020_M00000_0000_00000000-00000")

FAKE_SAMPLE_DATA = os.path.join(COMPLETE_FAKE_RUN_FOR_TESTING, "Samples",
"mmci_predictive_00000000-0000-0000-0000-000000000001",
"Analysis",
"mmci_predictive_00000000-0000-0000-0000-000000000001_StatInfo.txt")
"mmci_predictive_00000000-0000-0000-0000-000000000001")

FAKE_CATALOGUE_INFO = os.path.join(COMPLETE_FAKE_RUN_FOR_TESTING, "catalog_info_per_pred_number")

Expand All @@ -35,6 +33,7 @@ def setup_and_teardown_organise_files(request):


def test_files_created():

CollectSampleMetadata(COMPLETE_FAKE_RUN_FOR_TESTING, FAKE_SAMPLE_DATA, FAKE_CATALOGUE_INFO)()

assert os.path.exists(os.path.join(COMPLETE_FAKE_RUN_FOR_TESTING, "sample_metadata",
Expand Down
5 changes: 3 additions & 2 deletions uploader/file_helpers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os

import re

def get_all_runs_with_data_for_catalogue(organised_folder: str) -> [str]:
runs_to_precess_for_catalogue = []
for year in os.listdir(organised_folder):
years = [year for year in os.listdir(organised_folder) if re.match(r'^[\d]{4}$', year)]
for year in years:
for run_type in os.listdir(os.path.join(organised_folder, year)):
multiple_runs_path = os.path.join(organised_folder, year, run_type)
for run in os.listdir(multiple_runs_path):
Expand Down
88 changes: 26 additions & 62 deletions uploader/import_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@
import json
import os


from molgenis_models.Analysis import Analysis
from molgenis_models.Clinical import Clinical
from molgenis_models.IndividualConsent import IndividualConsent
from molgenis_models.Material import Material
from molgenis_models.Personal import Personal
from molgenis_models.SamplePreparation import SamplePreparation
from molgenis_models.Sequencing import Sequencing
from uploader.molgenis_models.Analysis import Analysis
from uploader.molgenis_models.Clinical import Clinical
from uploader.molgenis_models.IndividualConsent import IndividualConsent
from uploader.molgenis_models.Material import Material
from uploader.molgenis_models.Personal import Personal
from uploader.molgenis_models.SamplePreparation import SamplePreparation
from uploader.molgenis_models.Sequencing import Sequencing


class MolgenisImporter:
Expand All @@ -23,15 +22,15 @@ class MolgenisImporter:
FAIR_INDI_CONSENT = "fair-genomes_IndividualConsent"

def __init__(self, run_path, wsi_path, libraries_path, login, password):
self.session = molgenis.client.Session("https://data.bbmri.cz/api/")
self.session.login(login, password)
self.run_path = run_path
# self.session = molgenis.client.Session("https://data.bbmri.cz/api/")
# self.session.login(login, password)
self.run_path = run_path
self.catalog_info_folder = os.path.join(run_path, "catalog_info_per_pred_number")
self.samples_metadata_folder = os.path.join(run_path, "sample_metadata")
self.wsi_path = wsi_path
self.libraries_path = libraries_path
self.sample_sheet_path = os.path.join(run_path,"SampleSheet.csv")
with open(os.path.join(run_path,"run_metadata.json"), "r") as f:
self.sample_sheet_path = os.path.join(run_path, "SampleSheet.csv")
with open(os.path.join(run_path, "run_metadata.json"), "r") as f:
self.run_metadata = json.load(f)

def __call__(self):
Expand All @@ -45,55 +44,20 @@ def __call__(self):
with open(sample_metadata_file, "r") as f:
sample_metadata_file = json.load(f)

personal = Personal(clinical_info_file)
personal_ids = [val["PersonalIdentifier"] for val in self.session.get(self.FAIR_PERSONAL)]
if personal.PersonalIdentifier not in personal_ids:
self._add_data(personal, self.FAIR_PERSONAL)

consent = IndividualConsent(clinical_info_file)
consent_ids = [val["IndividualConsentIdentifier"] for val in self.session.get(self.FAIR_INDI_CONSENT)]
if consent.IndividualConsentIdentifier not in consent_ids:
self._add_data(consent, self.FAIR_INDI_CONSENT)

clinical = Clinical(clinical_info_file)
clinical_ids = [val["ClinicalIdentifier"] for val in self.session.get(self.FAIR_CLINICAL)]
if clinical.ClinicalIdentifier not in clinical_ids:
self._add_data(clinical, self.FAIR_CLINICAL)

material = Material(self.wsi_path, clinical_info_file, sample_metadata_file)
material_ids = [val["MaterialIdentifier"] for val in self.session.get(self.FAIR_MATERIAL)]
if material.MaterialIdentifier not in material_ids:
self._add_data(material, self.FAIR_MATERIAL)
upload_sequence = [
Personal(clinical_info_file),
IndividualConsent(clinical_info_file),
Clinical(clinical_info_file),
Material(self.wsi_path, clinical_info_file, sample_metadata_file),
SamplePreparation(self.run_path, self.libraries_path, self.sample_sheet_path, clinical_info_file),
Sequencing(clinical_info_file, sample_metadata_file, self.run_metadata),
Analysis(clinical_info_file)
]

sample_preparation = SamplePreparation(self.run_path, self.libraries_path, self.sample_sheet_path, clinical_info_file)
sample_prep_ids = [val["SampleprepIdentifier"] for val in self.session.get(self.FAIR_SAMPLE_PREP)]
if sample_preparation.SampleprepIdentifier not in sample_prep_ids:
self._add_data(sample_preparation, self.FAIR_SAMPLE_PREP)

sequencing = Sequencing(clinical_info_file, sample_metadata_file, self.run_metadata)
sequencing_ids = [val["SequencingIdentifier"] for val in self.session.get(self.FAIR_SEQUENCING)]
if sequencing.SequencingIdentifier not in sequencing_ids:
self._add_data(sequencing, self.FAIR_SEQUENCING)

analysis = Analysis(clinical_info_file)
analysis_ids = [val["AnalysisIdentifier"] for val in self.session.get(self.FAIR_ANALYSIS)]
if analysis.AnalysisIdentifier not in analysis_ids:
self._add_data(analysis, self.FAIR_ANALYSIS)
for molgenis_object in upload_sequence:
print(molgenis_object.serialize)
# molgenis_object.add_to_catalog_if_not_exists(self.session)

def __del__(self):
self.session.logout()

def _directorize(self, object):
d = {}
for key, value in object.__dict__.items():
if value.__class__== tuple:
d[key]= value[0]
else:
d[key] = value
return d

def _add_data(self, data, data_type):
data_dict = self._directorize(data)
datas = [data_dict]
self.session.add_all(data_type, datas)

pass
# self.session.logout()
Binary file not shown.
10 changes: 5 additions & 5 deletions uploader/metadata_managers/miseq_sample_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,23 @@ def __init__(self):

class CollectSampleMetadata:

def __init__(self, run_path, sample_stat_info_path, catalog_info_path):
def __init__(self, run_path, sample_path, catalog_info_path):
self.sample_info = SampleInfoMMCI()
self.run_path = run_path
self.sample_path = sample_stat_info_path
self.sample_path = sample_path
self.clinical_data_path = catalog_info_path

def __call__(self):
sample_id = os.path.basename(self.sample_path).replace("_StatInfo.txt", "")
sample_id = os.path.basename(self.sample_path)
metadata = self._find_sample_metadata(sample_id)
if not os.path.exists(os.path.join(self.run_path, "sample_metadata")):
os.mkdir(os.path.join(self.run_path, "sample_metadata"))
with open(os.path.join(self.run_path, "sample_metadata", f'{sample_id}.json'), "w+") as outfile:
json.dump(metadata, outfile, indent=4)

def _find_sample_metadata(self, sample_id):
self._find_data_in_statinfo(self.sample_path)
self._find_data_in_CCRS(os.path.join(self.run_path, "Samples", sample_id, "Analysis", "Reports",
self._find_data_in_statinfo(os.path.join(self.sample_path, "Analysis", f"{sample_id}_StatInfo.txt"))
self._find_data_in_CCRS(os.path.join(self.sample_path, "Analysis", "Reports",
f"{sample_id}_Coverage_Curve_Report1_Statistics.txt"))
self._find_data_in_clinical_info(self.clinical_data_path, sample_id)
json_str = self.sample_info.__dict__
Expand Down
21 changes: 19 additions & 2 deletions uploader/molgenis_models/Analysis.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,12 @@
class Analysis:
from abc import ABC

from uploader.molgenis_models.MolgenisObject import MolgenisObject


class Analysis(MolgenisObject):

TYPE = "fair-genomes_Analysis"

def __init__(self, patient_dict):
sample = patient_dict["samples"][0]
self.AnalysisIdentifier = sample["pseudo_ID"].replace("predictive", "analysis")
Expand All @@ -7,4 +15,13 @@ def __init__(self, patient_dict):
self.AbstractDataLocation = "Sensitive Cloud Institute of Computer Science"
self.DataFormatsStored = ["BAM", "VCF"]
self.ReferenceGenomeUsed = "GRCh37"
self.BioinformaticProtocolUsed = "NextGENe"
self.BioinformaticProtocolUsed = "NextGENe"

def add_to_catalog_if_not_exist(self, session):
analysis_ids = [val["AnalysisIdentifier"] for val in session.get(self.TYPE)]
if self.AnalysisIdentifier not in analysis_ids:
self._add_to_catalog(session)

def _add_to_catalog(self, session):
data_dict = self.serialize
session.add_all(self.TYPE, [data_dict])
37 changes: 28 additions & 9 deletions uploader/molgenis_models/Clinical.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import uuid
from uploader.molgenis_models.MolgenisObject import MolgenisObject
from datetime import datetime


class Clinical(MolgenisObject):

TYPE = "fair-genomes_Clinical"

class Clinical:
def __init__(self, patient_dict):
sample = patient_dict["samples"][0]
self.ClinicalIdentifier = f"mmci_clinical_{uuid.UUID(int=int(sample['biopsy_number'].replace('/', '').replace('-', '')))}"
Expand All @@ -9,18 +15,31 @@ def __init__(self, patient_dict):
self.UnobservedPhenotype = ["NoInformation (NI, nullflavor)"]
self.ClinicalDiagnosis = self._adjust_diagnosis(sample["diagnosis"]) if sample["material"] != "genome" else None
self.MolecularDiagnosisGene = ["NoInformation (NI, nullflavor)"]
self.AgeAtDiagnosis = self._calculate_age_at_diagnosis(patient_dict["birth"].split("/")[1], sample)
self.AgeAtLastScreening = self._calculate_age_at_diagnosis(patient_dict["birth"].split("/")[1], sample)
self.AgeAtDiagnosis = self._calculate_age_at_diagnosis(patient_dict["birth"], sample)
self.AgeAtLastScreening = self._calculate_age_at_diagnosis(patient_dict["birth"], sample)
self.Medication = ["NoInformation (NI, nullflavor)"]
self.MedicalHistory = ["NoInformation (NI, nullflavor)"]

def _calculate_age_at_diagnosis(self, birth, sample):
if sample["material"] == "tissue":
return int(sample["freeze_time"].split("-")[0]) - int(birth)
def add_to_catalog_if_not_exist(self, session):
analysis_ids = [val["ClinicalIdentifier"] for val in session.get(self.TYPE)]
if self.ClinicalIdentifier not in analysis_ids:
self._add_to_catalog(session)

def _add_to_catalog(self, session):
data_dict = self.serialize
session.add_all(self.TYPE, [data_dict])

@staticmethod
def _calculate_age_at_diagnosis(birth, sample):
datetime_format = "%d/%m/%Y"
if sample["material"] == "Tissue":
return datetime.strptime(sample["freeze_time"],
datetime_format + ", %H:%M:%S") - datetime.strptime(birth, datetime_format)
else:
return int(sample["taking_date"].split("-")[0]) - int(birth)
return datetime.strptime(sample["taking_date"], datetime_format) - datetime.strptime(birth, datetime_format)

def _adjust_diagnosis(self, diagnosis):
@staticmethod
def _adjust_diagnosis(diagnosis):
if len(diagnosis) == 4:
return diagnosis[:3] + "." + diagnosis[3]
return diagnosis
return diagnosis
24 changes: 19 additions & 5 deletions uploader/molgenis_models/IndividualConsent.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
class IndividualConsent:
from uploader.molgenis_models.MolgenisObject import MolgenisObject


class IndividualConsent(MolgenisObject):

TYPE = "fair-genomes_IndividualConsent"

def __init__(self, patient_dict):
sample = patient_dict["samples"][0]
sample = patient_dict["samples"][0]
self.IndividualConsentIdentifier = patient_dict["ID"].replace("patient", "consent")
self.PersonConsenting = patient_dict["ID"]
self.ConsentFormUsed = "mmci_consentform_1"
self.CollectedBy= "Masaryk Memorial Cancer Institute"
self.SigningDate = sample["freeze_time"] if sample["material"] == "tissue" else sample["taking_date"]
self.CollectedBy = "Masaryk Memorial Cancer Institute"
self.SigningDate = sample["freeze_time"] if sample["material"] == "Tissue" else sample["taking_date"]
self.RepresentedBy = "patient"
self.DataUsePermissions = "general research use"
self.DataUsePermissions = "general research use"

def add_to_catalog_if_not_exist(self, session):
analysis_ids = [val["IndividualConsentIdentifier"] for val in session.get(self.TYPE)]
if self.IndividualConsentIdentifier not in analysis_ids:
self._add_to_catalog(session)

def _add_to_catalog(self, session):
data_dict = self.serialize
session.add_all(self.TYPE, [data_dict])
25 changes: 19 additions & 6 deletions uploader/molgenis_models/Material.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,35 @@
import os
import uuid
from uploader.molgenis_models.MolgenisObject import MolgenisObject

class Material:

class Material(MolgenisObject):

TYPE = "fair-genomes_Material"

def __init__(self, wsi_path, patient_dict, sample_dict):
sample = patient_dict["samples"][0]
sample = patient_dict["samples"][0]
self.MaterialIdentifier = sample["sample_ID"]
self.CollectedFromPerson = patient_dict["ID"]
self.BelongsToDiagnosis = f"mmci_clinical_{uuid.UUID(int=int(sample['biopsy_number'].replace('/', '').replace('-', '')))}"
self.SamplingTimestamp = sample["cut_time"] if sample["material"] == "tissue" else sample["taking_date"]
self.RegistrationTimestamp = sample["freeze_time"] if sample["material"] == "tissue" else sample["taking_date"]
self.SamplingTimestamp = sample["cut_time"] if sample["material"] == "Tissue" else sample["taking_date"]
self.RegistrationTimestamp = sample["freeze_time"] if sample["material"] == "Tissue" else sample["taking_date"]
self.BiospecimenType = sample_dict["bioSpeciType"]
self.PathologicalState = sample_dict["pathoState"]
self.StorageConditions = sample_dict["storCond"]
self.PercentageTumourCells = "NotAvailable (NA, nullflavor)"
self.PhysicalLocation = "MMCI Bank of Biological Material"
self.wholeslideimagesavailability = self._look_for_wsi(wsi_path, sample["biopsy_number"])
self.radiotherapyimagesavailability = False
self.radiotherapyimagesavailability = False

def add_to_catalog_if_not_exist(self, session):
analysis_ids = [val["MaterialIdentifier"] for val in session.get(self.TYPE)]
if self.MaterialIdentifier not in analysis_ids:
self._add_to_catalog(session)

def _add_to_catalog(self, session):
data_dict = self.serialize
session.add_all(self.TYPE, [data_dict])

def _look_for_wsi(self, wsi_path, biopsy_number):
wsi_folder, biopsy_start = self._make_path_from_biopsy_number(biopsy_number)
Expand All @@ -30,4 +43,4 @@ def _make_path_from_biopsy_number(self, biopsy_number):
remaining = biopsy_number.split("/")[1].split("-")[0].zfill(5)
fixed_biopsy = f"{year}_{remaining}-{biopsy_number.split('/')[1].split('-')[1].zfill(2)}"

return os.path.join(year, remaining[:2], remaining[2:]), fixed_biopsy
return os.path.join(year, remaining[:2], remaining[2:]), fixed_biopsy
Loading

0 comments on commit 7397437

Please sign in to comment.