diff --git a/Dockerfile b/Dockerfile.dev similarity index 85% rename from Dockerfile rename to Dockerfile.dev index 5d7391d..568cced 100644 --- a/Dockerfile +++ b/Dockerfile.dev @@ -5,7 +5,7 @@ RUN mkdir /pseudo-app WORKDIR /pseudo-app ADD requirements.txt . -ADD run_pseudonymization_pipeline.py . +ADD main.py . ADD pseudonymization/ pseudonymization/ ADD tests/ tests/ diff --git a/Dockerfile.prod b/Dockerfile.prod new file mode 100644 index 0000000..fecbf06 --- /dev/null +++ b/Dockerfile.prod @@ -0,0 +1,16 @@ +FROM bitnami/python:3.10 + +RUN mkdir /pseudo-app + +WORKDIR /pseudo-app + +RUN groupadd -g 1002 sequencers && useradd -u 1005 -g sequencers export + +COPY requirements.txt . +COPY main.py . +COPY pseudonymization/ pseudonymization/ +COPY tests/ tests/ + +RUN pip install --no-cache-dir -r requirements.txt + +USER export \ No newline at end of file diff --git a/README.md b/README.md index c9a7add..220068b 100644 --- a/README.md +++ b/README.md @@ -2,32 +2,50 @@ This is the repository for the pseudonymisation part of the BBMRI.cz data catalog. ## Pseudonymisation - -### MiSEQ Pseudonymizes predictive numbers, collects clinical data and removes unnecessary files before moving the data to SensitiveCloud at ICS-MUNI. -Pseudonymisation itself consists of multiple Python and Bash scripts, the whole pipeline is defined within **pseudonymize_pipeline.sh**: - 1. it sets paths to important folders and files, - 2. it ensures data (HIS exports) transfer from remote server, - 3. it looks for exports having "predictive number" value, - 4. it indentify and remove duplicates - 5. then it iterates through sequencing data folder (consisting of sequencing run output + following analysis output files) and runs scripts: - - **remove_files.sh** - removes defined unnecessary files - - **pseudonymisation.py** - performs pseudonymisation creating class **Pseudonymizer** and using function **pseudonymize_run()** or **\_\_call\_\_()**. The function performs the following tasks: - 1. 
Pseudonymization of samplesheets - 2. Adding clinical and biobank data - 3. Pseudonymizing file names - - **clinical_finder.py**, this clinical_finder.py Python script contains: - - class **Material** definition (with properties pseudo_ID, biopsy_number, sample_ID, sample_number, available_samples_number, material_type) - - The class **Material** has following subclasses: - - **Tissue** (with properties material, pTNM, morphology, diagnosis, cut_time, freeze_time) - - **Serum** (with properties material, diagnosis, taking_date) - - **Genome** (with properties material, taking_date) - - class **Patient** definition (with properties ID, birth, sex, samples) - - class **FindClinicalInfo** definition (with properties export_path, predictive_numbers, pseudo_pred_table_path, pseudo_patient_table_path, pseudo_sample_table_path, run_path) and functions: - - **\_\_call\_\_()** performs the following steps: - 1. Collects all clinical information in export and convert it to nicer json format - 2. Splits clinical info per patient and removes duplicated values - 3. Splits clinical info per pseudo_id and only takes one material per pseudo_id - - **get_pseudo_ids()** returns all pseudo_ids generated in the search process. - - **replace_predictive.sh** - replaces each predictive number appearance in all the files of sequencing data folder with created pseudo ID in previous step \ No newline at end of file +## Supported sequencing types +Miseq, New Miseq, MammaPrint + +## How to run the scripts +### Locally - Development +#### Using main.py +1. Install requirements +```bash +pip install -r requirements.txt +``` +2. 
Run main.py +```bash +python main.py -s /path/to/runs/for/pseudonymization -d /path/to/sensitive/cloud/destination \ + -t /path/to/pseudonymisation/tables/folder -l /path/to/libraries \ + -lsc /path/to/sensitive/cloud/libraries +``` +#### Using docker-compose +```bash +docker-compose -f compose.dev.yml up -d --build +``` +### In production +#### Using docker-compose +```bash +# connect to seq server +su export +cd /home/export/data-catalogue-pseudonymisation +docker-compose -f compose.prod.yml up -d +``` +#### Deployment in cron +```bash +# connect to seq server +su export +crontab -e +# setting cron to run every Monday, Wednesday, Friday at 22:00 +0 22 * * 1,3,5 /usr/local/bin/docker-compose -f /home/export/data-catalogue-pseudonymisation/compose.prod.yml up -d >> /home/export/logs/`date +\%Y\%m\%d\%H\%M\%S`.log 2>&1 +``` +## Deploying new version in production +```bash +su export +cd /home/export/data-catalogue-pseudonymisation +git switch main +git pull +``` +The new version should automatically start in production once the cronjob is run. 
+ diff --git a/compose.dev.yml b/compose.dev.yml new file mode 100644 index 0000000..84534de --- /dev/null +++ b/compose.dev.yml @@ -0,0 +1,19 @@ +version: '3.0' +services: + run: + build: {context: ., dockerfile: Dockerfile.dev} + user: "1001:998" + volumes: + - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/muni-sc/:/sc + - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/seq/TRANSFER/:/TRANSFER + - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/pseudonymisation_table/:/pseudonymisation_tables/ + - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/Libraries/:/Libraries + - /var/run/docker.sock:/var/run/docker.sock + extra_hosts: + - "host.docker.internal:host-gateway" + command: bash -c "python main.py + -s /TRANSFER + -d /sc/PseudonymizedRuns + -t /pseudonymisation_tables + -l /Libraries + -lsc /sc/Libraries" \ No newline at end of file diff --git a/compose.prod.yml b/compose.prod.yml new file mode 100644 index 0000000..c1578d5 --- /dev/null +++ b/compose.prod.yml @@ -0,0 +1,10 @@ +version: '3.0' +services: + run: + build: {context: ., dockerfile: Dockerfile.prod} + user: "1005:1002" + volumes: + - /muni-ss/:/sc + - /seq/NO-BACKUP-SPACE/:/seq + - /pseudo_tables/:/pseudo_tables + command: ["bash", "-c", "python main.py -s /seq/TRANSFER/MiSEQ -d /sc/PSEUDONYMIZED/ -t /pseudo_tables -l /seq/Libraries -lsc /sc/Libraries"] \ No newline at end of file diff --git a/compose.yml b/compose.yml deleted file mode 100644 index c163fed..0000000 --- a/compose.yml +++ /dev/null @@ -1,18 +0,0 @@ -version: '3.0' -services: - run: - build: . 
- volumes: - - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/muni-sc/:/sc - - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/seq/TRANSFER/:/TRANSFER - - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/pseudonymisation_table/:/pseudonymization_tables/ - - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/Libraries/:/libraries - extra_hosts: - - "host.docker.internal:host-gateway" - command: bash -c "python run_pseudonymization_pipeline.py - /TRANSFER - /sc/ - /pseudonymization_tables - /libraries - /sc/Libraries - 'MiSEQ'" \ No newline at end of file