From c9f338bae1fe1427747182b71e88cf659cd5a203 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Houfek?= <tomas.houfek@mou.cz>
Date: Mon, 28 Oct 2024 15:49:58 +0100
Subject: [PATCH] Added production and development docker and compose files,
 updated readme

---
 Dockerfile => Dockerfile.dev |  2 +-
 Dockerfile.prod              | 16 ++++++++
 README.md                    | 72 ++++++++++++++++++++++--------------
 compose.dev.yml              | 19 ++++++++++
 compose.prod.yml             | 10 +++++
 compose.yml                  | 18 ---------
 6 files changed, 91 insertions(+), 46 deletions(-)
 rename Dockerfile => Dockerfile.dev (85%)
 create mode 100644 Dockerfile.prod
 create mode 100644 compose.dev.yml
 create mode 100644 compose.prod.yml
 delete mode 100644 compose.yml

diff --git a/Dockerfile b/Dockerfile.dev
similarity index 85%
rename from Dockerfile
rename to Dockerfile.dev
index 5d7391d..568cced 100644
--- a/Dockerfile
+++ b/Dockerfile.dev
@@ -5,7 +5,7 @@ RUN mkdir /pseudo-app
 WORKDIR /pseudo-app
 
 ADD requirements.txt .
-ADD run_pseudonymization_pipeline.py .
+ADD main.py .
 ADD pseudonymization/ pseudonymization/
 ADD tests/ tests/
 
diff --git a/Dockerfile.prod b/Dockerfile.prod
new file mode 100644
index 0000000..fecbf06
--- /dev/null
+++ b/Dockerfile.prod
@@ -0,0 +1,16 @@
+FROM bitnami/python:3.10
+
+# WORKDIR creates /pseudo-app itself, so no separate mkdir is needed.
+WORKDIR /pseudo-app
+
+# UID 1005 / GID 1002 are the same IDs compose.prod.yml passes as "user:".
+RUN groupadd -g 1002 sequencers && useradd -u 1005 -g sequencers export
+
+# Dependencies go in before the source so their layer stays cached.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY main.py .
+COPY pseudonymization/ pseudonymization/
+COPY tests/ tests/
+
+USER export
\ No newline at end of file
diff --git a/README.md b/README.md
index c9a7add..220068b 100644
--- a/README.md
+++ b/README.md
@@ -2,32 +2,50 @@
 This is the repository for the pseudonymisation part of the BBMRI.cz data catalog.
 
 ## Pseudonymisation
-
-### MiSEQ
 Pseudonymizes predictive numbers, collects clinical data and removes unnecessary files before moving the data to SensitiveCloud at ICS-MUNI.
 
-Pseudonymisation itself consists of multiple Python and Bash scripts, the whole pipeline is defined within **pseudonymize_pipeline.sh**:
-  1. it sets paths to important folders and files,
-  2. it ensures data (HIS exports) transfer from remote server,
-  3. it looks for exports having "predictive number" value,
-  4. it indentify and remove duplicates
-  5. then it iterates through sequencing data folder (consisting of sequencing run output + following analysis output files) and runs scripts:
-        - **remove_files.sh** - removes defined unnecessary files
-        - **pseudonymisation.py** - performs pseudonymisation creating class **Pseudonymizer** and using function **pseudonymize_run()**  or **\_\_call\_\_()**. The function performs the following tasks:
-            1. Pseudonymization of samplesheets
-            2. Adding clinical and biobank data 
-            3. Pseudonymizing file names
-        - **clinical_finder.py**, this clinical_finder.py Python script contains:
-          - class **Material** definition (with properties pseudo_ID, biopsy_number, sample_ID, sample_number, available_samples_number, material_type) 
-          - The class **Material** has following subclasses:
-            - **Tissue** (with properties material, pTNM, morphology, diagnosis, cut_time, freeze_time)
-            - **Serum** (with properties material, diagnosis, taking_date)
-            - **Genome** (with properties material, taking_date)
-          - class **Patient** definition (with properties ID, birth, sex, samples)
-          - class **FindClinicalInfo** definition (with properties export_path, predictive_numbers, pseudo_pred_table_path, pseudo_patient_table_path, pseudo_sample_table_path, run_path) and functions:
-            - **\_\_call\_\_()** performs the following steps:
-              1. Collects all clinical information in export and convert it to nicer json format
-              2. Splits clinical info per patient and removes duplicated values
-              3. Splits clinical info per pseudo_id and only takes one material per pseudo_id
-            - **get_pseudo_ids()** returns all pseudo_ids generated in the search process.
-      - **replace_predictive.sh** - replaces each predictive number appearance in all the files of sequencing data folder with created pseudo ID in previous step
\ No newline at end of file
+## Supported sequencing types
+Miseq, New Miseq, MammaPrint
+
+## How to run the scripts
+### Locally - Development
+#### Using main.py
+1. Install requirements
+```bash
+pip install -r requirements.txt
+```
+2. Run main.py
+```bash
+python main.py -s /path/to/runs/for/pseudonymization -d /path/to/sensitive/cloud/destination \
+               -t /path/to/pseudonymisation/tables/folder -l /path/to/libraries \
+               -lsc /path/to/sensitive/cloud/libraries
+```
+#### Using docker-compose
+```bash
+docker-compose -f compose.dev.yml up -d --build
+```
+### In production
+#### Using docker-compose
+```bash
+# connect to seq server
+su export
+cd /home/export/data-catalogue-pseudonymisation
+docker-compose -f compose.prod.yml up -d
+```
+#### Deployment in cron
+```bash
+# connect to seq server
+su export
+crontab -e
+# setting cron to run every Monday, Wednesday, Friday at 22:00
+0 22 * * 1,3,5 /usr/local/bin/docker-compose -f /home/export/data-catalogue-pseudonymisation/compose.prod.yml up -d &>> /home/export/logs/`date +\%Y\%m\%d\%H\%M\%S`.log
+```
+## Deploying new version in production
+```bash
+su export
+cd /home/export/data-catalogue-pseudonymisation
+git switch main
+git pull
+```
+The new version should automatically start in production once the cronjob is run.
+
diff --git a/compose.dev.yml b/compose.dev.yml
new file mode 100644
index 0000000..84534de
--- /dev/null
+++ b/compose.dev.yml
@@ -0,0 +1,19 @@
+version: '3.0'
+services:
+  run:
+    build: { context: ".", dockerfile: Dockerfile.dev }  # a bare string under "build" is read as the context directory, not a Dockerfile
+    user: "1001:998"
+    volumes:
+      - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/muni-sc/:/sc
+      - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/seq/TRANSFER/:/TRANSFER
+      - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/pseudonymisation_table/:/pseudonymisation_tables/
+      - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/Libraries/:/Libraries
+      - /var/run/docker.sock:/var/run/docker.sock  # NOTE(review): hands the host Docker socket to the container - confirm this is required
+    extra_hosts:
+      - "host.docker.internal:host-gateway"
+    command: bash -c "python main.py
+                              -s /TRANSFER
+                              -d /sc/PseudonymizedRuns
+                              -t /pseudonymisation_tables
+                              -l /Libraries
+                              -lsc /sc/Libraries"
\ No newline at end of file
diff --git a/compose.prod.yml b/compose.prod.yml
new file mode 100644
index 0000000..c1578d5
--- /dev/null
+++ b/compose.prod.yml
@@ -0,0 +1,10 @@
+version: '3.0'
+services:
+  run:
+    build: { context: ".", dockerfile: Dockerfile.prod }  # a bare string under "build" is read as the context directory, not a Dockerfile
+    user: "1005:1002"  # same UID:GID as the "export" user created in Dockerfile.prod
+    volumes:
+      - /muni-ss/:/sc
+      - /seq/NO-BACKUP-SPACE/:/seq
+      - /pseudo_tables/:/pseudo_tables
+    command: ["bash", "-c", "python main.py -s /seq/TRANSFER/MiSEQ -d /sc/PSEUDONYMIZED/ -t /pseudo_tables -l /seq/Libraries -lsc /sc/Libraries"]
\ No newline at end of file
diff --git a/compose.yml b/compose.yml
deleted file mode 100644
index c163fed..0000000
--- a/compose.yml
+++ /dev/null
@@ -1,18 +0,0 @@
-version: '3.0'
-services:
-  run:
-    build: .
-    volumes:
-      - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/muni-sc/:/sc
-      - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/seq/TRANSFER/:/TRANSFER
-      - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/pseudonymisation_table/:/pseudonymization_tables/
-      - /home/houfek/Work/MMCI/sequencing_pipeline/data-catalogue-playground/Libraries/:/libraries
-    extra_hosts:
-      - "host.docker.internal:host-gateway"
-    command: bash -c "python run_pseudonymization_pipeline.py 
-                              /TRANSFER
-                              /sc/
-                              /pseudonymization_tables
-                              /libraries
-                              /sc/Libraries
-                              'MiSEQ'"
\ No newline at end of file