From b55af8f515e45de0a2dc87578ae1a6e8af34f9ce Mon Sep 17 00:00:00 2001
From: Ali Hamraoui <hamraoui@bio.ens.psl.eu>
Date: Wed, 17 Jul 2024 11:11:25 +0200
Subject: [PATCH 1/9] Fix: Recognize the standard barcode arrangement
 BC|RB|NB|BP in the sample sheet.

---
 toulligqc/toulligqc.py | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/toulligqc/toulligqc.py b/toulligqc/toulligqc.py
index 01ade2d..d947863 100644
--- a/toulligqc/toulligqc.py
+++ b/toulligqc/toulligqc.py
@@ -352,17 +352,25 @@ def main():
         sys.exit("ERROR: dico_path is empty")
 
     # Get barcode selection
+    allowed_patterns = r'(BC|RB|NB|BP|BARCODE)(\d{2})'
+
     if config_dictionary['barcoding'].lower() == 'true':
         config_dictionary['barcode_selection'] = []
 
-        if 'barcodes' in config_dictionary:
+        if 'samplesheet' in config_dictionary:
+            samplesheet = parse_samplesheet(config_dictionary['samplesheet'])
+            config_dictionary['barcodes'] = ",".join(list(samplesheet['barcode']))
+            config_dictionary['barcode_alias'] = pd.Series(samplesheet.alias.values, 
+                                                           index=samplesheet.barcode).to_dict()
+
+        if 'barcodes' in config_dictionary or 'samplesheet' in config_dictionary:
             barcode_set = set()
             if ":" in config_dictionary['barcodes']:
                 start, end  = config_dictionary['barcodes'].strip().split(':')
-                pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', start.strip().upper())
+                pattern = re.search(allowed_patterns, start.strip().upper())
                 if pattern:
                     start_number = int(pattern.group(2))
-                pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', end.strip().upper())
+                pattern = re.search(allowed_patterns, end.strip().upper())
                 if pattern:
                     end_number = int(pattern.group(2)) 
                 for i in range(start_number, end_number + 1):
@@ -371,13 +379,15 @@ def main():
                     
             else:
                 for b in config_dictionary['barcodes'].strip().split(','):
-                    pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', b.strip().upper())
+                    pattern = re.search(allowed_patterns, b.strip().upper())
                     if pattern:
                         barcode = 'barcode{}'.format(pattern.group(2))
                         barcode_set.add(barcode)
                     else: 
                         sys.stderr.write("\033[93mWarning:\033[0m Barcode '{}' is non-standard custom arrangement.\n".format(b))
                         barcode_set.add(b)
+                    if 'samplesheet' in config_dictionary:
+                        config_dictionary['barcode_alias'][barcode] = config_dictionary['barcode_alias'].pop(b)
 
             barcode_selection = sorted(barcode_set)
 
@@ -385,12 +395,6 @@ def main():
                 sys.exit("ERROR: No known barcode found in provided list of barcodes")
             config_dictionary['barcode_selection'] = barcode_selection
 
-        elif 'samplesheet' in config_dictionary:
-            samplesheet = parse_samplesheet(config_dictionary['samplesheet'])
-            config_dictionary['barcode_selection'] = list(samplesheet['barcode'])
-            config_dictionary['barcode_alias'] = pd.Series(samplesheet.alias.values, 
-                                                           index=samplesheet.barcode).to_dict()
-
     else:
         config_dictionary['barcode_selection'] = ''
 

From b855e0cce3ad3f1ea5eb741144e94abf8bfe6c47 Mon Sep 17 00:00:00 2001
From: Ali Hamraoui <hamraoui@bio.ens.psl.eu>
Date: Wed, 17 Jul 2024 16:48:18 +0200
Subject: [PATCH 2/9] FIX: Correct N50 and L50 computation issue (closes #28)

---
 toulligqc/common_statistics.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/toulligqc/common_statistics.py b/toulligqc/common_statistics.py
index 67198d6..4a59905 100644
--- a/toulligqc/common_statistics.py
+++ b/toulligqc/common_statistics.py
@@ -18,7 +18,7 @@ def compute_LXX(dataframe_dict, x):
     cum_sum = 0
     count = 0
     for v in data:
-        cum_sum += v
+        cum_sum += int(v)
         count += 1
         if cum_sum >= half_sum:
             return count
@@ -31,7 +31,7 @@ def compute_NXX(dataframe_dict, x):
     half_sum = data.sum() * x / 100
     cum_sum = 0
     for v in data:
-        cum_sum += v
+        cum_sum += int(v)
         if cum_sum >= half_sum:
             return int(v)
 

From 4045146105d457afaf86c3e05ff2c7a7fae355e8 Mon Sep 17 00:00:00 2001
From: Ali Hamraoui <hamraoui@bio.ens.psl.eu>
Date: Wed, 17 Jul 2024 16:50:27 +0200
Subject: [PATCH 3/9] ADD: Conda environmment .yaml file

---
 environment.yml | 57 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)
 create mode 100644 environment.yml

diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..0e8c781
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,57 @@
+name: toulligqc
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h5eee18b_6
+  - ca-certificates=2024.7.2=h06a4308_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - libffi=3.4.4=h6a678d5_1
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.14=h5eee18b_0
+  - pip=24.0=py311h06a4308_0
+  - python=3.11.9=h955ad1f_0
+  - readline=8.2=h5eee18b_0
+  - setuptools=69.5.1=py311h06a4308_0
+  - sqlite=3.45.3=h5eee18b_0
+  - tk=8.6.14=h39e8969_0
+  - wheel=0.43.0=py311h06a4308_0
+  - xz=5.4.6=h5eee18b_1
+  - zlib=1.2.13=h5eee18b_1
+  - pip:
+      - contourpy==1.2.1
+      - cycler==0.12.1
+      - fonttools==4.53.1
+      - h5py==3.11.0
+      - iso8601==2.1.0
+      - joblib==1.4.2
+      - kiwisolver==1.4.5
+      - lib-pod5==0.3.12
+      - matplotlib==3.9.1
+      - more-itertools==10.3.0
+      - numpy==2.0.0
+      - packaging==24.1
+      - pandas==1.26.4
+      - pillow==10.4.0
+      - plotly==5.22.0
+      - pod5==0.3.12
+      - polars==0.20.31
+      - pyarrow==16.1.0
+      - pyparsing==3.1.2
+      - pysam==0.22.1
+      - python-dateutil==2.9.0.post0
+      - pytz==2024.1
+      - scikit-learn==1.5.1
+      - scipy==1.14.0
+      - six==1.16.0
+      - tenacity==8.5.0
+      - threadpoolctl==3.5.0
+      - tqdm==4.66.4
+      - tzdata==2024.1
+      - vbz-h5py-plugin==1.0.1
+prefix: /users/depbio/hamraoui/miniconda3/envs/toulligqc2.7

From b015f1e934aed13af480dd5f775cd86971a6566a Mon Sep 17 00:00:00 2001
From: Hamraoui Ali <72599648+alihamraoui@users.noreply.github.com>
Date: Wed, 17 Jul 2024 16:52:17 +0200
Subject: [PATCH 4/9] Update environment.yml

---
 environment.yml | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/environment.yml b/environment.yml
index 0e8c781..0537935 100644
--- a/environment.yml
+++ b/environment.yml
@@ -2,27 +2,27 @@ name: toulligqc
 channels:
   - defaults
 dependencies:
-  - _libgcc_mutex=0.1=main
-  - _openmp_mutex=5.1=1_gnu
-  - bzip2=1.0.8=h5eee18b_6
-  - ca-certificates=2024.7.2=h06a4308_0
-  - ld_impl_linux-64=2.38=h1181459_1
-  - libffi=3.4.4=h6a678d5_1
-  - libgcc-ng=11.2.0=h1234567_1
-  - libgomp=11.2.0=h1234567_1
-  - libstdcxx-ng=11.2.0=h1234567_1
-  - libuuid=1.41.5=h5eee18b_0
-  - ncurses=6.4=h6a678d5_0
-  - openssl=3.0.14=h5eee18b_0
-  - pip=24.0=py311h06a4308_0
-  - python=3.11.9=h955ad1f_0
-  - readline=8.2=h5eee18b_0
-  - setuptools=69.5.1=py311h06a4308_0
-  - sqlite=3.45.3=h5eee18b_0
-  - tk=8.6.14=h39e8969_0
-  - wheel=0.43.0=py311h06a4308_0
-  - xz=5.4.6=h5eee18b_1
-  - zlib=1.2.13=h5eee18b_1
+  - _libgcc_mutex=0.1
+  - _openmp_mutex=5.1
+  - bzip2=1.0.8
+  - ca-certificates=2024.7.2
+  - ld_impl_linux-64=2.38
+  - libffi=3.4.4
+  - libgcc-ng=11.2.0
+  - libgomp=11.2.0
+  - libstdcxx-ng=11.2.0
+  - libuuid=1.41.5
+  - ncurses=6.4
+  - openssl=3.0.14
+  - pip=24.0
+  - python=3.11.9
+  - readline=8.2
+  - setuptools=69.5.1
+  - sqlite=3.45.3
+  - tk=8.6.14
+  - wheel=0.43.0
+  - xz=5.4.6
+  - zlib=1.2.13
   - pip:
       - contourpy==1.2.1
       - cycler==0.12.1

From 792943841f5a10aa93febfd518965f61508fe219 Mon Sep 17 00:00:00 2001
From: Hamraoui Ali <72599648+alihamraoui@users.noreply.github.com>
Date: Wed, 17 Jul 2024 16:54:11 +0200
Subject: [PATCH 5/9] Update environment.yml

---
 environment.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 0537935..55cca46 100644
--- a/environment.yml
+++ b/environment.yml
@@ -54,4 +54,3 @@ dependencies:
       - tqdm==4.66.4
       - tzdata==2024.1
       - vbz-h5py-plugin==1.0.1
-prefix: /users/depbio/hamraoui/miniconda3/envs/toulligqc2.7

From 410fa6eb9837a70ab0c49728d144d5b5f362329c Mon Sep 17 00:00:00 2001
From: Hamraoui Ali <72599648+alihamraoui@users.noreply.github.com>
Date: Fri, 19 Jul 2024 15:39:21 +0200
Subject: [PATCH 6/9] Update README.md

---
 README.md | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 61e8f0c..a97c34c 100644
--- a/README.md
+++ b/README.md
@@ -53,15 +53,25 @@ $ cd toulligqc && python3 setup.py build install
 ToulligQC is written with Python 3.
 To run ToulligQC without Docker, you need to install the following Python modules:
 
-* matplotlib
-* plotly
-* h5py
+ * matplotlib
+ * plotly
+ * h5py
 * pandas
 * numpy
 * scipy
 * scikit-learn
 * pysam
+* tqdm
+* pod5
 
+* **Conda environemnt**
+
+You can use a conda environment to install the required packages:
+
+```
+conda env create -f environment.yml
+conda activate toulliqc
+```
 
 <a name="pypi-installation"></a>
 ### 1.2 Using a PyPi package

From 52d6c15003b033b6e66260a030b17c89ea198127 Mon Sep 17 00:00:00 2001
From: Hamraoui Ali <72599648+alihamraoui@users.noreply.github.com>
Date: Fri, 19 Jul 2024 15:50:19 +0200
Subject: [PATCH 7/9] Update README.md

Add conda env installation
---
 README.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index a97c34c..d18e7db 100644
--- a/README.md
+++ b/README.md
@@ -64,17 +64,20 @@ To run ToulligQC without Docker, you need to install the following Python module
 * tqdm
 * pod5
 
-* **Conda environemnt**
+<a name="Conda-environemnt"></a>
+### 1.2 Conda environemnt**
 
 You can use a conda environment to install the required packages:
 
 ```
+git clone https://github.com/GenomicParisCentre/toulligQC.git
+cd toulligqc && python3 setup.py build install
 conda env create -f environment.yml
 conda activate toulliqc
 ```
 
 <a name="pypi-installation"></a>
-### 1.2 Using a PyPi package
+### 1.3 Using a PyPi package
 
 ToulligQC can be more easlily installed with a pip package availlable on the PyPi repository. The following command line  will install the latest version of ToulligQC:
 ```bash
@@ -82,7 +85,7 @@ $ pip3 install toulligqc
 ```
 
 <a name="docker"></a>
-### 1.3 Using Docker
+### 1.4 Using Docker
 ToulligQC and its dependencies are available through a Docker image. To install docker on your system, go to the Docker website (<https://docs.docker.com/engine/installation/>).
 Even if Docker can run on Windows or macOS virtual machines, we recommend to run ToulligQC on a Linux host.
 <a name="docker-image-recovery"></a>

From ab764669c0b21ba09d8aa0fc65207675d25b4a4c Mon Sep 17 00:00:00 2001
From: Ali Hamraoui <hamraoui@bio.ens.psl.eu>
Date: Wed, 21 Aug 2024 16:32:16 +0200
Subject: [PATCH 8/9] FIX: FIX Barcoding for the Fastq_extractor

---
 toulligqc/extractor_common.py |  2 ++
 toulligqc/fastq_extractor.py  | 66 +++++++++++++++++++++++------------
 2 files changed, 46 insertions(+), 22 deletions(-)

diff --git a/toulligqc/extractor_common.py b/toulligqc/extractor_common.py
index cacfebd..872e71f 100644
--- a/toulligqc/extractor_common.py
+++ b/toulligqc/extractor_common.py
@@ -432,6 +432,8 @@ def add_image_to_result(quiet, image_list, start_time, image):
 def timeISO_to_float(iso_datetime, format):
         """
         """
+        if '+' in iso_datetime:
+            iso_datetime = iso_datetime.split('+')[0]
         dt = datetime.strptime(iso_datetime, format)
         unix_timestamp = dt.timestamp()
         return unix_timestamp
diff --git a/toulligqc/fastq_extractor.py b/toulligqc/fastq_extractor.py
index ab74da6..3e73b33 100644
--- a/toulligqc/fastq_extractor.py
+++ b/toulligqc/fastq_extractor.py
@@ -119,32 +119,45 @@ def graph_generation(self, result_dict):
 
         add_image_to_result(self.quiet, images, time.time(), pgg.read_count_histogram(result_dict, self.images_directory))
         add_image_to_result(self.quiet, images, time.time(), pgg.read_length_scatterplot(self.dataframe_dict, self.images_directory))
+
         if self.rich:
             add_image_to_result(self.quiet, images, time.time(), pgg.yield_plot(self.dataframe_1d, self.images_directory))
         add_image_to_result(self.quiet, images, time.time(), pgg.read_quality_multiboxplot(self.dataframe_dict, self.images_directory))
         add_image_to_result(self.quiet, images, time.time(), pgg.allphred_score_frequency(self.dataframe_dict, self.images_directory))
+
         if self.rich:
             add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.dataframe_1d, self.images_directory))
         add_image_to_result(self.quiet, images, time.time(), pgg.twod_density(self.dataframe_dict, self.images_directory))
+
         if self.rich:
             add_image_to_result(self.quiet, images, time.time(), pgg.sequence_length_over_time(self.dataframe_dict, self.images_directory))
             add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory))
-        if self.is_barcode:
-            add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
-                                                                                                       self.barcode_selection,
-                                                                                                       self.images_directory))
 
-            read_fail = self.dataframe_dict["read.fail.barcoded"]
-            if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
-                add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
-                                                                                                      self.barcode_selection,
-                                                                                                      self.images_directory))
-
-            add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
-                                                                                            self.images_directory))
-
-            add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
-                                                                                                    self.images_directory))
+            if self.is_barcode:
+                if "barcode_alias" in self.config_dictionary:
+                    barcode_alias = self.config_dictionary['barcode_alias']
+                else:
+                    barcode_alias = None
+
+                add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
+                                                                                                        self.barcode_selection,
+                                                                                                        self.images_directory,
+                                                                                                        barcode_alias))
+
+                read_fail = self.dataframe_dict["read.fail.barcoded"]
+                if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
+                    add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
+                                                                                                        self.barcode_selection,
+                                                                                                        self.images_directory,
+                                                                                                        barcode_alias))
+
+                add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
+                                                                                                self.images_directory,
+                                                                                                barcode_alias))
+
+                add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
+                                                                                                        self.images_directory,
+                                                                                                        barcode_alias))
         return images
 
 
@@ -211,7 +224,7 @@ def extract(self, result_dict):
                       "pass.reads.sequence.length")
         describe_dict(self, result_dict, self.dataframe_dict["fail.reads.sequence.length"],
                       "fail.reads.sequence.length")
-        if self.is_barcode:
+        if self.rich and self.is_barcode:
             extract_barcode_info(self, result_dict,
                                  self.barcode_selection,
                                  self.dataframe_dict,
@@ -258,8 +271,9 @@ def _load_fastq_data(self):
         columns = ['sequence_length', 'mean_qscore', 'passes_filtering']
         if self.rich:
             columns.extend(['start_time', 'channel'])
-        if self.is_barcode:
-            columns.append('barcode_arrangement')
+
+            if self.is_barcode:
+                columns.append('barcode_arrangement')
 
         fq_df = pd.DataFrame(fq_df, columns=columns)
 
@@ -271,8 +285,10 @@ def _load_fastq_data(self):
             fq_df["start_time"] = fq_df["start_time"] - fq_df["start_time"].min()
             fq_df['start_time'] = fq_df['start_time'].astype(np.float64)
             fq_df['channel'] = fq_df['channel'].astype(np.int16)
-        if self.is_barcode:
-            fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category")
+
+            if self.is_barcode:
+                fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category")
+
         return fq_df 
 
 
@@ -346,8 +362,11 @@ def check_fastq(self):
             self.is_barcode = False
         if 'model_version_id' not in metadata:
                 metadata['model_version_id'] = 'Unknow'
+        run_info = []
         try:
-            return metadata['runid'] , metadata['sampleid'] , metadata['model_version_id'] 
+           sample_id = 'sample_id' if 'sample_id' in metadata else 'sampleid'
+           run_id = 'run_id' if 'run_id' in metadata else 'runid'
+           return metadata[run_id] , metadata[sample_id] , metadata['model_version_id'] 
         except:
             return None
 
@@ -356,7 +375,10 @@ def _extract_info_from_name(self, name):
         """
         """
         metadata = dict(x.split("=") for x in name.split(" ")[1:])
-        start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%SZ')
+        try:
+            start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%SZ')
+        except:
+            start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%S.%f')
         if self.is_barcode:
             return  start_time, metadata['ch'], metadata['barcode']
         return  start_time, metadata['ch']
\ No newline at end of file

From 4043ad6b800f3e399e5481799913751cccb1588c Mon Sep 17 00:00:00 2001
From: Ali Hamraoui <hamraoui@bio.ens.psl.eu>
Date: Thu, 22 Aug 2024 14:39:28 +0200
Subject: [PATCH 9/9] FIX: fix time format for BAM and FASTQ extractors

---
 toulligqc/extractor_common.py | 8 +++++---
 toulligqc/fastq_extractor.py  | 5 +----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/toulligqc/extractor_common.py b/toulligqc/extractor_common.py
index 872e71f..33aab14 100644
--- a/toulligqc/extractor_common.py
+++ b/toulligqc/extractor_common.py
@@ -432,9 +432,11 @@ def add_image_to_result(quiet, image_list, start_time, image):
 def timeISO_to_float(iso_datetime, format):
         """
         """
-        if '+' in iso_datetime:
-            iso_datetime = iso_datetime.split('+')[0]
-        dt = datetime.strptime(iso_datetime, format)
+        try:
+            dt = datetime.strptime(iso_datetime, format)
+        except:
+            format = '%Y-%m-%dT%H:%M:%SZ'
+            dt = datetime.strptime(iso_datetime, format)
         unix_timestamp = dt.timestamp()
         return unix_timestamp
 
diff --git a/toulligqc/fastq_extractor.py b/toulligqc/fastq_extractor.py
index 3e73b33..cb565df 100644
--- a/toulligqc/fastq_extractor.py
+++ b/toulligqc/fastq_extractor.py
@@ -375,10 +375,7 @@ def _extract_info_from_name(self, name):
         """
         """
         metadata = dict(x.split("=") for x in name.split(" ")[1:])
-        try:
-            start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%SZ')
-        except:
-            start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%S.%f')
+        start_time = timeISO_to_float(metadata['start_time'],  '%Y-%m-%dT%H:%M:%S.%f%z')
         if self.is_barcode:
             return  start_time, metadata['ch'], metadata['barcode']
         return  start_time, metadata['ch']
\ No newline at end of file