Merge pull request #31 from GenomiqueENS/v2.7.1

V2.7.1
GenomiqueENS · Aug 22, 2024 · 4024de3 · 4024de3
2 parents 39e9663 + 4043ad6
commit 4024de3
Show file tree

Hide file tree

Showing 6 changed files with 136 additions and 40 deletions.
diff --git a/README.md b/README.md
@@ -53,26 +53,39 @@ $ cd toulligqc && python3 setup.py build install
 ToulligQC is written with Python 3.
 To run ToulligQC without Docker, you need to install the following Python modules:
 
-* matplotlib
-* plotly
-* h5py
+ * matplotlib
+ * plotly
+ * h5py
 * pandas
 * numpy
 * scipy
 * scikit-learn
 * pysam
+* tqdm
+* pod5
 
+<a name="Conda-environemnt"></a>
+### 1.2 Conda environemnt**
+
+You can use a conda environment to install the required packages:
+
+```
+git clone https://github.com/GenomicParisCentre/toulligQC.git
+cd toulligqc && python3 setup.py build install
+conda env create -f environment.yml
+conda activate toulliqc
+```
 
 <a name="pypi-installation"></a>
-### 1.2 Using a PyPi package
+### 1.3 Using a PyPi package
 
 ToulligQC can be more easlily installed with a pip package availlable on the PyPi repository. The following command line  will install the latest version of ToulligQC:
 ```bash
 $ pip3 install toulligqc
 ```
 
 <a name="docker"></a>
-### 1.3 Using Docker
+### 1.4 Using Docker
 ToulligQC and its dependencies are available through a Docker image. To install docker on your system, go to the Docker website (<https://docs.docker.com/engine/installation/>).
 Even if Docker can run on Windows or macOS virtual machines, we recommend to run ToulligQC on a Linux host.
 <a name="docker-image-recovery"></a>

diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,56 @@
+name: toulligqc
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1
+  - _openmp_mutex=5.1
+  - bzip2=1.0.8
+  - ca-certificates=2024.7.2
+  - ld_impl_linux-64=2.38
+  - libffi=3.4.4
+  - libgcc-ng=11.2.0
+  - libgomp=11.2.0
+  - libstdcxx-ng=11.2.0
+  - libuuid=1.41.5
+  - ncurses=6.4
+  - openssl=3.0.14
+  - pip=24.0
+  - python=3.11.9
+  - readline=8.2
+  - setuptools=69.5.1
+  - sqlite=3.45.3
+  - tk=8.6.14
+  - wheel=0.43.0
+  - xz=5.4.6
+  - zlib=1.2.13
+  - pip:
+      - contourpy==1.2.1
+      - cycler==0.12.1
+      - fonttools==4.53.1
+      - h5py==3.11.0
+      - iso8601==2.1.0
+      - joblib==1.4.2
+      - kiwisolver==1.4.5
+      - lib-pod5==0.3.12
+      - matplotlib==3.9.1
+      - more-itertools==10.3.0
+      - numpy==2.0.0
+      - packaging==24.1
+      - pandas==1.26.4
+      - pillow==10.4.0
+      - plotly==5.22.0
+      - pod5==0.3.12
+      - polars==0.20.31
+      - pyarrow==16.1.0
+      - pyparsing==3.1.2
+      - pysam==0.22.1
+      - python-dateutil==2.9.0.post0
+      - pytz==2024.1
+      - scikit-learn==1.5.1
+      - scipy==1.14.0
+      - six==1.16.0
+      - tenacity==8.5.0
+      - threadpoolctl==3.5.0
+      - tqdm==4.66.4
+      - tzdata==2024.1
+      - vbz-h5py-plugin==1.0.1
diff --git a/toulligqc/common_statistics.py b/toulligqc/common_statistics.py
@@ -18,7 +18,7 @@ def compute_LXX(dataframe_dict, x):
     cum_sum = 0
     count = 0
     for v in data:
-        cum_sum += v
+        cum_sum += int(v)
         count += 1
         if cum_sum >= half_sum:
             return count
@@ -31,7 +31,7 @@ def compute_NXX(dataframe_dict, x):
     half_sum = data.sum() * x / 100
     cum_sum = 0
     for v in data:
-        cum_sum += v
+        cum_sum += int(v)
         if cum_sum >= half_sum:
             return int(v)
 

diff --git a/toulligqc/extractor_common.py b/toulligqc/extractor_common.py
@@ -432,7 +432,11 @@ def add_image_to_result(quiet, image_list, start_time, image):
 def timeISO_to_float(iso_datetime, format):
         """
         """
-        dt = datetime.strptime(iso_datetime, format)
+        try:
+            dt = datetime.strptime(iso_datetime, format)
+        except:
+            format = '%Y-%m-%dT%H:%M:%SZ'
+            dt = datetime.strptime(iso_datetime, format)
         unix_timestamp = dt.timestamp()
         return unix_timestamp
 

diff --git a/toulligqc/fastq_extractor.py b/toulligqc/fastq_extractor.py
@@ -119,32 +119,45 @@ def graph_generation(self, result_dict):
 
         add_image_to_result(self.quiet, images, time.time(), pgg.read_count_histogram(result_dict, self.images_directory))
         add_image_to_result(self.quiet, images, time.time(), pgg.read_length_scatterplot(self.dataframe_dict, self.images_directory))
+
         if self.rich:
             add_image_to_result(self.quiet, images, time.time(), pgg.yield_plot(self.dataframe_1d, self.images_directory))
         add_image_to_result(self.quiet, images, time.time(), pgg.read_quality_multiboxplot(self.dataframe_dict, self.images_directory))
         add_image_to_result(self.quiet, images, time.time(), pgg.allphred_score_frequency(self.dataframe_dict, self.images_directory))
+
         if self.rich:
             add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.dataframe_1d, self.images_directory))
         add_image_to_result(self.quiet, images, time.time(), pgg.twod_density(self.dataframe_dict, self.images_directory))
+
         if self.rich:
             add_image_to_result(self.quiet, images, time.time(), pgg.sequence_length_over_time(self.dataframe_dict, self.images_directory))
             add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory))
-        if self.is_barcode:
-            add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
-                                                                                                       self.barcode_selection,
-                                                                                                       self.images_directory))
 
-            read_fail = self.dataframe_dict["read.fail.barcoded"]
-            if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
-                add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
-                                                                                                      self.barcode_selection,
-                                                                                                      self.images_directory))
-
-            add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
-                                                                                            self.images_directory))
-
-            add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
-                                                                                                    self.images_directory))
+            if self.is_barcode:
+                if "barcode_alias" in self.config_dictionary:
+                    barcode_alias = self.config_dictionary['barcode_alias']
+                else:
+                    barcode_alias = None
+
+                add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
+                                                                                                        self.barcode_selection,
+                                                                                                        self.images_directory,
+                                                                                                        barcode_alias))
+
+                read_fail = self.dataframe_dict["read.fail.barcoded"]
+                if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
+                    add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
+                                                                                                        self.barcode_selection,
+                                                                                                        self.images_directory,
+                                                                                                        barcode_alias))
+
+                add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
+                                                                                                self.images_directory,
+                                                                                                barcode_alias))
+
+                add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
+                                                                                                        self.images_directory,
+                                                                                                        barcode_alias))
         return images
 
 
@@ -211,7 +224,7 @@ def extract(self, result_dict):
                       "pass.reads.sequence.length")
         describe_dict(self, result_dict, self.dataframe_dict["fail.reads.sequence.length"],
                       "fail.reads.sequence.length")
-        if self.is_barcode:
+        if self.rich and self.is_barcode:
             extract_barcode_info(self, result_dict,
                                  self.barcode_selection,
                                  self.dataframe_dict,
@@ -258,8 +271,9 @@ def _load_fastq_data(self):
         columns = ['sequence_length', 'mean_qscore', 'passes_filtering']
         if self.rich:
             columns.extend(['start_time', 'channel'])
-        if self.is_barcode:
-            columns.append('barcode_arrangement')
+
+            if self.is_barcode:
+                columns.append('barcode_arrangement')
 
         fq_df = pd.DataFrame(fq_df, columns=columns)
 
@@ -271,8 +285,10 @@ def _load_fastq_data(self):
             fq_df["start_time"] = fq_df["start_time"] - fq_df["start_time"].min()
             fq_df['start_time'] = fq_df['start_time'].astype(np.float64)
             fq_df['channel'] = fq_df['channel'].astype(np.int16)
-        if self.is_barcode:
-            fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category")
+
+            if self.is_barcode:
+                fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category")
+
         return fq_df 
 
 
@@ -346,8 +362,11 @@ def check_fastq(self):
             self.is_barcode = False
         if 'model_version_id' not in metadata:
                 metadata['model_version_id'] = 'Unknow'
+        run_info = []
         try:
-            return metadata['runid'] , metadata['sampleid'] , metadata['model_version_id'] 
+           sample_id = 'sample_id' if 'sample_id' in metadata else 'sampleid'
+           run_id = 'run_id' if 'run_id' in metadata else 'runid'
+           return metadata[run_id] , metadata[sample_id] , metadata['model_version_id'] 
         except:
             return None
 
@@ -356,7 +375,7 @@ def _extract_info_from_name(self, name):
         """
         """
         metadata = dict(x.split("=") for x in name.split(" ")[1:])
-        start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%SZ')
+        start_time = timeISO_to_float(metadata['start_time'],  '%Y-%m-%dT%H:%M:%S.%f%z')
         if self.is_barcode:
             return  start_time, metadata['ch'], metadata['barcode']
         return  start_time, metadata['ch']
diff --git a/toulligqc/toulligqc.py b/toulligqc/toulligqc.py
@@ -352,17 +352,25 @@ def main():
         sys.exit("ERROR: dico_path is empty")
 
     # Get barcode selection
+    allowed_patterns = r'(BC|RB|NB|BP|BARCODE)(\d{2})'
+
     if config_dictionary['barcoding'].lower() == 'true':
         config_dictionary['barcode_selection'] = []
 
-        if 'barcodes' in config_dictionary:
+        if 'samplesheet' in config_dictionary:
+            samplesheet = parse_samplesheet(config_dictionary['samplesheet'])
+            config_dictionary['barcodes'] = ",".join(list(samplesheet['barcode']))
+            config_dictionary['barcode_alias'] = pd.Series(samplesheet.alias.values, 
+                                                           index=samplesheet.barcode).to_dict()
+
+        if 'barcodes' in config_dictionary or 'samplesheet' in config_dictionary:
             barcode_set = set()
             if ":" in config_dictionary['barcodes']:
                 start, end  = config_dictionary['barcodes'].strip().split(':')
-                pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', start.strip().upper())
+                pattern = re.search(allowed_patterns, start.strip().upper())
                 if pattern:
                     start_number = int(pattern.group(2))
-                pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', end.strip().upper())
+                pattern = re.search(allowed_patterns, end.strip().upper())
                 if pattern:
                     end_number = int(pattern.group(2)) 
                 for i in range(start_number, end_number + 1):
@@ -371,26 +379,22 @@ def main():
 
             else:
                 for b in config_dictionary['barcodes'].strip().split(','):
-                    pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', b.strip().upper())
+                    pattern = re.search(allowed_patterns, b.strip().upper())
                     if pattern:
                         barcode = 'barcode{}'.format(pattern.group(2))
                         barcode_set.add(barcode)
                     else: 
                         sys.stderr.write("\033[93mWarning:\033[0m Barcode '{}' is non-standard custom arrangement.\n".format(b))
                         barcode_set.add(b)
+                    if 'samplesheet' in config_dictionary:
+                        config_dictionary['barcode_alias'][barcode] = config_dictionary['barcode_alias'].pop(b)
 
             barcode_selection = sorted(barcode_set)
 
             if len(barcode_selection) == 0:
                 sys.exit("ERROR: No known barcode found in provided list of barcodes")
             config_dictionary['barcode_selection'] = barcode_selection
 
-        elif 'samplesheet' in config_dictionary:
-            samplesheet = parse_samplesheet(config_dictionary['samplesheet'])
-            config_dictionary['barcode_selection'] = list(samplesheet['barcode'])
-            config_dictionary['barcode_alias'] = pd.Series(samplesheet.alias.values, 
-                                                           index=samplesheet.barcode).to_dict()
-
     else:
         config_dictionary['barcode_selection'] = ''