From b55af8f515e45de0a2dc87578ae1a6e8af34f9ce Mon Sep 17 00:00:00 2001 From: Ali Hamraoui Date: Wed, 17 Jul 2024 11:11:25 +0200 Subject: [PATCH 1/9] Fix: Recognize the standard barcode arrangement BC|RB|NB|BP in the sample sheet. --- toulligqc/toulligqc.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/toulligqc/toulligqc.py b/toulligqc/toulligqc.py index 01ade2d..d947863 100644 --- a/toulligqc/toulligqc.py +++ b/toulligqc/toulligqc.py @@ -352,17 +352,25 @@ def main(): sys.exit("ERROR: dico_path is empty") # Get barcode selection + allowed_patterns = r'(BC|RB|NB|BP|BARCODE)(\d{2})' + if config_dictionary['barcoding'].lower() == 'true': config_dictionary['barcode_selection'] = [] - if 'barcodes' in config_dictionary: + if 'samplesheet' in config_dictionary: + samplesheet = parse_samplesheet(config_dictionary['samplesheet']) + config_dictionary['barcodes'] = ",".join(list(samplesheet['barcode'])) + config_dictionary['barcode_alias'] = pd.Series(samplesheet.alias.values, + index=samplesheet.barcode).to_dict() + + if 'barcodes' in config_dictionary or 'samplesheet' in config_dictionary: barcode_set = set() if ":" in config_dictionary['barcodes']: start, end = config_dictionary['barcodes'].strip().split(':') - pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', start.strip().upper()) + pattern = re.search(allowed_patterns, start.strip().upper()) if pattern: start_number = int(pattern.group(2)) - pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', end.strip().upper()) + pattern = re.search(allowed_patterns, end.strip().upper()) if pattern: end_number = int(pattern.group(2)) for i in range(start_number, end_number + 1): @@ -371,13 +379,15 @@ def main(): else: for b in config_dictionary['barcodes'].strip().split(','): - pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', b.strip().upper()) + pattern = re.search(allowed_patterns, b.strip().upper()) if pattern: barcode = 'barcode{}'.format(pattern.group(2)) barcode_set.add(barcode) else: sys.stderr.write("\033[93mWarning:\033[0m Barcode '{}' is non-standard custom arrangement.\n".format(b)) barcode_set.add(b) + if 'samplesheet' in config_dictionary: + config_dictionary['barcode_alias'][barcode] = config_dictionary['barcode_alias'].pop(b) barcode_selection = sorted(barcode_set) @@ -385,12 +395,6 @@ def main(): sys.exit("ERROR: No known barcode found in provided list of barcodes") config_dictionary['barcode_selection'] = barcode_selection - elif 'samplesheet' in config_dictionary: - samplesheet = parse_samplesheet(config_dictionary['samplesheet']) - config_dictionary['barcode_selection'] = list(samplesheet['barcode']) - config_dictionary['barcode_alias'] = pd.Series(samplesheet.alias.values, - index=samplesheet.barcode).to_dict() - else: config_dictionary['barcode_selection'] = '' From b855e0cce3ad3f1ea5eb741144e94abf8bfe6c47 Mon Sep 17 00:00:00 2001 From: Ali Hamraoui Date: Wed, 17 Jul 2024 16:48:18 +0200 Subject: [PATCH 2/9] FIX: Correct N50 and L50 computation issue (closes #28) --- toulligqc/common_statistics.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/toulligqc/common_statistics.py b/toulligqc/common_statistics.py index 67198d6..4a59905 100644 --- a/toulligqc/common_statistics.py +++ b/toulligqc/common_statistics.py @@ -18,7 +18,7 @@ def compute_LXX(dataframe_dict, x): cum_sum = 0 count = 0 for v in data: - cum_sum += v + cum_sum += int(v) count += 1 if cum_sum >= half_sum: return count @@ -31,7 +31,7 @@ def compute_NXX(dataframe_dict, x): half_sum = data.sum() * x / 100 cum_sum = 0 for v in data: - cum_sum += v + cum_sum += int(v) if cum_sum >= half_sum: return int(v) From 4045146105d457afaf86c3e05ff2c7a7fae355e8 Mon Sep 17 00:00:00 2001 From: Ali Hamraoui Date: Wed, 17 Jul 2024 16:50:27 +0200 Subject: [PATCH 3/9] ADD: Conda environmment .yaml file --- environment.yml | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 environment.yml diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..0e8c781 --- /dev/null +++ b/environment.yml @@ -0,0 +1,57 @@ +name: toulligqc +channels: + - defaults +dependencies: + - _libgcc_mutex=0.1=main + - _openmp_mutex=5.1=1_gnu + - bzip2=1.0.8=h5eee18b_6 + - ca-certificates=2024.7.2=h06a4308_0 + - ld_impl_linux-64=2.38=h1181459_1 + - libffi=3.4.4=h6a678d5_1 + - libgcc-ng=11.2.0=h1234567_1 + - libgomp=11.2.0=h1234567_1 + - libstdcxx-ng=11.2.0=h1234567_1 + - libuuid=1.41.5=h5eee18b_0 + - ncurses=6.4=h6a678d5_0 + - openssl=3.0.14=h5eee18b_0 + - pip=24.0=py311h06a4308_0 + - python=3.11.9=h955ad1f_0 + - readline=8.2=h5eee18b_0 + - setuptools=69.5.1=py311h06a4308_0 + - sqlite=3.45.3=h5eee18b_0 + - tk=8.6.14=h39e8969_0 + - wheel=0.43.0=py311h06a4308_0 + - xz=5.4.6=h5eee18b_1 + - zlib=1.2.13=h5eee18b_1 + - pip: + - contourpy==1.2.1 + - cycler==0.12.1 + - fonttools==4.53.1 + - h5py==3.11.0 + - iso8601==2.1.0 + - joblib==1.4.2 + - kiwisolver==1.4.5 + - lib-pod5==0.3.12 + - matplotlib==3.9.1 + - more-itertools==10.3.0 + - numpy==2.0.0 + - packaging==24.1 + - pandas==1.26.4 + - pillow==10.4.0 + - plotly==5.22.0 + - pod5==0.3.12 + - polars==0.20.31 + - pyarrow==16.1.0 + - pyparsing==3.1.2 + - pysam==0.22.1 + - python-dateutil==2.9.0.post0 + - pytz==2024.1 + - scikit-learn==1.5.1 + - scipy==1.14.0 + - six==1.16.0 + - tenacity==8.5.0 + - threadpoolctl==3.5.0 + - tqdm==4.66.4 + - tzdata==2024.1 + - vbz-h5py-plugin==1.0.1 +prefix: /users/depbio/hamraoui/miniconda3/envs/toulligqc2.7 From b015f1e934aed13af480dd5f775cd86971a6566a Mon Sep 17 00:00:00 2001 From: Hamraoui Ali <72599648+alihamraoui@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:52:17 +0200 Subject: [PATCH 4/9] Update environment.yml --- environment.yml | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/environment.yml b/environment.yml index 0e8c781..0537935 100644 --- a/environment.yml +++ b/environment.yml @@ -2,27 +2,27 @@ name: toulligqc channels: - defaults dependencies: - - _libgcc_mutex=0.1=main - - _openmp_mutex=5.1=1_gnu - - bzip2=1.0.8=h5eee18b_6 - - ca-certificates=2024.7.2=h06a4308_0 - - ld_impl_linux-64=2.38=h1181459_1 - - libffi=3.4.4=h6a678d5_1 - - libgcc-ng=11.2.0=h1234567_1 - - libgomp=11.2.0=h1234567_1 - - libstdcxx-ng=11.2.0=h1234567_1 - - libuuid=1.41.5=h5eee18b_0 - - ncurses=6.4=h6a678d5_0 - - openssl=3.0.14=h5eee18b_0 - - pip=24.0=py311h06a4308_0 - - python=3.11.9=h955ad1f_0 - - readline=8.2=h5eee18b_0 - - setuptools=69.5.1=py311h06a4308_0 - - sqlite=3.45.3=h5eee18b_0 - - tk=8.6.14=h39e8969_0 - - wheel=0.43.0=py311h06a4308_0 - - xz=5.4.6=h5eee18b_1 - - zlib=1.2.13=h5eee18b_1 + - _libgcc_mutex=0.1 + - _openmp_mutex=5.1 + - bzip2=1.0.8 + - ca-certificates=2024.7.2 + - ld_impl_linux-64=2.38 + - libffi=3.4.4 + - libgcc-ng=11.2.0 + - libgomp=11.2.0 + - libstdcxx-ng=11.2.0 + - libuuid=1.41.5 + - ncurses=6.4 + - openssl=3.0.14 + - pip=24.0 + - python=3.11.9 + - readline=8.2 + - setuptools=69.5.1 + - sqlite=3.45.3 + - tk=8.6.14 + - wheel=0.43.0 + - xz=5.4.6 + - zlib=1.2.13 - pip: - contourpy==1.2.1 - cycler==0.12.1 From 792943841f5a10aa93febfd518965f61508fe219 Mon Sep 17 00:00:00 2001 From: Hamraoui Ali <72599648+alihamraoui@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:54:11 +0200 Subject: [PATCH 5/9] Update environment.yml --- environment.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/environment.yml b/environment.yml index 0537935..55cca46 100644 --- a/environment.yml +++ b/environment.yml @@ -54,4 +54,3 @@ dependencies: - tqdm==4.66.4 - tzdata==2024.1 - vbz-h5py-plugin==1.0.1 -prefix: /users/depbio/hamraoui/miniconda3/envs/toulligqc2.7 From 410fa6eb9837a70ab0c49728d144d5b5f362329c Mon Sep 17 00:00:00 2001 From: Hamraoui Ali <72599648+alihamraoui@users.noreply.github.com> Date: Fri, 19 Jul 2024 15:39:21 +0200 Subject: [PATCH 6/9] Update README.md --- README.md | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 61e8f0c..a97c34c 100644 --- a/README.md +++ b/README.md @@ -53,15 +53,25 @@ $ cd toulligqc && python3 setup.py build install ToulligQC is written with Python 3. To run ToulligQC without Docker, you need to install the following Python modules: -* matplotlib -* plotly -* h5py + * matplotlib + * plotly + * h5py * pandas * numpy * scipy * scikit-learn * pysam +* tqdm +* pod5 +* **Conda environemnt** + +You can use a conda environment to install the required packages: + +``` +conda env create -f environment.yml +conda activate toulliqc +``` ### 1.2 Using a PyPi package From 52d6c15003b033b6e66260a030b17c89ea198127 Mon Sep 17 00:00:00 2001 From: Hamraoui Ali <72599648+alihamraoui@users.noreply.github.com> Date: Fri, 19 Jul 2024 15:50:19 +0200 Subject: [PATCH 7/9] Update README.md Add conda env installation --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a97c34c..d18e7db 100644 --- a/README.md +++ b/README.md @@ -64,17 +64,20 @@ To run ToulligQC without Docker, you need to install the following Python module * tqdm * pod5 -* **Conda environemnt** + +### 1.2 Conda environemnt** You can use a conda environment to install the required packages: ``` +git clone https://github.com/GenomicParisCentre/toulligQC.git +cd toulligqc && python3 setup.py build install conda env create -f environment.yml conda activate toulliqc ``` -### 1.2 Using a PyPi package +### 1.3 Using a PyPi package ToulligQC can be more easlily installed with a pip package availlable on the PyPi repository. The following command line will install the latest version of ToulligQC: ```bash @@ -82,7 +85,7 @@ $ pip3 install toulligqc ``` -### 1.3 Using Docker +### 1.4 Using Docker ToulligQC and its dependencies are available through a Docker image. To install docker on your system, go to the Docker website (). Even if Docker can run on Windows or macOS virtual machines, we recommend to run ToulligQC on a Linux host. From ab764669c0b21ba09d8aa0fc65207675d25b4a4c Mon Sep 17 00:00:00 2001 From: Ali Hamraoui Date: Wed, 21 Aug 2024 16:32:16 +0200 Subject: [PATCH 8/9] FIX: FIX Barcoding for the Fastq_extractor --- toulligqc/extractor_common.py | 2 ++ toulligqc/fastq_extractor.py | 66 +++++++++++++++++++++++------------ 2 files changed, 46 insertions(+), 22 deletions(-) diff --git a/toulligqc/extractor_common.py b/toulligqc/extractor_common.py index cacfebd..872e71f 100644 --- a/toulligqc/extractor_common.py +++ b/toulligqc/extractor_common.py @@ -432,6 +432,8 @@ def add_image_to_result(quiet, image_list, start_time, image): def timeISO_to_float(iso_datetime, format): """ """ + if '+' in iso_datetime: + iso_datetime = iso_datetime.split('+')[0] dt = datetime.strptime(iso_datetime, format) unix_timestamp = dt.timestamp() return unix_timestamp diff --git a/toulligqc/fastq_extractor.py b/toulligqc/fastq_extractor.py index ab74da6..3e73b33 100644 --- a/toulligqc/fastq_extractor.py +++ b/toulligqc/fastq_extractor.py @@ -119,32 +119,45 @@ def graph_generation(self, result_dict): add_image_to_result(self.quiet, images, time.time(), pgg.read_count_histogram(result_dict, self.images_directory)) add_image_to_result(self.quiet, images, time.time(), pgg.read_length_scatterplot(self.dataframe_dict, self.images_directory)) + if self.rich: add_image_to_result(self.quiet, images, time.time(), pgg.yield_plot(self.dataframe_1d, self.images_directory)) add_image_to_result(self.quiet, images, time.time(), pgg.read_quality_multiboxplot(self.dataframe_dict, self.images_directory)) add_image_to_result(self.quiet, images, time.time(), pgg.allphred_score_frequency(self.dataframe_dict, self.images_directory)) + if self.rich: add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.dataframe_1d, self.images_directory)) add_image_to_result(self.quiet, images, time.time(), pgg.twod_density(self.dataframe_dict, self.images_directory)) + if self.rich: add_image_to_result(self.quiet, images, time.time(), pgg.sequence_length_over_time(self.dataframe_dict, self.images_directory)) add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory)) - if self.is_barcode: - add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict, - self.barcode_selection, - self.images_directory)) - read_fail = self.dataframe_dict["read.fail.barcoded"] - if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0): - add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict, - self.barcode_selection, - self.images_directory)) - - add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict, - self.images_directory)) - - add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict, - self.images_directory)) + if self.is_barcode: + if "barcode_alias" in self.config_dictionary: + barcode_alias = self.config_dictionary['barcode_alias'] + else: + barcode_alias = None + + add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict, + self.barcode_selection, + self.images_directory, + barcode_alias)) + + read_fail = self.dataframe_dict["read.fail.barcoded"] + if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0): + add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict, + self.barcode_selection, + self.images_directory, + barcode_alias)) + + add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict, + self.images_directory, + barcode_alias)) + + add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict, + self.images_directory, + barcode_alias)) return images @@ -211,7 +224,7 @@ def extract(self, result_dict): "pass.reads.sequence.length") describe_dict(self, result_dict, self.dataframe_dict["fail.reads.sequence.length"], "fail.reads.sequence.length") - if self.is_barcode: + if self.rich and self.is_barcode: extract_barcode_info(self, result_dict, self.barcode_selection, self.dataframe_dict, @@ -258,8 +271,9 @@ def _load_fastq_data(self): columns = ['sequence_length', 'mean_qscore', 'passes_filtering'] if self.rich: columns.extend(['start_time', 'channel']) - if self.is_barcode: - columns.append('barcode_arrangement') + + if self.is_barcode: + columns.append('barcode_arrangement') fq_df = pd.DataFrame(fq_df, columns=columns) @@ -271,8 +285,10 @@ def _load_fastq_data(self): fq_df["start_time"] = fq_df["start_time"] - fq_df["start_time"].min() fq_df['start_time'] = fq_df['start_time'].astype(np.float64) fq_df['channel'] = fq_df['channel'].astype(np.int16) - if self.is_barcode: - fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category") + + if self.is_barcode: + fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category") + return fq_df @@ -346,8 +362,11 @@ def check_fastq(self): self.is_barcode = False if 'model_version_id' not in metadata: metadata['model_version_id'] = 'Unknow' + run_info = [] try: - return metadata['runid'] , metadata['sampleid'] , metadata['model_version_id'] + sample_id = 'sample_id' if 'sample_id' in metadata else 'sampleid' + run_id = 'run_id' if 'run_id' in metadata else 'runid' + return metadata[run_id] , metadata[sample_id] , metadata['model_version_id'] except: return None @@ -356,7 +375,10 @@ def _extract_info_from_name(self, name): """ """ metadata = dict(x.split("=") for x in name.split(" ")[1:]) - start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%SZ') + try: + start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%SZ') + except: + start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%S.%f') if self.is_barcode: return start_time, metadata['ch'], metadata['barcode'] return start_time, metadata['ch'] \ No newline at end of file From 4043ad6b800f3e399e5481799913751cccb1588c Mon Sep 17 00:00:00 2001 From: Ali Hamraoui Date: Thu, 22 Aug 2024 14:39:28 +0200 Subject: [PATCH 9/9] FIX: fix time format for BAM and FASTQ extractors --- toulligqc/extractor_common.py | 8 +++++--- toulligqc/fastq_extractor.py | 5 +---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/toulligqc/extractor_common.py b/toulligqc/extractor_common.py index 872e71f..33aab14 100644 --- a/toulligqc/extractor_common.py +++ b/toulligqc/extractor_common.py @@ -432,9 +432,11 @@ def add_image_to_result(quiet, image_list, start_time, image): def timeISO_to_float(iso_datetime, format): """ """ - if '+' in iso_datetime: - iso_datetime = iso_datetime.split('+')[0] - dt = datetime.strptime(iso_datetime, format) + try: + dt = datetime.strptime(iso_datetime, format) + except: + format = '%Y-%m-%dT%H:%M:%SZ' + dt = datetime.strptime(iso_datetime, format) unix_timestamp = dt.timestamp() return unix_timestamp diff --git a/toulligqc/fastq_extractor.py b/toulligqc/fastq_extractor.py index 3e73b33..cb565df 100644 --- a/toulligqc/fastq_extractor.py +++ b/toulligqc/fastq_extractor.py @@ -375,10 +375,7 @@ def _extract_info_from_name(self, name): """ """ metadata = dict(x.split("=") for x in name.split(" ")[1:]) - try: - start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%SZ') - except: - start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%S.%f') + start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%S.%f%z') if self.is_barcode: return start_time, metadata['ch'], metadata['barcode'] return start_time, metadata['ch'] \ No newline at end of file