diff --git a/README.md b/README.md index 61e8f0c..d18e7db 100644 --- a/README.md +++ b/README.md @@ -53,18 +53,31 @@ $ cd toulligqc && python3 setup.py build install ToulligQC is written with Python 3. To run ToulligQC without Docker, you need to install the following Python modules: -* matplotlib -* plotly -* h5py + * matplotlib + * plotly + * h5py * pandas * numpy * scipy * scikit-learn * pysam +* tqdm +* pod5 + +### 1.2 Conda environemnt** + +You can use a conda environment to install the required packages: + +``` +git clone https://github.com/GenomicParisCentre/toulligQC.git +cd toulligqc && python3 setup.py build install +conda env create -f environment.yml +conda activate toulliqc +``` -### 1.2 Using a PyPi package +### 1.3 Using a PyPi package ToulligQC can be more easlily installed with a pip package availlable on the PyPi repository. The following command line will install the latest version of ToulligQC: ```bash @@ -72,7 +85,7 @@ $ pip3 install toulligqc ``` -### 1.3 Using Docker +### 1.4 Using Docker ToulligQC and its dependencies are available through a Docker image. To install docker on your system, go to the Docker website (). Even if Docker can run on Windows or macOS virtual machines, we recommend to run ToulligQC on a Linux host. diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..55cca46 --- /dev/null +++ b/environment.yml @@ -0,0 +1,56 @@ +name: toulligqc +channels: + - defaults +dependencies: + - _libgcc_mutex=0.1 + - _openmp_mutex=5.1 + - bzip2=1.0.8 + - ca-certificates=2024.7.2 + - ld_impl_linux-64=2.38 + - libffi=3.4.4 + - libgcc-ng=11.2.0 + - libgomp=11.2.0 + - libstdcxx-ng=11.2.0 + - libuuid=1.41.5 + - ncurses=6.4 + - openssl=3.0.14 + - pip=24.0 + - python=3.11.9 + - readline=8.2 + - setuptools=69.5.1 + - sqlite=3.45.3 + - tk=8.6.14 + - wheel=0.43.0 + - xz=5.4.6 + - zlib=1.2.13 + - pip: + - contourpy==1.2.1 + - cycler==0.12.1 + - fonttools==4.53.1 + - h5py==3.11.0 + - iso8601==2.1.0 + - joblib==1.4.2 + - kiwisolver==1.4.5 + - lib-pod5==0.3.12 + - matplotlib==3.9.1 + - more-itertools==10.3.0 + - numpy==2.0.0 + - packaging==24.1 + - pandas==1.26.4 + - pillow==10.4.0 + - plotly==5.22.0 + - pod5==0.3.12 + - polars==0.20.31 + - pyarrow==16.1.0 + - pyparsing==3.1.2 + - pysam==0.22.1 + - python-dateutil==2.9.0.post0 + - pytz==2024.1 + - scikit-learn==1.5.1 + - scipy==1.14.0 + - six==1.16.0 + - tenacity==8.5.0 + - threadpoolctl==3.5.0 + - tqdm==4.66.4 + - tzdata==2024.1 + - vbz-h5py-plugin==1.0.1 diff --git a/toulligqc/common_statistics.py b/toulligqc/common_statistics.py index 67198d6..4a59905 100644 --- a/toulligqc/common_statistics.py +++ b/toulligqc/common_statistics.py @@ -18,7 +18,7 @@ def compute_LXX(dataframe_dict, x): cum_sum = 0 count = 0 for v in data: - cum_sum += v + cum_sum += int(v) count += 1 if cum_sum >= half_sum: return count @@ -31,7 +31,7 @@ def compute_NXX(dataframe_dict, x): half_sum = data.sum() * x / 100 cum_sum = 0 for v in data: - cum_sum += v + cum_sum += int(v) if cum_sum >= half_sum: return int(v) diff --git a/toulligqc/extractor_common.py b/toulligqc/extractor_common.py index cacfebd..33aab14 100644 --- a/toulligqc/extractor_common.py +++ b/toulligqc/extractor_common.py @@ -432,7 +432,11 @@ def add_image_to_result(quiet, image_list, start_time, image): def timeISO_to_float(iso_datetime, format): """ """ - dt = datetime.strptime(iso_datetime, format) + try: + dt = datetime.strptime(iso_datetime, format) + except: + format = '%Y-%m-%dT%H:%M:%SZ' + dt = datetime.strptime(iso_datetime, format) unix_timestamp = dt.timestamp() return unix_timestamp diff --git a/toulligqc/fastq_extractor.py b/toulligqc/fastq_extractor.py index ab74da6..cb565df 100644 --- a/toulligqc/fastq_extractor.py +++ b/toulligqc/fastq_extractor.py @@ -119,32 +119,45 @@ def graph_generation(self, result_dict): add_image_to_result(self.quiet, images, time.time(), pgg.read_count_histogram(result_dict, self.images_directory)) add_image_to_result(self.quiet, images, time.time(), pgg.read_length_scatterplot(self.dataframe_dict, self.images_directory)) + if self.rich: add_image_to_result(self.quiet, images, time.time(), pgg.yield_plot(self.dataframe_1d, self.images_directory)) add_image_to_result(self.quiet, images, time.time(), pgg.read_quality_multiboxplot(self.dataframe_dict, self.images_directory)) add_image_to_result(self.quiet, images, time.time(), pgg.allphred_score_frequency(self.dataframe_dict, self.images_directory)) + if self.rich: add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.dataframe_1d, self.images_directory)) add_image_to_result(self.quiet, images, time.time(), pgg.twod_density(self.dataframe_dict, self.images_directory)) + if self.rich: add_image_to_result(self.quiet, images, time.time(), pgg.sequence_length_over_time(self.dataframe_dict, self.images_directory)) add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory)) - if self.is_barcode: - add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict, - self.barcode_selection, - self.images_directory)) - read_fail = self.dataframe_dict["read.fail.barcoded"] - if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0): - add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict, - self.barcode_selection, - self.images_directory)) - - add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict, - self.images_directory)) - - add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict, - self.images_directory)) + if self.is_barcode: + if "barcode_alias" in self.config_dictionary: + barcode_alias = self.config_dictionary['barcode_alias'] + else: + barcode_alias = None + + add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict, + self.barcode_selection, + self.images_directory, + barcode_alias)) + + read_fail = self.dataframe_dict["read.fail.barcoded"] + if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0): + add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict, + self.barcode_selection, + self.images_directory, + barcode_alias)) + + add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict, + self.images_directory, + barcode_alias)) + + add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict, + self.images_directory, + barcode_alias)) return images @@ -211,7 +224,7 @@ def extract(self, result_dict): "pass.reads.sequence.length") describe_dict(self, result_dict, self.dataframe_dict["fail.reads.sequence.length"], "fail.reads.sequence.length") - if self.is_barcode: + if self.rich and self.is_barcode: extract_barcode_info(self, result_dict, self.barcode_selection, self.dataframe_dict, @@ -258,8 +271,9 @@ def _load_fastq_data(self): columns = ['sequence_length', 'mean_qscore', 'passes_filtering'] if self.rich: columns.extend(['start_time', 'channel']) - if self.is_barcode: - columns.append('barcode_arrangement') + + if self.is_barcode: + columns.append('barcode_arrangement') fq_df = pd.DataFrame(fq_df, columns=columns) @@ -271,8 +285,10 @@ def _load_fastq_data(self): fq_df["start_time"] = fq_df["start_time"] - fq_df["start_time"].min() fq_df['start_time'] = fq_df['start_time'].astype(np.float64) fq_df['channel'] = fq_df['channel'].astype(np.int16) - if self.is_barcode: - fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category") + + if self.is_barcode: + fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category") + return fq_df @@ -346,8 +362,11 @@ def check_fastq(self): self.is_barcode = False if 'model_version_id' not in metadata: metadata['model_version_id'] = 'Unknow' + run_info = [] try: - return metadata['runid'] , metadata['sampleid'] , metadata['model_version_id'] + sample_id = 'sample_id' if 'sample_id' in metadata else 'sampleid' + run_id = 'run_id' if 'run_id' in metadata else 'runid' + return metadata[run_id] , metadata[sample_id] , metadata['model_version_id'] except: return None @@ -356,7 +375,7 @@ def _extract_info_from_name(self, name): """ """ metadata = dict(x.split("=") for x in name.split(" ")[1:]) - start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%SZ') + start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%S.%f%z') if self.is_barcode: return start_time, metadata['ch'], metadata['barcode'] return start_time, metadata['ch'] \ No newline at end of file diff --git a/toulligqc/toulligqc.py b/toulligqc/toulligqc.py index 01ade2d..d947863 100644 --- a/toulligqc/toulligqc.py +++ b/toulligqc/toulligqc.py @@ -352,17 +352,25 @@ def main(): sys.exit("ERROR: dico_path is empty") # Get barcode selection + allowed_patterns = r'(BC|RB|NB|BP|BARCODE)(\d{2})' + if config_dictionary['barcoding'].lower() == 'true': config_dictionary['barcode_selection'] = [] - if 'barcodes' in config_dictionary: + if 'samplesheet' in config_dictionary: + samplesheet = parse_samplesheet(config_dictionary['samplesheet']) + config_dictionary['barcodes'] = ",".join(list(samplesheet['barcode'])) + config_dictionary['barcode_alias'] = pd.Series(samplesheet.alias.values, + index=samplesheet.barcode).to_dict() + + if 'barcodes' in config_dictionary or 'samplesheet' in config_dictionary: barcode_set = set() if ":" in config_dictionary['barcodes']: start, end = config_dictionary['barcodes'].strip().split(':') - pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', start.strip().upper()) + pattern = re.search(allowed_patterns, start.strip().upper()) if pattern: start_number = int(pattern.group(2)) - pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', end.strip().upper()) + pattern = re.search(allowed_patterns, end.strip().upper()) if pattern: end_number = int(pattern.group(2)) for i in range(start_number, end_number + 1): @@ -371,13 +379,15 @@ def main(): else: for b in config_dictionary['barcodes'].strip().split(','): - pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', b.strip().upper()) + pattern = re.search(allowed_patterns, b.strip().upper()) if pattern: barcode = 'barcode{}'.format(pattern.group(2)) barcode_set.add(barcode) else: sys.stderr.write("\033[93mWarning:\033[0m Barcode '{}' is non-standard custom arrangement.\n".format(b)) barcode_set.add(b) + if 'samplesheet' in config_dictionary: + config_dictionary['barcode_alias'][barcode] = config_dictionary['barcode_alias'].pop(b) barcode_selection = sorted(barcode_set) @@ -385,12 +395,6 @@ def main(): sys.exit("ERROR: No known barcode found in provided list of barcodes") config_dictionary['barcode_selection'] = barcode_selection - elif 'samplesheet' in config_dictionary: - samplesheet = parse_samplesheet(config_dictionary['samplesheet']) - config_dictionary['barcode_selection'] = list(samplesheet['barcode']) - config_dictionary['barcode_alias'] = pd.Series(samplesheet.alias.values, - index=samplesheet.barcode).to_dict() - else: config_dictionary['barcode_selection'] = ''