Skip to content

Commit

Permalink
Merge pull request #31 from GenomiqueENS/v2.7.1
Browse files Browse the repository at this point in the history
V2.7.1
  • Loading branch information
alihamraoui authored Aug 22, 2024
2 parents 39e9663 + 4043ad6 commit 4024de3
Show file tree
Hide file tree
Showing 6 changed files with 136 additions and 40 deletions.
23 changes: 18 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,26 +53,39 @@ $ cd toulligqc && python3 setup.py build install
ToulligQC is written with Python 3.
To run ToulligQC without Docker, you need to install the following Python modules:

* matplotlib
* plotly
* h5py
* matplotlib
* plotly
* h5py
* pandas
* numpy
* scipy
* scikit-learn
* pysam
* tqdm
* pod5

<a name="Conda-environemnt"></a>
### 1.2 Conda environemnt**

You can use a conda environment to install the required packages:

```
git clone https://github.com/GenomicParisCentre/toulligQC.git
cd toulligqc && python3 setup.py build install
conda env create -f environment.yml
conda activate toulliqc
```

<a name="pypi-installation"></a>
### 1.2 Using a PyPi package
### 1.3 Using a PyPi package

ToulligQC can be more easlily installed with a pip package availlable on the PyPi repository. The following command line will install the latest version of ToulligQC:
```bash
$ pip3 install toulligqc
```

<a name="docker"></a>
### 1.3 Using Docker
### 1.4 Using Docker
ToulligQC and its dependencies are available through a Docker image. To install docker on your system, go to the Docker website (<https://docs.docker.com/engine/installation/>).
Even if Docker can run on Windows or macOS virtual machines, we recommend to run ToulligQC on a Linux host.
<a name="docker-image-recovery"></a>
Expand Down
56 changes: 56 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
name: toulligqc
channels:
- defaults
dependencies:
- _libgcc_mutex=0.1
- _openmp_mutex=5.1
- bzip2=1.0.8
- ca-certificates=2024.7.2
- ld_impl_linux-64=2.38
- libffi=3.4.4
- libgcc-ng=11.2.0
- libgomp=11.2.0
- libstdcxx-ng=11.2.0
- libuuid=1.41.5
- ncurses=6.4
- openssl=3.0.14
- pip=24.0
- python=3.11.9
- readline=8.2
- setuptools=69.5.1
- sqlite=3.45.3
- tk=8.6.14
- wheel=0.43.0
- xz=5.4.6
- zlib=1.2.13
- pip:
- contourpy==1.2.1
- cycler==0.12.1
- fonttools==4.53.1
- h5py==3.11.0
- iso8601==2.1.0
- joblib==1.4.2
- kiwisolver==1.4.5
- lib-pod5==0.3.12
- matplotlib==3.9.1
- more-itertools==10.3.0
- numpy==2.0.0
- packaging==24.1
- pandas==1.26.4
- pillow==10.4.0
- plotly==5.22.0
- pod5==0.3.12
- polars==0.20.31
- pyarrow==16.1.0
- pyparsing==3.1.2
- pysam==0.22.1
- python-dateutil==2.9.0.post0
- pytz==2024.1
- scikit-learn==1.5.1
- scipy==1.14.0
- six==1.16.0
- tenacity==8.5.0
- threadpoolctl==3.5.0
- tqdm==4.66.4
- tzdata==2024.1
- vbz-h5py-plugin==1.0.1
4 changes: 2 additions & 2 deletions toulligqc/common_statistics.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def compute_LXX(dataframe_dict, x):
cum_sum = 0
count = 0
for v in data:
cum_sum += v
cum_sum += int(v)
count += 1
if cum_sum >= half_sum:
return count
Expand All @@ -31,7 +31,7 @@ def compute_NXX(dataframe_dict, x):
half_sum = data.sum() * x / 100
cum_sum = 0
for v in data:
cum_sum += v
cum_sum += int(v)
if cum_sum >= half_sum:
return int(v)

Expand Down
6 changes: 5 additions & 1 deletion toulligqc/extractor_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,11 @@ def add_image_to_result(quiet, image_list, start_time, image):
def timeISO_to_float(iso_datetime, format):
"""
"""
dt = datetime.strptime(iso_datetime, format)
try:
dt = datetime.strptime(iso_datetime, format)
except:
format = '%Y-%m-%dT%H:%M:%SZ'
dt = datetime.strptime(iso_datetime, format)
unix_timestamp = dt.timestamp()
return unix_timestamp

Expand Down
63 changes: 41 additions & 22 deletions toulligqc/fastq_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,32 +119,45 @@ def graph_generation(self, result_dict):

add_image_to_result(self.quiet, images, time.time(), pgg.read_count_histogram(result_dict, self.images_directory))
add_image_to_result(self.quiet, images, time.time(), pgg.read_length_scatterplot(self.dataframe_dict, self.images_directory))

if self.rich:
add_image_to_result(self.quiet, images, time.time(), pgg.yield_plot(self.dataframe_1d, self.images_directory))
add_image_to_result(self.quiet, images, time.time(), pgg.read_quality_multiboxplot(self.dataframe_dict, self.images_directory))
add_image_to_result(self.quiet, images, time.time(), pgg.allphred_score_frequency(self.dataframe_dict, self.images_directory))

if self.rich:
add_image_to_result(self.quiet, images, time.time(), pgg.plot_performance(self.dataframe_1d, self.images_directory))
add_image_to_result(self.quiet, images, time.time(), pgg.twod_density(self.dataframe_dict, self.images_directory))

if self.rich:
add_image_to_result(self.quiet, images, time.time(), pgg.sequence_length_over_time(self.dataframe_dict, self.images_directory))
add_image_to_result(self.quiet, images, time.time(), pgg.phred_score_over_time(self.dataframe_dict, result_dict, self.images_directory))
if self.is_barcode:
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
self.barcode_selection,
self.images_directory))

read_fail = self.dataframe_dict["read.fail.barcoded"]
if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
self.barcode_selection,
self.images_directory))

add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
self.images_directory))

add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
self.images_directory))
if self.is_barcode:
if "barcode_alias" in self.config_dictionary:
barcode_alias = self.config_dictionary['barcode_alias']
else:
barcode_alias = None

add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_pass(self.dataframe_dict,
self.barcode_selection,
self.images_directory,
barcode_alias))

read_fail = self.dataframe_dict["read.fail.barcoded"]
if not (len(read_fail) == 1 and read_fail["other barcodes"] == 0):
add_image_to_result(self.quiet, images, time.time(), pgg.barcode_percentage_pie_chart_fail(self.dataframe_dict,
self.barcode_selection,
self.images_directory,
barcode_alias))

add_image_to_result(self.quiet, images, time.time(), pgg.barcode_length_boxplot(self.dataframe_dict,
self.images_directory,
barcode_alias))

add_image_to_result(self.quiet, images, time.time(), pgg.barcoded_phred_score_frequency(self.dataframe_dict,
self.images_directory,
barcode_alias))
return images


Expand Down Expand Up @@ -211,7 +224,7 @@ def extract(self, result_dict):
"pass.reads.sequence.length")
describe_dict(self, result_dict, self.dataframe_dict["fail.reads.sequence.length"],
"fail.reads.sequence.length")
if self.is_barcode:
if self.rich and self.is_barcode:
extract_barcode_info(self, result_dict,
self.barcode_selection,
self.dataframe_dict,
Expand Down Expand Up @@ -258,8 +271,9 @@ def _load_fastq_data(self):
columns = ['sequence_length', 'mean_qscore', 'passes_filtering']
if self.rich:
columns.extend(['start_time', 'channel'])
if self.is_barcode:
columns.append('barcode_arrangement')

if self.is_barcode:
columns.append('barcode_arrangement')

fq_df = pd.DataFrame(fq_df, columns=columns)

Expand All @@ -271,8 +285,10 @@ def _load_fastq_data(self):
fq_df["start_time"] = fq_df["start_time"] - fq_df["start_time"].min()
fq_df['start_time'] = fq_df['start_time'].astype(np.float64)
fq_df['channel'] = fq_df['channel'].astype(np.int16)
if self.is_barcode:
fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category")

if self.is_barcode:
fq_df['barcode_arrangement'] = fq_df['barcode_arrangement'].astype("category")

return fq_df


Expand Down Expand Up @@ -346,8 +362,11 @@ def check_fastq(self):
self.is_barcode = False
if 'model_version_id' not in metadata:
metadata['model_version_id'] = 'Unknow'
run_info = []
try:
return metadata['runid'] , metadata['sampleid'] , metadata['model_version_id']
sample_id = 'sample_id' if 'sample_id' in metadata else 'sampleid'
run_id = 'run_id' if 'run_id' in metadata else 'runid'
return metadata[run_id] , metadata[sample_id] , metadata['model_version_id']
except:
return None

Expand All @@ -356,7 +375,7 @@ def _extract_info_from_name(self, name):
"""
"""
metadata = dict(x.split("=") for x in name.split(" ")[1:])
start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%SZ')
start_time = timeISO_to_float(metadata['start_time'], '%Y-%m-%dT%H:%M:%S.%f%z')
if self.is_barcode:
return start_time, metadata['ch'], metadata['barcode']
return start_time, metadata['ch']
24 changes: 14 additions & 10 deletions toulligqc/toulligqc.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,17 +352,25 @@ def main():
sys.exit("ERROR: dico_path is empty")

# Get barcode selection
allowed_patterns = r'(BC|RB|NB|BP|BARCODE)(\d{2})'

if config_dictionary['barcoding'].lower() == 'true':
config_dictionary['barcode_selection'] = []

if 'barcodes' in config_dictionary:
if 'samplesheet' in config_dictionary:
samplesheet = parse_samplesheet(config_dictionary['samplesheet'])
config_dictionary['barcodes'] = ",".join(list(samplesheet['barcode']))
config_dictionary['barcode_alias'] = pd.Series(samplesheet.alias.values,
index=samplesheet.barcode).to_dict()

if 'barcodes' in config_dictionary or 'samplesheet' in config_dictionary:
barcode_set = set()
if ":" in config_dictionary['barcodes']:
start, end = config_dictionary['barcodes'].strip().split(':')
pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', start.strip().upper())
pattern = re.search(allowed_patterns, start.strip().upper())
if pattern:
start_number = int(pattern.group(2))
pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', end.strip().upper())
pattern = re.search(allowed_patterns, end.strip().upper())
if pattern:
end_number = int(pattern.group(2))
for i in range(start_number, end_number + 1):
Expand All @@ -371,26 +379,22 @@ def main():

else:
for b in config_dictionary['barcodes'].strip().split(','):
pattern = re.search(r'(BC|RB|NB|BP|BARCODE)(\d{2})', b.strip().upper())
pattern = re.search(allowed_patterns, b.strip().upper())
if pattern:
barcode = 'barcode{}'.format(pattern.group(2))
barcode_set.add(barcode)
else:
sys.stderr.write("\033[93mWarning:\033[0m Barcode '{}' is non-standard custom arrangement.\n".format(b))
barcode_set.add(b)
if 'samplesheet' in config_dictionary:
config_dictionary['barcode_alias'][barcode] = config_dictionary['barcode_alias'].pop(b)

barcode_selection = sorted(barcode_set)

if len(barcode_selection) == 0:
sys.exit("ERROR: No known barcode found in provided list of barcodes")
config_dictionary['barcode_selection'] = barcode_selection

elif 'samplesheet' in config_dictionary:
samplesheet = parse_samplesheet(config_dictionary['samplesheet'])
config_dictionary['barcode_selection'] = list(samplesheet['barcode'])
config_dictionary['barcode_alias'] = pd.Series(samplesheet.alias.values,
index=samplesheet.barcode).to_dict()

else:
config_dictionary['barcode_selection'] = ''

Expand Down

0 comments on commit 4024de3

Please sign in to comment.