From ca469e6ad029ad75917baab56f4540e9459a81a1 Mon Sep 17 00:00:00 2001 From: Lennart Date: Wed, 1 Aug 2018 14:47:14 +0200 Subject: [PATCH 1/6] Isolated import_bed | Enabled paired-end testing --- wisecondorX/wisetools.py | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/wisecondorX/wisetools.py b/wisecondorX/wisetools.py index 034383d..b9adb09 100755 --- a/wisecondorX/wisetools.py +++ b/wisecondorX/wisetools.py @@ -112,10 +112,10 @@ def flush(read_buff, counts): reads_seen += 1 larp = read.pos - # Flush after we're done - flush(read_buff, counts) - chromosomes[chrom_name] = counts - reads_kept += sum(counts) + # Flush after we're done + flush(read_buff, counts) + chromosomes[chrom_name] = counts + reads_kept += sum(counts) # print reads_seen,reads_kept qual_info = {'mapped': sam_file.mapped, @@ -413,7 +413,6 @@ def get_reference(corrected_data, chromosome_bins, chromosome_bin_sums, def try_sample(test_data, test_copy, indexes, distances, chromosome_bins, chromosome_bin_sums, cutoff): bincount = chromosome_bin_sums[-1] - results_z = np.zeros(bincount) results_r = np.zeros(bincount) ref_sizes = np.zeros(bincount) @@ -584,15 +583,19 @@ def get_median_within_segment_variance(segments, binratios): return np.median([x for x in vars if not np.isnan(x)]) -def apply_blacklist(args, binsize, results_r, results_z, results_w): - blacklist = {} - - for line in open(args.blacklist): +def import_bed(file, binsize): + bed = {} + for line in open(file): bchr, bstart, bstop = line.strip().split("\t") bchr = bchr[3:] - if bchr not in blacklist.keys(): - blacklist[bchr] = [] - blacklist[bchr].append([int(int(bstart) / binsize), int(int(bstop) / binsize) + 1]) + if bchr not in bed.keys(): + bed[bchr] = [] + bed[bchr].append([int(int(bstart) / binsize), int(int(bstop) / binsize) + 1]) + return bed + + +def apply_blacklist(args, binsize, results_r, results_z, results_w): + blacklist = import_bed(args.blacklist, binsize) for chrom in blacklist.keys(): for s_s in blacklist[chrom]: From bd448228ddd5360a3e6c775272947a4e297c20c5 Mon Sep 17 00:00:00 2001 From: Lennart Date: Tue, 28 Aug 2018 12:53:10 +0200 Subject: [PATCH 2/6] Changed the definition of the z-score --- wisecondorX/main.py | 2 +- wisecondorX/wisetools.py | 22 ++++++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/wisecondorX/main.py b/wisecondorX/main.py index 7302a6d..3d6264f 100755 --- a/wisecondorX/main.py +++ b/wisecondorX/main.py @@ -4,7 +4,7 @@ from scipy.stats import norm -from wisecondorX.wisetools import * +from wisetools import * def tool_convert(args): diff --git a/wisecondorX/wisetools.py b/wisecondorX/wisetools.py index b9adb09..cbb580c 100755 --- a/wisecondorX/wisetools.py +++ b/wisecondorX/wisetools.py @@ -522,24 +522,30 @@ def generate_txt_output(args, out_dict): statistics_file = open(args.outid + "_chr_statistics.txt", "w") statistics_file.write("chr\tratio.mean\tratio.median\tzscore\n") chrom_scores = [] + + stouffer_scores = [] for chr_i in range(len(results_r)): + stouffer = np.sum(np.array(results_z[chr_i]) * np.array(results_w[chr_i])) \ + / np.sqrt(np.sum(np.power(np.array(results_w[chr_i]), 2))) + stouffer_scores.append(stouffer) + + for chr_i, stouffer_score in enumerate(stouffer_scores): + + zz_score = (stouffer_score - np_mean(np.delete(stouffer_scores, chr_i)))/np_std(np.delete(stouffer_scores, chr_i)) + chrom = str(chr_i + 1) if chrom == "23": chrom = "X" if chrom == "24": chrom = "Y" R = [x for x in results_r[chr_i] if x != 0] - - stouffer = np.sum(np.array(results_z[chr_i]) * np.array(results_w[chr_i])) \ - / np.sqrt(np.sum(np.power(np.array(results_w[chr_i]), 2))) - chrom_ratio_mean = np.mean(R) chrom_ratio_median = np.median(R) statistics_file.write(str(chrom) + "\t" + str(chrom_ratio_median) + "\t" + str(chrom_ratio_mean) - + "\t" + str(stouffer) + + "\t" + str(zz_score) + "\n") chrom_scores.append(chrom_ratio_mean) @@ -653,9 +659,13 @@ def cbs(args, results_r, results_z, results_w, reference_gender, wc_dir): # Save results + zz_scores = [] + for i, stouffer_score in enumerate(stouffer_scores): + zz_scores.append((stouffer_score - np_mean(np.delete(stouffer_scores, i)))/np_std(np.delete(stouffer_scores, i))) + cbs_calls = [] for cbs_call_index in range(len(cbs_data[0])): cbs_calls.append( [cbs_data[0][cbs_call_index], cbs_data[1][cbs_call_index] - 1, cbs_data[2][cbs_call_index] - 1, - stouffer_scores[cbs_call_index], cbs_data[4][cbs_call_index]]) + zz_scores[cbs_call_index], cbs_data[4][cbs_call_index]]) return cbs_calls From ed282f290820b5a307f7fcfe2d0789aa32047cc1 Mon Sep 17 00:00:00 2001 From: Lennart Date: Tue, 28 Aug 2018 12:58:43 +0200 Subject: [PATCH 3/6] Enabled paired-end processing --- wisecondorX/main.py | 7 ++++++- wisecondorX/wisetools.py | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/wisecondorX/main.py b/wisecondorX/main.py index 3d6264f..b3a1c99 100755 --- a/wisecondorX/main.py +++ b/wisecondorX/main.py @@ -11,8 +11,10 @@ def tool_convert(args): logging.info('Starting conversion') logging.info('Importing data ...') logging.info('Converting bam ... This might take a while ...') + print(args.paired) sample, qual_info = convert_bam(args.infile, binsize=args.binsize, - min_shift=args.retdist, threshold=args.retthres) + min_shift=args.retdist, threshold=args.retthres, demand_pair=args.paired) + if args.gender: gender = args.gender logging.info('Gender {}'.format(gender)) @@ -437,6 +439,9 @@ def main(): type=str, choices=["F", "M"], help='Gender of the case. If not given, WisecondorX will predict it') + parser_convert.add_argument('--paired', + action="store_true", + help='Use paired-end reads | default is single-end') parser_convert.add_argument('--gonmapr', type=float, default=2, diff --git a/wisecondorX/wisetools.py b/wisecondorX/wisetools.py index cbb580c..71271c4 100755 --- a/wisecondorX/wisetools.py +++ b/wisecondorX/wisetools.py @@ -25,7 +25,7 @@ find_pos = bisect.bisect -def convert_bam(bamfile, binsize, min_shift, threshold, mapq=1, demand_pair=False): +def convert_bam(bamfile, binsize, min_shift, threshold, demand_pair, mapq=1): # Prepare the list of chromosomes chromosomes = dict() for chromosome in range(1, 25): From d1685955aedc12b2718206d4937a92b23b00e75e Mon Sep 17 00:00:00 2001 From: Lennart Date: Tue, 28 Aug 2018 13:09:36 +0200 Subject: [PATCH 4/6] Imporved gender determination --- README.md | 3 ++- wisecondorX/main.py | 9 ++++----- wisecondorX/wisetools.py | 13 +------------ 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 4c65a0b..597efc4 100755 --- a/README.md +++ b/README.md @@ -75,7 +75,8 @@ WisecondorX convert input.bam output.npz [--optional arguments] `--retdist x` | Max amount of bp's between reads to consider them part of the same tower (default: x=4) `--retthres x` | Threshold for a group of reads to be considered a tower. These will be removed (default: x=4) `--gender x` | When not used (which is recommended), WisecondorX will predict the gender. Manually assigning gender could be useful for highly aberrant samples (choices: x=F, x=M) -`--gonmapr x` | Represents the overall mappability ratio between X and Y. Concerning short single-end read mapping, an X bin is twice (default) as mappable compared to a Y bin. Used to predict gender (default: x=2) +`--ycutoff x` | A cutoff value representing the ratio 'Y reads/total reads'. Used to predict gender. Might require training for selecting the optimal value (default: x=0.004) +`--paired` | Enables conversion for paired-end reads → Bash recipe (example for NIPT) at `./pipeline/convert.sh` diff --git a/wisecondorX/main.py b/wisecondorX/main.py index b3a1c99..b355bb1 100755 --- a/wisecondorX/main.py +++ b/wisecondorX/main.py @@ -442,12 +442,11 @@ def main(): parser_convert.add_argument('--paired', action="store_true", help='Use paired-end reads | default is single-end') - parser_convert.add_argument('--gonmapr', + parser_convert.add_argument('--ycutoff', type=float, - default=2, - help='The gonosomal mappabality ratio between X and Y. Concerning short single-end ' - 'read mapping, a Y bin is two times (default) less mappable compared to an X bin. ' - 'Used to predict gender') + default=0.004, + help='A cutoff value representing the ratio \'Y reads/total reads\'. ' + 'Used to predict gender. Might require training for selecting the optimal value') parser_convert.set_defaults(func=tool_convert) # Get gender diff --git a/wisecondorX/wisetools.py b/wisecondorX/wisetools.py index 71271c4..020379e 100755 --- a/wisecondorX/wisetools.py +++ b/wisecondorX/wisetools.py @@ -131,20 +131,9 @@ def flush(read_buff, counts): def get_gender(args, sample): tot_reads = float(sum([sum(sample[str(x)]) for x in range(1, 25)])) - X_reads = float(sum(sample["23"])) - X_len = float(len(sample["23"])) Y_reads = float(sum(sample["24"])) - Y_len = float(len(sample["24"])) - X = (X_reads / tot_reads) / X_len * (1. / args.gonmapr) - Y = (Y_reads / tot_reads) / Y_len - - # X/Y = ? - # 1/1 (MALE) = 1 - # 2/noise (FEMALE) = [4,8] - # cut-off 3 -- should be robust vs noise, mosaic large subchromosomal duplication/deletions, and male pregnancies - - if X / Y < 3: + if Y_reads / tot_reads > args.ycutoff: return "M" else: return "F" From 79cef500a78c90bd8025e2ac7a30856e5f9e014a Mon Sep 17 00:00:00 2001 From: Lennart Date: Tue, 28 Aug 2018 13:58:41 +0200 Subject: [PATCH 5/6] Added output interpretation to readme --- README.md | 37 +++++++++++++++++++++++++++++++++++++ wisecondorX/main.py | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 597efc4..d8e7ed9 100755 --- a/README.md +++ b/README.md @@ -155,6 +155,43 @@ fix.convert.npz.py input.npz output.npz To get the (predicted) gender of a sample, one can use `WisecondorX gender input.npz`. +# Interpretation results + +## Plots + +Every dot represents a bin. The dots range across the X-axis from chromosome 1 to X (or Y, in case of a male). The value +of a dot represents the ratio between the observed number of reads and the expected number of reads, the latter being +the 'healthy' state. As these values are log2-transformed, healthy dots should be close-to 0. Importantly, notice that +the dots are always subject to Gaussian noise. Therefore, segments, indicated by horizontal grey bars, cover bins of +predicted equal copy number. Vertical grey bars represent the blacklist, which will match hypervariable loci and repeats. +Finally, the vertical colored dotted lines show where the constitutional 1n and 3n states are expected (when constitutional +DNA is at 100% purity, at least). Often, an aberration does not surpass these limits, which has several potential causes: +depending on your type of analysis, the sample could be subject to tumor fraction, fetal fraction, mosaicisms, etc ... +Sometimes, the segments do surpass these limits: here it's likely you are dealing with 4n, 5n, 6n, ... + +## Tables + +### sample_bins.bed + +This file contains all bin-wise information. When data is 'NaN', the corresponding bin is included in the blacklist. +The Z-scores are calculated using the within-sample reference bins. + +### sample_segments.bed + +This file contains all segment-wise information. A combined Z-score is calculated using Stouffer's approach. The final +z-score quantifies the difference between a certain segment and the others segments. Note that this score thus does not +differentiate significant from non-significant 'aberrations'. This score could be particularly interesting for NIPT, +where none (healthy) or one (e.g. tri21) aberrations are often expected. + +### sample_aberrations.bed + +This file contains segments with a ratio surpassing a certain cutoff value, defined by the `--beta` parameter. + +### sample_chr_statistics.bed + +This file contains some interesting statistics for each chromosome. The definition of the Z-scores matches the one from +the 'sample_segments.bed'. Might be interesting for NIPT. + # Dependencies - R (v3.4) packages diff --git a/wisecondorX/main.py b/wisecondorX/main.py index b355bb1..561d113 100755 --- a/wisecondorX/main.py +++ b/wisecondorX/main.py @@ -4,7 +4,7 @@ from scipy.stats import norm -from wisetools import * +from wisecondorX.wisetools import * def tool_convert(args): From 086128e6526679b00b71115a7ba6a74292ec2a9b Mon Sep 17 00:00:00 2001 From: Matthias De Smet Date: Fri, 31 Aug 2018 14:56:01 +0200 Subject: [PATCH 6/6] bumped subversion --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 16e7ef9..e9171cb 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ #! /usr/bin/env python from setuptools import setup, find_packages -version = '0.2.0' +version = '0.2.1' dl_version = 'master' if 'dev' in version else '{}'.format(version) setup(