Merge pull request #17 from CenterForMedicalGeneticsGhent/alt.chr

Alt.chr
CenterForMedicalGeneticsGhent · Aug 31, 2018 · ce261de · ce261de
2 parents 5a51436 + 086128e
commit ce261de
Show file tree

Hide file tree

Showing 4 changed files with 83 additions and 39 deletions.
diff --git a/README.md b/README.md
@@ -75,7 +75,8 @@ WisecondorX convert input.bam output.npz [--optional arguments]
 `--retdist x` | Max amount of bp's between reads to consider them part of the same tower (default: x=4)  
 `--retthres x` | Threshold for a group of reads to be considered a tower. These will be removed (default: x=4)  
 `--gender x` | When not used (which is recommended), WisecondorX will predict the gender. Manually assigning gender could be useful for highly aberrant samples (choices: x=F, x=M)  
-`--gonmapr x` | Represents the overall mappability ratio between X and Y. Concerning short single-end read mapping, an X bin is twice (default) as mappable compared to a Y bin. Used to predict gender (default: x=2)  
+`--ycutoff x` | A cutoff value representing the ratio 'Y reads/total reads'. Used to predict gender. Might require training for selecting the optimal value (default: x=0.004)
+`--paired` | Enables conversion for paired-end reads  
 
 
 &rarr; Bash recipe (example for NIPT) at `./pipeline/convert.sh`
@@ -154,6 +155,43 @@ fix.convert.npz.py input.npz output.npz
 
 To get the (predicted) gender of a sample, one can use `WisecondorX gender input.npz`.  
 
+# Interpretation results
+
+## Plots
+
+Every dot represents a bin. The dots range across the X-axis from chromosome 1 to X (or Y, in case of a male). The value 
+of a dot represents the ratio between the observed number of reads and the expected number of reads, the latter being 
+the 'healthy' state. As these values are log2-transformed, healthy dots should be close-to 0. Importantly, notice that 
+the dots are always subject to Gaussian noise. Therefore, segments, indicated by horizontal grey bars, cover bins of 
+predicted equal copy number. Vertical grey bars represent the blacklist, which will match hypervariable loci and repeats. 
+Finally, the vertical colored dotted lines show where the constitutional 1n and 3n states are expected (when constitutional
+DNA is at 100% purity, at least). Often, an aberration does not surpass these limits, which has several potential causes:
+depending on your type of analysis, the sample could be subject to tumor fraction, fetal fraction, mosaicisms, etc ...
+Sometimes, the segments do surpass these limits: here it's likely you are dealing with 4n, 5n, 6n, ...
+
+## Tables
+
+### sample_bins.bed
+
+This file contains all bin-wise information. When data is 'NaN', the corresponding bin is included in the blacklist. 
+The Z-scores are calculated using the within-sample reference bins.
+
+### sample_segments.bed
+
+This file contains all segment-wise information. A combined Z-score is calculated using Stouffer's approach. The final 
+z-score quantifies the difference between a certain segment and the others segments. Note that this score thus does not 
+differentiate significant from non-significant 'aberrations'. This score could be particularly interesting for NIPT, 
+where none (healthy) or one (e.g. tri21) aberrations are often expected.
+
+### sample_aberrations.bed
+
+This file contains segments with a ratio surpassing a certain cutoff value, defined by the `--beta` parameter.
+
+### sample_chr_statistics.bed
+
+This file contains some interesting statistics for each chromosome. The definition of the Z-scores matches the one from
+the 'sample_segments.bed'. Might be interesting for NIPT.
+
 # Dependencies
 
 - R (v3.4) packages

diff --git a/setup.py b/setup.py
@@ -1,7 +1,7 @@
 #! /usr/bin/env python
 from setuptools import setup, find_packages
 
-version = '0.2.0'
+version = '0.2.1'
 dl_version = 'master' if 'dev' in version else '{}'.format(version)
 
 setup(

diff --git a/wisecondorX/main.py b/wisecondorX/main.py
@@ -11,8 +11,10 @@ def tool_convert(args):
     logging.info('Starting conversion')
     logging.info('Importing data ...')
     logging.info('Converting bam ... This might take a while ...')
+    print(args.paired)
     sample, qual_info = convert_bam(args.infile, binsize=args.binsize,
-                                    min_shift=args.retdist, threshold=args.retthres)
+                                    min_shift=args.retdist, threshold=args.retthres, demand_pair=args.paired)
+
     if args.gender:
         gender = args.gender
         logging.info('Gender {}'.format(gender))
@@ -437,12 +439,14 @@ def main():
                                 type=str,
                                 choices=["F", "M"],
                                 help='Gender of the case. If not given, WisecondorX will predict it')
-    parser_convert.add_argument('--gonmapr',
+    parser_convert.add_argument('--paired',
+                                action="store_true",
+                                help='Use paired-end reads | default is single-end')
+    parser_convert.add_argument('--ycutoff',
                                 type=float,
-                                default=2,
-                                help='The gonosomal mappabality ratio between X and Y. Concerning short single-end '
-                                     'read mapping, a Y bin is two times (default) less mappable compared to an X bin. '
-                                     'Used to predict gender')
+                                default=0.004,
+                                help='A cutoff value representing the ratio \'Y reads/total reads\'. '
+                                     'Used to predict gender. Might require training for selecting the optimal value')
     parser_convert.set_defaults(func=tool_convert)
 
     # Get gender

diff --git a/wisecondorX/wisetools.py b/wisecondorX/wisetools.py
@@ -25,7 +25,7 @@
 find_pos = bisect.bisect
 
 
-def convert_bam(bamfile, binsize, min_shift, threshold, mapq=1, demand_pair=False):
+def convert_bam(bamfile, binsize, min_shift, threshold, demand_pair, mapq=1):
     # Prepare the list of chromosomes
     chromosomes = dict()
     for chromosome in range(1, 25):
@@ -112,10 +112,10 @@ def flush(read_buff, counts):
                 reads_seen += 1
                 larp = read.pos
 
-            # Flush after we're done
-            flush(read_buff, counts)
-            chromosomes[chrom_name] = counts
-            reads_kept += sum(counts)
+        # Flush after we're done
+        flush(read_buff, counts)
+        chromosomes[chrom_name] = counts
+        reads_kept += sum(counts)
 
     # print reads_seen,reads_kept
     qual_info = {'mapped': sam_file.mapped,
@@ -131,20 +131,9 @@ def flush(read_buff, counts):
 
 def get_gender(args, sample):
     tot_reads = float(sum([sum(sample[str(x)]) for x in range(1, 25)]))
-    X_reads = float(sum(sample["23"]))
-    X_len = float(len(sample["23"]))
     Y_reads = float(sum(sample["24"]))
-    Y_len = float(len(sample["24"]))
 
-    X = (X_reads / tot_reads) / X_len * (1. / args.gonmapr)
-    Y = (Y_reads / tot_reads) / Y_len
-
-    # X/Y               = ?
-    # 1/1 (MALE)        = 1
-    # 2/noise (FEMALE)  = [4,8]
-    # cut-off 3 -- should be robust vs noise, mosaic large subchromosomal duplication/deletions, and male pregnancies
-
-    if X / Y < 3:
+    if Y_reads / tot_reads > args.ycutoff:
         return "M"
     else:
         return "F"
@@ -413,7 +402,6 @@ def get_reference(corrected_data, chromosome_bins, chromosome_bin_sums,
 def try_sample(test_data, test_copy, indexes, distances, chromosome_bins,
                chromosome_bin_sums, cutoff):
     bincount = chromosome_bin_sums[-1]
-
     results_z = np.zeros(bincount)
     results_r = np.zeros(bincount)
     ref_sizes = np.zeros(bincount)
@@ -523,24 +511,30 @@ def generate_txt_output(args, out_dict):
     statistics_file = open(args.outid + "_chr_statistics.txt", "w")
     statistics_file.write("chr\tratio.mean\tratio.median\tzscore\n")
     chrom_scores = []
+
+    stouffer_scores = []
     for chr_i in range(len(results_r)):
+        stouffer = np.sum(np.array(results_z[chr_i]) * np.array(results_w[chr_i])) \
+                   / np.sqrt(np.sum(np.power(np.array(results_w[chr_i]), 2)))
+        stouffer_scores.append(stouffer)
+
+    for chr_i, stouffer_score in enumerate(stouffer_scores):
+
+        zz_score = (stouffer_score - np_mean(np.delete(stouffer_scores, chr_i)))/np_std(np.delete(stouffer_scores, chr_i))
+
         chrom = str(chr_i + 1)
         if chrom == "23":
             chrom = "X"
         if chrom == "24":
             chrom = "Y"
         R = [x for x in results_r[chr_i] if x != 0]
-
-        stouffer = np.sum(np.array(results_z[chr_i]) * np.array(results_w[chr_i])) \
-                   / np.sqrt(np.sum(np.power(np.array(results_w[chr_i]), 2)))
-
         chrom_ratio_mean = np.mean(R)
         chrom_ratio_median = np.median(R)
 
         statistics_file.write(str(chrom)
                               + "\t" + str(chrom_ratio_median)
                               + "\t" + str(chrom_ratio_mean)
-                              + "\t" + str(stouffer)
+                              + "\t" + str(zz_score)
                               + "\n")
         chrom_scores.append(chrom_ratio_mean)
 
@@ -584,15 +578,19 @@ def get_median_within_segment_variance(segments, binratios):
     return np.median([x for x in vars if not np.isnan(x)])
 
 
-def apply_blacklist(args, binsize, results_r, results_z, results_w):
-    blacklist = {}
-
-    for line in open(args.blacklist):
+def import_bed(file, binsize):
+    bed = {}
+    for line in open(file):
         bchr, bstart, bstop = line.strip().split("\t")
         bchr = bchr[3:]
-        if bchr not in blacklist.keys():
-            blacklist[bchr] = []
-        blacklist[bchr].append([int(int(bstart) / binsize), int(int(bstop) / binsize) + 1])
+        if bchr not in bed.keys():
+            bed[bchr] = []
+            bed[bchr].append([int(int(bstart) / binsize), int(int(bstop) / binsize) + 1])
+    return bed
+
+
+def apply_blacklist(args, binsize, results_r, results_z, results_w):
+    blacklist = import_bed(args.blacklist, binsize)
 
     for chrom in blacklist.keys():
         for s_s in blacklist[chrom]:
@@ -650,9 +648,13 @@ def cbs(args, results_r, results_z, results_w, reference_gender, wc_dir):
 
     # Save results
 
+    zz_scores = []
+    for i, stouffer_score in enumerate(stouffer_scores):
+        zz_scores.append((stouffer_score - np_mean(np.delete(stouffer_scores, i)))/np_std(np.delete(stouffer_scores, i)))
+
     cbs_calls = []
     for cbs_call_index in range(len(cbs_data[0])):
         cbs_calls.append(
             [cbs_data[0][cbs_call_index], cbs_data[1][cbs_call_index] - 1, cbs_data[2][cbs_call_index] - 1,
-             stouffer_scores[cbs_call_index], cbs_data[4][cbs_call_index]])
+             zz_scores[cbs_call_index], cbs_data[4][cbs_call_index]])
     return cbs_calls