From 39b6adf5d89d6bab29d74cce4603f147c2fd5946 Mon Sep 17 00:00:00 2001
From: mgymrek
Date: Sat, 16 Jan 2016 22:07:06 -0500
Subject: [PATCH 1/4] adding script to generate reference bed

---
 scripts/GenerateLobSTRRefBed.py | 685 ++++++++++++++++++++++++++++++++
 1 file changed, 685 insertions(+)
 create mode 100755 scripts/GenerateLobSTRRefBed.py

diff --git a/scripts/GenerateLobSTRRefBed.py b/scripts/GenerateLobSTRRefBed.py
new file mode 100755
index 0000000..1732909
--- /dev/null
+++ b/scripts/GenerateLobSTRRefBed.py
@@ -0,0 +1,685 @@
+#!/usr/bin/env python
+"""
+python GenerateLobSTRRefBed.py <reffasta> <outfile>
+"""
+
+import collections
+import glob
+import os
+import pyfasta
+import sys
+import tempfile
+from subprocess import Popen, PIPE, STDOUT
+
+######################
+# TRF params - at some point could be made options
+TRF_MIN_SCORE = 5
+TRF_MATCH_WT = 2
+TRF_MISMATCH_PEN = 7
+TRF_INDEL_PEN = 7
+TRF_P_MATCH = 80
+TRF_P_INDEL = 10
+TRF_MAX_PERIOD = 500
+
+# For fixing TRF output
+MAX_PERIOD = 6
+PATTERN_INDEX = 13
+PERIOD_INDEX = 2
+NREPEAT_INDEX = 3
+CONS_SIZE_INDEX = 4
+START_INDEX = 0
+STOP_INDEX = 1
+SCORE_INDEX = 7
+NUM_TOKENS = 15
+
+# For parsing TRF output
+PARSE_NUM_TOKENS = 15
+PARSE_SCORE_INDEX = 7
+PARSE_PERIOD_INDEX = 4
+PARSE_START_INDEX = 0
+PARSE_STOP_INDEX = 1
+PARSE_NREPEAT_INDEX = 3
+
+######################
+
+def PROGRESS(msg):
+    sys.stderr.write(msg.strip() + "\n")
+
+def ERROR(msg):
+    sys.stderr.write(msg.strip() + "\n")
+    sys.exit(1)
+
+def RunCommand(cmd):
+    p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, \
+                  stderr=STDOUT, close_fds=True)
+    ex = p.wait()
+    if ex != 0:
+        stdout, stderr = "", ""
+        if p.stdout is not None: stdout = p.stdout.read()
+        if p.stderr is not None: stderr = p.stderr.read()
+    return ex
+
+def CheckProgram(program):
+    """ Check whether a program is installed """
+    def is_exe(fpath):
+        return os.path.isfile(fpath) and os.access(fpath, os.X_OK)
+    for path in os.environ["PATH"].split(os.pathsep):
+        path = path.strip('"')
+        exe_file = os.path.join(path, program)
+        if is_exe(exe_file): return True
+    return False
+
+
+# Create a new file containing the same information as contained in INPUT_FILE,
+# except that the period, pattern and number of repeats for entries
+# with incorrect patterns are corrected. Also filters out entries with a corrected
+# period greater than MAX_PERIOD
+# USE FOR FILES CONTAINING LONG PERIODS
+def correct_pattern_errors(input_file, output_prefix, max_period=6):
+    data = open(input_file, "r")
+
+    fail_count = 0
+    skip_count = 0
+
+    current_chrom = None
+    output = None
+    for line in data:
+        if line.startswith("Sequence"):
+            current_chrom = line.split()[1].strip()
+            if output is not None: output.close()
+            output = open(output_prefix + ".%s"%current_chrom, "w")
+            continue
+
+        tokens = line.strip().split()
+
+        if len(tokens) != NUM_TOKENS:
+            skip_count = skip_count + 1
+            continue
+
+        pattern = tokens[PATTERN_INDEX]
+        subseq = has_subseq_long(pattern)
+        if subseq:
+            new_patt = subseq
+            tokens[NREPEAT_INDEX] = str(float(tokens[NREPEAT_INDEX])*len(pattern)/len(new_patt))
+            tokens[PATTERN_INDEX] = new_patt
+            tokens[PERIOD_INDEX] = str(len(new_patt))
+            tokens[CONS_SIZE_INDEX] = str(len(new_patt))
+            new_line = ' '.join(tokens) + '\n'
+
+            if len(new_patt) <= max_period:
+                output.write(new_line)
+            else:
+                fail_count += 1
+        else:
+            if int(tokens[CONS_SIZE_INDEX]) <= max_period:
+                output.write(line)
+            else:
+                fail_count += 1
+
+    data.close()
+    output.close()
+    PROGRESS("# REMOVED PATTERNS = " + str(fail_count))
+
+# Run the Z-algorithm on the provided sequence. Returns
+# an array of length len(NMER), where arr[i] corresponds
+# to the length of the sequence starting at position
+# i that is a prefix of NMER
+def zalgorithm(nmer):
+    res = len(nmer)*[0]
+    l = -1
+    r = -1
+
+    for i in xrange(1, len(nmer), 1):
+        if i > r:
+            j = i
+            while j < len(nmer) and nmer[j] == nmer[j-i]:
+                j = j + 1
+
+            res[i] = j-i
+            l = i
+            r = j-1
+        else:
+            i_prime = i-l
+            beta = r-i+1
+
+            if res[i_prime] < beta:
+                res[i] = res[i_prime]
+            elif res[i_prime] == beta:
+                j = r + 1
+                while j < len(nmer) and nmer[j] == nmer[j-i]:
+                    j = j + 1
+
+                res[i] = j-i
+                l = i
+                r = j-1
+            else:
+                res[i] = beta
+    res[0] = len(nmer)
+    return res
+
+# Returns a 2D array of length n, where
+# the ith index contains all nmers of length i
+def generate_nmers(max_n):
+    nmers = []
+    for i in xrange(max_n+1):
+        nmers.append([])
+
+    nmers[0].append('')
+    for i in xrange(1, max_n+1, 1):
+        for nmer in nmers[i-1]:
+            nmers[i].append(nmer+'A')
+            nmers[i].append(nmer+'C')
+            nmers[i].append(nmer+'G')
+            nmers[i].append(nmer+'T')
+    return nmers
+
+# Returns the substring from start to stop
+# (inclusive) of the provided sequence
+def get_string(sequence, start, stop):
+    if start > stop:
+        return sequence[start:]+sequence[:stop+1]
+    else:
+        return sequence[start:stop+1]
+
+
+# Returns the repeating subunit of the provided sequence if it
+# consists of exactly 2 or more copies. Otherwise, returns False
+# USE FOR LONG SEQUENCES
+def has_subseq_long(nmer):
+    prefix_lengths = zalgorithm(nmer)
+
+    for k in xrange(1, len(nmer), 1):
+        if len(nmer) % k == 0:
+            match = True
+
+            for segment in xrange(len(nmer)/k):
+                coord = segment*k
+
+                if prefix_lengths[coord] < k:
+                    match = False
+                    break
+
+            if match:
+                return nmer[0:k]
+    return False
+
+
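The Z-array above stores, for each position i, the length of the longest substring starting at i that is also a prefix of the whole string; has_subseq_long then accepts a candidate period k when k divides the length and every segment start has a prefix match of at least k. A minimal sketch of the expected behavior, assuming the two functions above are in scope (Python 2, like the rest of the script):

    print zalgorithm("ACACAC")        # [6, 0, 4, 0, 2, 0]
    print has_subseq_long("ACACAC")   # AC: k=2 divides 6 and matches at offsets 0, 2, 4
    print has_subseq_long("ACG")      # False: no smaller repeating subunit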
+# Returns the repeating subunit of the provided sequence if it consists
+# of exactly 2 or more exact copies. Otherwise, returns False
+# ONLY USE FOR SHORT SEQUENCES
+def has_subseq(nmer):
+    for k in xrange(1, len(nmer), 1):
+        if len(nmer) % k == 0:
+            seg_len = k
+
+            seq = get_string(nmer, 0, (-1 + seg_len)%len(nmer))
+            match = True
+
+            for segment in xrange(len(nmer)/k):
+                p1 = (segment*seg_len) % len(nmer)
+                p2 = (-1 + (segment+1)*seg_len) % len(nmer)
+
+                if get_string(nmer, p1, p2) != seq:
+                    match = False
+                    break
+
+            if match:
+                return seq
+    return False
+
+
+# Returns a dictionary that contains all kmers of length <= MAX_N
+# that consist of a repeating subunit as keys and their smallest subunits as values
+def create_subseq_dict(max_n):
+    nmers = generate_nmers(max_n)
+    subseqs = {}
+    for i in xrange(1, max_n+1, 1):
+        for nmer in nmers[i]:
+            subseq = has_subseq(nmer)
+            if subseq:
+                subseqs[nmer] = subseq
+    return subseqs
+
+# Create a new file containing the same information as contained in INPUT_FILE,
+# except that the period, pattern and number of repeats for entries
+# with incorrect patterns are corrected
+def process_file(input_file, output_file, subseqs):
+    data = open(input_file, "r")
+    output = open(output_file, "w")
+    fail_count = 0
+
+    for line in data:
+        tokens = line.strip().split()
+        pattern = tokens[PATTERN_INDEX]
+        if pattern in subseqs:
+            new_patt = subseqs[pattern]
+            tokens[NREPEAT_INDEX] = str(float(tokens[NREPEAT_INDEX])*len(pattern)/len(new_patt))
+            tokens[PATTERN_INDEX] = new_patt
+            tokens[PERIOD_INDEX] = str(len(new_patt))
+            tokens[CONS_SIZE_INDEX] = str(len(new_patt))
+            new_line = '\t'.join(tokens) + '\n'
+            output.write(new_line)
+            fail_count = fail_count + 1
+        else:
+            output.write(line)
+    data.close()
+    output.close()
+    PROGRESS("# PATTERN CORRECTIONS = " + str(fail_count))
+
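Both correct_pattern_errors and process_file rescale the copy number so that pattern length times repeat count stays constant when a redundant pattern is collapsed to its smallest subunit. A small sketch of that arithmetic on hypothetical TRF values:

    pattern, nrepeats = "ATAT", 5.5       # hypothetical TRF pattern and copy number
    new_patt = has_subseq(pattern)        # "AT", the smallest repeating subunit
    nrepeats = nrepeats*len(pattern)/len(new_patt)
    print new_patt, nrepeats              # AT 11.0 (22 bp of repeat either way)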
+# Create a new file containing the same information as contained in INPUT_FILE,
+# except that some overlapping TR intervals are removed to get rid of all overlaps.
+# Scanning from low to high coordinates on a chromosome, when an overlap occurs,
+# the region with the higher score is kept while the region with the lower
+# score is discarded and not written to the output file. If a discard file
+# is provided, the removed regions are written to the discard file.
+def resolve_overlaps(input_file, output_file, discard_file=None):
+    data = open(input_file, "r")
+    lines = []
+    starts = []
+    stops = []
+    scores = []
+    for line in data:
+        tokens = line.strip().split()
+        starts.append(int(tokens[START_INDEX]))
+        stops.append(int(tokens[STOP_INDEX]))
+        scores.append(int(tokens[SCORE_INDEX]))
+        lines.append(line)
+    data.close()
+
+    indices = sorted(range(len(starts)), key = lambda x: (starts[x], stops[x]))
+    max_stop = -1
+    index = -1
+    score = -1
+    num_rem = 0
+    inc_tr = len(starts)*[True]
+
+    for i in xrange(len(starts)):
+        if starts[indices[i]] <= max_stop:
+            num_rem = num_rem + 1
+
+            if scores[indices[i]] > score:
+                max_stop = stops[indices[i]]
+                score = scores[indices[i]]
+                inc_tr[index] = False
+                index = indices[i]
+            else:
+                inc_tr[indices[i]] = False
+        else:
+            max_stop = stops[indices[i]]
+            score = scores[indices[i]]
+            index = indices[i]
+
+    output = open(output_file, "w")
+    if discard_file is not None:
+        discard = open(discard_file, "w")
+
+    for i in xrange(len(starts)):
+        if inc_tr[i]:
+            output.write(lines[i])
+        elif discard_file is not None:
+            discard.write(lines[i])
+
+    output.close()
+    if discard_file is not None:
+        discard.close()
+
+    PROGRESS("# REMOVED TANDEM REPEATS = " + str(num_rem))
+    PROGRESS("TOTAL TANDEM REPEATS = " + str(len(starts)))
+
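resolve_overlaps makes a single left-to-right sweep: whenever a region's start falls at or before the running maximum stop, the lower-scoring of the contenders is dropped. A condensed sketch of the same idea on hypothetical (start, stop, score) triples; it simplifies the index bookkeeping above and is not a drop-in replacement:

    regions = [(10, 30, 40), (25, 50, 55), (60, 70, 20)]   # hypothetical triples
    kept = []
    for start, stop, score in sorted(regions):
        if kept and start <= kept[-1][1]:
            if score > kept[-1][2]:
                kept[-1] = (start, stop, score)   # higher score wins the overlap
        else:
            kept.append((start, stop, score))
    print kept   # [(25, 50, 55), (60, 70, 20)]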
+def parse_trf_file(input_file, store_vals=PARSE_NUM_TOKENS*[True]):
+    data = open(input_file, "r")
+
+    # start stop period num_repeats consensus_size percent_match percent_indel score a_frac c_frac g_frac t_frac entropy consensus sequence
+    operations = [int, int, int, float, int, int, int, int, int, int, int, int, float, str, str]
+    vals = [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
+
+    skip_count = 0
+    line_count = 0
+    for line in data:
+        tokens = line.strip().split()
+        line_count = line_count + 1
+
+        if len(tokens) != PARSE_NUM_TOKENS:
+            skip_count = skip_count + 1
+            continue
+
+        for i in xrange(PARSE_NUM_TOKENS):
+            if store_vals[i]:
+                vals[i].append(operations[i](tokens[i]))
+
+    return_vals = []
+    for i in xrange(PARSE_NUM_TOKENS):
+        if store_vals[i]:
+            return_vals.append(vals[i])
+
+    PROGRESS("Skipped " + str(skip_count)+ "/" + str(line_count) + " lines")
+    return return_vals
+
+
+def create_filtered_trf_bed_file(input_prefix, output_file):
+    max_period = 6
+    period_vals = [1, 2, 3, 4, 5, 6]
+    period_thresholds = [20, 22, 28, 28, 32, 34]
+
+    # Get input files and chromosomes, store in input_files and chromosomes
+    input_files = glob.glob(input_prefix + "*")
+    chromosomes = [os.path.basename(f).split(".")[-1] for f in input_files]
+
+    # Output fields and their associated indices in the input array
+    # start, end, period, refcopy, period, start, end, score, percA, percC, percG, percT, entropy, repseq
+    data_indices = [0, 1, 4, 3, 4, 0, 1, 7, 8, 9, 10, 11, 12, 13]
+
+    vals = [[], [], [], [], [], [], [], [], [], [], [], [], [], [], []]
+    chrs = []
+    for i in xrange(len(input_files)):
+        PROGRESS("Processing " + input_files[i])
+        file_vals = parse_trf_file(input_files[i])
+
+        for j in xrange(len(file_vals[0])):
+            if (file_vals[PARSE_PERIOD_INDEX][j] <= 6) and (file_vals[PARSE_SCORE_INDEX][j] > period_thresholds[file_vals[PARSE_PERIOD_INDEX][j]-1]):
+                for k in xrange(len(vals)):
+                    vals[k].append(file_vals[k][j])
+                chrs.append(chromosomes[i])
+
+    indices = range(len(chrs))
+    indices = sorted(indices, key = lambda x: (chrs[x], vals[PARSE_START_INDEX][x], vals[PARSE_STOP_INDEX][x]))
+
+    output = open(output_file, "w")
+    for i in xrange(len(indices)):
+        line = str(chrs[indices[i]])
+        for j in xrange(len(data_indices)):
+            line = line + "\t" + str(vals[data_indices[j]][indices[i]])
+        line = line + "\n"
+        output.write(line)
+    output.close()
+
+def analyze_overlaps(input_file, pass_file, fail_file):
+    input_fh = open(input_file, "r")
+    pass_fh = open(pass_file, "w")
+    fail_fh = open(fail_file, "w")
+
+    pass_count = 0
+    fail_count = 0
+
+    regions = []
+    min_start = 0
+    max_stop = -1
+    for line in input_fh:
+        tokens = line.strip().split("\t")
+        chrom = tokens[0]
+        start, stop, period = map(int, tokens[1:4])
+        num_repeats = float(tokens[4])
+        score = float(tokens[8])
+        if start <= max_stop:
+            max_stop = max(max_stop, stop)
+        else:
+            # Analyze the previous set of overlapping regions
+            region_size = max_stop - min_start + 1
+            max_score = 0
+            max_index = -1
+            cov_frac = 0.0
+            for index,region in enumerate(regions):
+                if region[4] >= max_score:
+                    max_index = index
+                    max_score = region[5]
+                    cov_frac = 1.0*(region[2]-region[1]+1)/region_size
+
+            if max_index != -1:
+                region = regions[max_index]
+                if cov_frac > 0.85:
+                    pass_fh.write("%s\t%s\t%d\t%d\t%.1f\n"%(region[0], region[1], region[2], region[3], region[4]))
+                    pass_count += (len(regions) > 1)
+                else:
+                    for region in regions:
+                        fail_fh.write("%s\t%s\t%d\t%d\t%.1f\n"%(region[0], region[1], region[2], region[3], region[4]))
+                    fail_count += (len(regions) > 1)
+            regions = []
+            min_start = start
+            max_stop = stop
+        regions.append((chrom, start, stop, period, num_repeats, score))
+
+    input_fh.close()
+    pass_fh.close()
+    fail_fh.close()
+    PROGRESS("Pass: %s, Fail: %s"%(pass_count, fail_count))
+
+def min_perm(seq):
+    min_perm = seq
+    for i in xrange(len(seq)):
+        other = seq[i:]+seq[0:i]
+        if other < min_perm:
+            min_perm = other
+    return min_perm
+
+def longest_perfect_array(allele, motif):
+    max_start = -1
+    max_length = -1
+    for i in xrange(len(allele)):
+        for offset in xrange(len(motif)):
+            seq_index = i
+            while seq_index < len(allele) and allele[seq_index] == motif[(offset+seq_index-i)%len(motif)]:
+                seq_index += 1
+            if seq_index - i > max_length:
+                max_start = i
+                max_length = seq_index - i
+    return max_length, max_start
+
+# Complements for each of the 4 DNA bases
+complement_table = {
+    'A':'T',
+    'C':'G',
+    'G':'C',
+    'T':'A',
+    }
+
+def complement(dna_sequence):
+    res = ""
+    for i in xrange(len(dna_sequence)):
+        res = res + complement_table[dna_sequence[i]]
+    return res
+
+def reverse_complement(dna_sequence):
+    return complement(dna_sequence[::-1])
+
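annotate_reference (further below) uses these helpers to canonicalize a motif as the lexicographically smallest rotation of the motif or of its reverse complement, and to measure the longest perfectly repeated stretch. Expected values for a few small inputs, assuming the helpers above:

    print min_perm("GAT")                                   # ATG (smallest of GAT, ATG, TGA)
    print min("ATG", min_perm(reverse_complement("ATG")))   # ATC (revcomp ATG -> CAT, rotated -> ATC)
    print longest_perfect_array("ACACATACAC", "AC")         # (5, 0): 5 bp of pure AC starting at 0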
+def calc_alignment_purity(sequence, motif):
+    # Traceback constants
+    LEFT = -1
+    DIAG = 0
+    UP = 1
+
+    # Scoring constants
+    MATCH = 0
+    MISMATCH = 1
+    INDEL = 1
+
+    # Create a "perfect" sequence consisting of copies of the motif
+    perfect_seq = (int)(len(sequence)*3.0/len(motif) + 1)*motif
+
+    # Create scoring and traceback matrices
+    score_matrix = []
+    trace_matrix = []
+    for i in xrange(len(sequence)+1):
+        score_matrix.append((len(perfect_seq)+1)*[None])
+        trace_matrix.append((len(perfect_seq)+1)*[None])
+
+    # Initialize matrices with no endspace penalty for the "perfect" sequence
+    score_matrix[0][0] = 0
+    trace_matrix[0][0] = None
+    for j in xrange(1, len(perfect_seq)+1):
+        score_matrix[0][j] = 0
+        trace_matrix[0][j] = LEFT
+    for i in xrange(1, len(sequence)+1):
+        score_matrix[i][0] = i*INDEL
+        trace_matrix[i][0] = UP
+
+    # Perform alignment
+    for i in xrange(1, len(sequence)+1):
+        for j in xrange(1, len(perfect_seq)+1):
+            v1 = score_matrix[i-1][j]+INDEL
+            v2 = score_matrix[i][j-1]+INDEL
+            v3 = score_matrix[i-1][j-1] + (MATCH if sequence[i-1] == perfect_seq[j-1] else MISMATCH)
+
+            if v1 < v2:
+                if v1 < v3:
+                    score_matrix[i][j] = v1
+                    trace_matrix[i][j] = UP
+                else:
+                    score_matrix[i][j] = v3
+                    trace_matrix[i][j] = DIAG
+            else:
+                if v2 < v3:
+                    score_matrix[i][j] = v2
+                    trace_matrix[i][j] = LEFT
+                else:
+                    score_matrix[i][j] = v3
+                    trace_matrix[i][j] = DIAG
+
+    # Find the optimal alignment that uses the entire provided sequence
+    min_score = score_matrix[len(sequence)][0]
+    min_index = 0
+    for j in xrange(len(perfect_seq)+1):
+        if score_matrix[len(sequence)][j] < min_score:
+            min_score = score_matrix[len(sequence)][j]
+            min_index = j
+
+    # Traceback the optimal alignment
+    pseq_alignment = ""
+    qseq_alignment = ""
+    i = len(sequence)
+    j = min_index
+    while i > 0:
+        if trace_matrix[i][j] == LEFT:
+            pseq_alignment += perfect_seq[j-1]
+            qseq_alignment += "-"
+            j -= 1
+        elif trace_matrix[i][j] == UP:
+            pseq_alignment += "-"
+            qseq_alignment += sequence[i-1]
+            i -= 1
+        elif trace_matrix[i][j] == DIAG:
+            pseq_alignment += perfect_seq[j-1]
+            qseq_alignment += sequence[i-1]
+            i -= 1
+            j -= 1
+        else:
+            ERROR("Invalid condition encountered in alignment traceback. Exiting...")
+    pseq_alignment = pseq_alignment[::-1]
+    qseq_alignment = qseq_alignment[::-1]
+    num_errors = min_score
+    purity_score = 1.0 - 1.0*num_errors/(len(sequence))
+    return purity_score, num_errors
+
+def annotate_reference(reffasta, input_file, output_file):
+    input_fh = open(input_file, "r")
+    output_fh = open(output_file, "w")
+    fasta = pyfasta.Fasta(reffasta)
+    keymap = {}
+    for k in fasta.keys():
+        keymap[k.split()[0]] = k
+
+    for line in input_fh:
+        tokens = line.strip().split()
+        chrom = tokens[0]
+        start, stop = map(int, tokens[1:3])
+        period = int(tokens[3])
+        seq = fasta[keymap[chrom]][start-1:stop].upper()
+
+        if stop - start > 200:
+            continue
+
+        motif_counts = collections.defaultdict(int)
+        for i in xrange(len(seq)-period):
+            motif_counts[min_perm(seq[i:i+period])] += 1
+        most_common_motif = sorted(motif_counts.items(), key = lambda x: x[1])[-1][0]
+        edit_dist = calc_alignment_purity(seq, most_common_motif)[1]
+        pure_len = longest_perfect_array(seq, most_common_motif)[0]
+        most_common_motif = min(most_common_motif, min_perm(reverse_complement(most_common_motif)))
+        output_fh.write(line.strip() + "\t" + str(most_common_motif) + "\t" + str(edit_dist) + "\t" + str(pure_len) + "\n")
+    input_fh.close()
+    output_fh.close()
+
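calc_alignment_purity aligns the sequence against a long perfect tandem array of the motif, with unit mismatch and indel costs and free end gaps on the perfect side, so num_errors is the minimum edit count and purity is 1 - errors/len(sequence). Two illustrative calls against the patch-1 signature (two return values):

    print calc_alignment_purity("ACACAC", "AC")   # (1.0, 0): a perfect array
    print calc_alignment_purity("ACACTC", "AC")   # (~0.833, 1): one mismatch in 6 bp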
+############################
+def main():
+    try:
+        FASTAFILE = sys.argv[1]
+        OUTFILE = os.path.abspath(sys.argv[2])
+    except:
+        print __doc__
+        sys.exit(1)
+
+    # Check for required programs
+    if not CheckProgram("trf"):
+        ERROR("Could not find trf installed on the PATH")
+    # Check input file exists
+    if not os.path.exists(FASTAFILE):
+        ERROR("%s does not exist"%FASTAFILE)
+    name = os.path.basename(FASTAFILE)
+
+    # Set up temporary directory
+    tmpdir = tempfile.mkdtemp(prefix="lobref.")
+    os.chdir(tmpdir)
+
+    # Run TRF
+    PROGRESS("Running TRF on %s"%FASTAFILE)
+    os.mkdir("source_results")
+    os.chdir("source_results")
+    trf_cmd = "trf %s %s %s %s %s %s %s %s -h -d"%(FASTAFILE, TRF_MATCH_WT, TRF_MISMATCH_PEN, TRF_INDEL_PEN, \
+                                                       TRF_P_MATCH, TRF_P_INDEL, TRF_MIN_SCORE, TRF_MAX_PERIOD)
+    if RunCommand(trf_cmd):
+        #ERROR("Error running %s"%trf_cmd)
+        PROGRESS("WARNING: trf exited with code 1")
+    mv_trf_cmd = "mv %s.%s.%s.%s.%s.%s.%s.%s.dat %s"%(name, TRF_MATCH_WT, TRF_MISMATCH_PEN, TRF_INDEL_PEN, \
+                                                          TRF_P_MATCH, TRF_P_INDEL, TRF_MIN_SCORE, TRF_MAX_PERIOD, name)
+    if RunCommand(mv_trf_cmd):
+        ERROR("Error running %s"%mv_trf_cmd)
+    os.chdir("..")
+
+    # Fix errors in TRF patterns and remove loci with P > 6
+    PROGRESS("Adjusting TRF patterns")
+    os.mkdir("fixed_results")
+    correct_pattern_errors(os.path.join(tmpdir, "source_results", name), \
+                               os.path.join(tmpdir, "fixed_results", name), \
+                               max_period=MAX_PERIOD)
+
+    # Filter regions by scoring threshold
+    PROGRESS("Filter regions by scoring threshold")
+    os.mkdir("filtered_results")
+    create_filtered_trf_bed_file(os.path.join(tmpdir, "fixed_results", name), \
+                                     os.path.join(tmpdir, "filtered_results", "all_chrs"))
+
+
+    # Split back up by chromosome
+    PROGRESS("Split back up by chromosome")
+    split_cmd = "awk '{print > \"%s/\"$1}' %s/all_chrs"%(os.path.join(tmpdir, "filtered_results"), \
+                                                             os.path.join(tmpdir, "filtered_results"))
+    if RunCommand(split_cmd):
+        ERROR("Error running %s"%split_cmd)
+
+    # Combine overlapping TRFs contingent upon merging criteria
+    PROGRESS("Combine overlapping TRFs")
+    os.mkdir("merged_results")
+    for chrfile in glob.glob(os.path.join(tmpdir, "filtered_results/*")):
+        if "all_chrs" in chrfile: continue
+        analyze_overlaps(chrfile, \
+                             os.path.join(tmpdir, "merged_results", os.path.basename(chrfile)+".pass"), \
+                             os.path.join(tmpdir, "merged_results", os.path.basename(chrfile)+".fail"))
+
+    # Create final reference
+    PROGRESS("Create final reference")
+    merge_cmd = "cat %s/*.pass | sort -k 1,1V -k 2,2n > %s"%(os.path.join(tmpdir, "merged_results"), \
+                                                                 os.path.join(tmpdir, "passing_str_reference.bed"))
+    if RunCommand(merge_cmd):
+        ERROR("Error running %s"%merge_cmd)
+
+    # Annotate with motif and number of imperfections
+    annotate_reference(FASTAFILE, os.path.join(tmpdir, "passing_str_reference.bed"), os.path.join(tmpdir, "ann_ref.bed"))
+
+    # Generate lobSTR table
+    # TODO make score
+    score = -1 # TODO
+    col_cmd = "cat %s | awk '{print $1 \"\t\" $2 \"\t\" $3 \"\t\" $4 \"\t\" $5 \"\tNA\tNA\tNA\t\" \"%s\" \"\tNA\tNA\tNA\tNA\tNA\t\" $6}' > %s"%(os.path.join(tmpdir, "ann_ref.bed"), score, OUTFILE)
+    if RunCommand(col_cmd):
+        ERROR("Error running %s"%col_cmd)
+
+if __name__ == "__main__":
+    main()
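main() assembles the TRF invocation from the module constants in TRF's positional order: match weight, mismatch penalty, indel penalty, match probability, indel probability, minimum score, maximum period. For a hypothetical input hg19.fa the command expands to:

    print "trf %s %s %s %s %s %s %s %s -h -d"%("hg19.fa", TRF_MATCH_WT,
        TRF_MISMATCH_PEN, TRF_INDEL_PEN, TRF_P_MATCH, TRF_P_INDEL,
        TRF_MIN_SCORE, TRF_MAX_PERIOD)
    # trf hg19.fa 2 7 7 80 10 5 500 -h -d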
From f7688800c9b6c96af81d5e65eb8fe63b0e27298c Mon Sep 17 00:00:00 2001
From: mgymrek
Date: Sun, 17 Jan 2016 10:35:40 -0500
Subject: [PATCH 2/4] editing script to generate lobstr ref bed

---
 scripts/GenerateLobSTRRefBed.py | 48 ++++++++++++++++++++++++++-------
 scripts/Makefile.am             |  3 ++-
 2 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/scripts/GenerateLobSTRRefBed.py b/scripts/GenerateLobSTRRefBed.py
index 1732909..6879d3e 100755
--- a/scripts/GenerateLobSTRRefBed.py
+++ b/scripts/GenerateLobSTRRefBed.py
@@ -3,6 +3,28 @@
 python GenerateLobSTRRefBed.py <reffasta> <outfile>
 """
 
+# Copyright (C) 2016 Melissa Gymrek Thomas Willems
+#
+# This file is part of lobSTR.
+#
+# lobSTR is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# lobSTR is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with lobSTR. If not, see <http://www.gnu.org/licenses/>.
+
+#
+# A validation suite to check concordance with gold standard
+# capillary electrophoresis calls.
+#
+
 import collections
 import glob
 import os
@@ -482,6 +504,9 @@ def reverse_complement(dna_sequence):
     return complement(dna_sequence[::-1])
 
 def calc_alignment_purity(sequence, motif):
+    """
+    returns purity_score, num_errors, trf_score
+    """
     # Traceback constants
     LEFT = -1
     DIAG = 0
     UP = 1
@@ -495,6 +520,9 @@ def calc_alignment_purity(sequence, motif):
     # Scoring constants
     MATCH = 0
     MISMATCH = 1
     INDEL = 1
 
     # Create a "perfect" sequence consisting of copies of the motif
     perfect_seq = (int)(len(sequence)*3.0/len(motif) + 1)*motif
 
+    # Init trf score
+    trf_score = 0
+
     # Create scoring and traceback matrices
     score_matrix = []
     trace_matrix = []
@@ -552,22 +580,27 @@ def calc_alignment_purity(sequence, motif):
             pseq_alignment += perfect_seq[j-1]
             qseq_alignment += "-"
             j -= 1
+            trf_score = trf_score - TRF_INDEL_PEN
         elif trace_matrix[i][j] == UP:
             pseq_alignment += "-"
             qseq_alignment += sequence[i-1]
             i -= 1
+            trf_score = trf_score - TRF_INDEL_PEN
         elif trace_matrix[i][j] == DIAG:
             pseq_alignment += perfect_seq[j-1]
             qseq_alignment += sequence[i-1]
             i -= 1
             j -= 1
+            if pseq_alignment[-1] == qseq_alignment[-1]:
+                trf_score = trf_score + TRF_MATCH_WT
+            else: trf_score = trf_score - TRF_MISMATCH_PEN
         else:
             ERROR("Invalid condition encountered in alignment traceback. Exiting...")
     pseq_alignment = pseq_alignment[::-1]
     qseq_alignment = qseq_alignment[::-1]
     num_errors = min_score
     purity_score = 1.0 - 1.0*num_errors/(len(sequence))
-    return purity_score, num_errors
+    return purity_score, num_errors, trf_score
 
 def annotate_reference(reffasta, input_file, output_file):
     input_fh = open(input_file, "r")
@@ -582,19 +615,15 @@ def annotate_reference(reffasta, input_file, output_file):
         chrom = tokens[0]
         start, stop = map(int, tokens[1:3])
         period = int(tokens[3])
-        seq = fasta[keymap[chrom]][start-1:stop].upper()
-
-        if stop - start > 200:
-            continue
-
+        seq = fasta[keymap[chrom]][start-1:stop].upper()
         motif_counts = collections.defaultdict(int)
         for i in xrange(len(seq)-period):
             motif_counts[min_perm(seq[i:i+period])] += 1
         most_common_motif = sorted(motif_counts.items(), key = lambda x: x[1])[-1][0]
-        edit_dist = calc_alignment_purity(seq, most_common_motif)[1]
+        trf_score = calc_alignment_purity(seq, most_common_motif)[2]
         pure_len = longest_perfect_array(seq, most_common_motif)[0]
         most_common_motif = min(most_common_motif, min_perm(reverse_complement(most_common_motif)))
-        output_fh.write(line.strip() + "\t" + str(most_common_motif) + "\t" + str(edit_dist) + "\t" + str(pure_len) + "\n")
+        output_fh.write(line.strip() + "\t" + str(most_common_motif) + "\t" + str(trf_score) + "\t" + str(pure_len) + "\n")
     input_fh.close()
     output_fh.close()
 
@@ -676,8 +705,7 @@ def main():
 
     # Generate lobSTR table
     # TODO make score
-    score = -1 # TODO
-    col_cmd = "cat %s | awk '{print $1 \"\t\" $2 \"\t\" $3 \"\t\" $4 \"\t\" $5 \"\tNA\tNA\tNA\t\" \"%s\" \"\tNA\tNA\tNA\tNA\tNA\t\" $6}' > %s"%(os.path.join(tmpdir, "ann_ref.bed"), score, OUTFILE)
+    col_cmd = "cat %s | awk '{print $1 \"\t\" $2 \"\t\" $3 \"\t\" $4 \"\t\" $5 \"\t\" $4 \"\t\" $2 \"\t\" $3 \"\t\" $7 \"\tNA\tNA\tNA\tNA\tNA\t\" $6 \"\t.\"}' > %s"%(os.path.join(tmpdir, "ann_ref.bed"), OUTFILE)
     if RunCommand(col_cmd):
         ERROR("Error running %s"%col_cmd)
 
diff --git a/scripts/Makefile.am b/scripts/Makefile.am
index 67489f1..d203990 100644
--- a/scripts/Makefile.am
+++ b/scripts/Makefile.am
@@ -31,7 +31,8 @@ script_SCRIPTS = \
 	allelotype_validation_suite.sh \
 	lobSTR_capillary_comparator.py \
 	lobSTR_vcf_to_tab.py \
-	lobSTR_filter_vcf.py
+	lobSTR_filter_vcf.py \
+	GenerateLobSTRRefBed.py
 
 EXTRA_DIST = $(script_SCRIPTS) \
 	$(bin_SCRIPTS)
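Patch 2 swaps the raw edit distance in the annotation column for a TRF-style score accumulated during traceback: each matched base adds TRF_MATCH_WT, each mismatch subtracts TRF_MISMATCH_PEN, and each gap subtracts TRF_INDEL_PEN. A sketch of that bookkeeping with the script's constants (2, 7, 7):

    def trf_like_score(matches, mismatches, indels):
        # mirrors the per-step accounting added to the traceback in this patch
        return matches*TRF_MATCH_WT - mismatches*TRF_MISMATCH_PEN - indels*TRF_INDEL_PEN
    print trf_like_score(6, 0, 0)   # 12: six matched bases
    print trf_like_score(5, 1, 0)   # 3: one mismatch costs 7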
From ebc039b843ea7bd67bec1df8537af5e6374f8a92 Mon Sep 17 00:00:00 2001
From: mgymrek
Date: Sun, 17 Jan 2016 11:07:24 -0500
Subject: [PATCH 3/4] adding option to use alt temp directory for making refd

---
 scripts/GenerateLobSTRRefBed.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/scripts/GenerateLobSTRRefBed.py b/scripts/GenerateLobSTRRefBed.py
index 6879d3e..f7c451e 100755
--- a/scripts/GenerateLobSTRRefBed.py
+++ b/scripts/GenerateLobSTRRefBed.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 """
-python GenerateLobSTRRefBed.py <reffasta> <outfile>
+python GenerateLobSTRRefBed.py <reffasta> <outfile> [tmpdir]
 """
 
 # Copyright (C) 2016 Melissa Gymrek Thomas Willems
@@ -635,6 +635,14 @@ def main():
     except:
         print __doc__
         sys.exit(1)
+    try:
+        ANDIR = sys.argv[3]
+    except:
+        ANDIR = os.getcwd()
+    if not os.path.exists(ANDIR):
+        ERROR("Tmp directory %s does not exist"%ANDIR)
+    else:
+        PROGRESS("Performing steps in %s"%ANDIR)
 
     # Check for required programs
     if not CheckProgram("trf"):
@@ -645,7 +653,7 @@ def main():
     name = os.path.basename(FASTAFILE)
 
     # Set up temporary directory
-    tmpdir = tempfile.mkdtemp(prefix="lobref.")
+    tmpdir = tempfile.mkdtemp(prefix="lobref.", dir=ANDIR)
     os.chdir(tmpdir)
 
     # Run TRF
@@ -704,10 +712,12 @@ def main():
     annotate_reference(FASTAFILE, os.path.join(tmpdir, "passing_str_reference.bed"), os.path.join(tmpdir, "ann_ref.bed"))
 
     # Generate lobSTR table
-    # TODO make score
     col_cmd = "cat %s | awk '{print $1 \"\t\" $2 \"\t\" $3 \"\t\" $4 \"\t\" $5 \"\t\" $4 \"\t\" $2 \"\t\" $3 \"\t\" $7 \"\tNA\tNA\tNA\tNA\tNA\t\" $6 \"\t.\"}' > %s"%(os.path.join(tmpdir, "ann_ref.bed"), OUTFILE)
     if RunCommand(col_cmd):
         ERROR("Error running %s"%col_cmd)
 
+    # Remove tmpdir
+    os.rmdir(tmpdir)
+
 if __name__ == "__main__":
     main()

From 1b9c764c06906880188a0ca732332aadf91e75f7 Mon Sep 17 00:00:00 2001
From: mgymrek
Date: Sat, 5 Mar 2016 20:44:56 -0500
Subject: [PATCH 4/4] editing update messages for generating ref

---
 scripts/GenerateLobSTRRefBed.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/GenerateLobSTRRefBed.py b/scripts/GenerateLobSTRRefBed.py
index f7c451e..0bca7e9 100755
--- a/scripts/GenerateLobSTRRefBed.py
+++ b/scripts/GenerateLobSTRRefBed.py
@@ -29,6 +29,7 @@
 import glob
 import os
 import pyfasta
+import shutil
 import sys
 import tempfile
 from subprocess import Popen, PIPE, STDOUT
@@ -72,6 +73,7 @@ def ERROR(msg):
     sys.exit(1)
 
 def RunCommand(cmd):
+    PROGRESS("Running: %s"%cmd)
     p = Popen(cmd, shell=True, stdin=PIPE, stdout=PIPE, \
                   stderr=STDOUT, close_fds=True)
     ex = p.wait()
@@ -717,7 +719,7 @@ def main():
         ERROR("Error running %s"%col_cmd)
 
     # Remove tmpdir
-    os.rmdir(tmpdir)
+    shutil.rmtree(tmpdir)
 
 if __name__ == "__main__":
     main()
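After patch 3, the closing awk step maps the 8-column annotated BED (chrom, start, stop, period, copy number, motif, TRF score, pure length) onto the 16-column lobSTR reference table: period, start, and stop are duplicated, the TRF score lands in column 9, the motif in column 15, and unused columns are NA or ".". The same mapping in Python, for a hypothetical annotated row:

    row = ["chr1", "10001", "10024", "2", "12.0", "AC", "24", "24"]   # hypothetical ann_ref.bed line
    out = [row[0], row[1], row[2], row[3], row[4], row[3], row[1], row[2],
           row[6], "NA", "NA", "NA", "NA", "NA", row[5], "."]
    print "\t".join(out)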