-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathcycles_file_to_fasta.py
executable file
·96 lines (77 loc) · 2.61 KB
/
cycles_file_to_fasta.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
import sys
import os
import argparse
from itertools import groupby
"""
Jens Luebeck
UC San Diego, Bioinformatics & Systems Biology
"""
lookup = str.maketrans("ACGTRYKM", "TGCAYRMK")
def rev_complement(seq):
return seq.translate(lookup)[::-1]
def fasta_reader(fasta_file, chroms_to_get, getAll = False):
fasta_dict = {}
print("Reading FASTA: {}".format(fasta_file))
with open(fasta_file, 'r') as infile:
faiter = (x[1] for x in groupby(infile, lambda line: line[0] == ">"))
for header in faiter:
# drop the ">"
seq_name = next(header)[1:].rstrip().rsplit()[0]
# join all sequence lines to one.
seq = "".join(s.rstrip() for s in next(faiter))
if seq_name in chroms_to_get or getAll:
fasta_dict[seq_name] = seq
return fasta_dict
def writeCycleFasta(cyclef, outpre):
with open(outpre + "_cycles.fasta",'w') as cycleFasta, open(cyclef) as infile:
for line in infile:
if "Cycle=" in line:
fields = line.rstrip().rsplit(";")
lineD = {x.rsplit("=")[0]: x.rsplit("=")[1] for x in fields}
segs = lineD["Segments"].rsplit(",")
recSeq = ""
for i in segs:
seg = i[:-1]
if seg != "0":
strand = i[-1]
segSeq = segSeqD[seg].upper()
if strand == "-":
segSeq = rev_complement(segSeq)
recSeq += segSeq
outname = "cycle_" + lineD["Cycle"]
cycleFasta.write(">" + outname + "\n")
cycleFasta.write(recSeq + "\n")
# get the relevant chromosomes the cycles file
def relChroms(cyclef):
chromSet = set()
with open(cyclef) as infile:
for line in infile:
if line.startswith("Segment"):
fields = line.rstrip().rsplit()
chromSet.add(fields[2])
return chromSet
def segsToSeq(cyclef,outpre):
segSeqD = {}
with open(cyclef) as infile, open(outpre + "_segments.fasta",'w') as segFasta:
for line in infile:
if line.startswith("Segment"):
fields = line.rstrip().rsplit()
lowerBound = int(fields[3])
upperBound = int(fields[4])
relSeq = seqD[fields[2]][lowerBound:upperBound+1]
segFasta.write(">" + "_".join(fields) + "\n")
segFasta.write(relSeq + "\n")
segNum = fields[1]
segSeqD[segNum] = relSeq
return segSeqD
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--cycles", help="AA-formatted cycles file", required=True)
parser.add_argument("-r", "--ref", help="path to reference genome", required=True)
parser.add_argument("-o", "--output", help="output prefix", required=True)
args = parser.parse_args()
relChromSet = relChroms(args.cycles)
seqD = fasta_reader(args.ref, relChromSet)
segSeqD = segsToSeq(args.cycles, args.output)
writeCycleFasta(args.cycles, args.output)