-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathbuild_alignment_sam.py
103 lines (88 loc) · 5.31 KB
/
build_alignment_sam.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# build_alignment_sam V 1.0
# Copyright (C) 2019 Ryan C. Shean
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# this script will build alignment .sam files of all species level tax assignments with above MIN_READ_CUTOFF reads going to that species
# This can only be run after the whole pipeline has finished, including tie-breaking.
from Bio import Entrez
from ete3 import NCBITaxa
ncbi = NCBITaxa()
import argparse
import glob
Entrez.email = '[email protected]'
import subprocess
# minimum number of reads that must be assigned to a species in order to assemble
MIN_READ_CUTOFF = 10
# number of chunks that you've split NT into (must be in the same format as your snap indexes
DB_LIST = ['00','01','02','03','04','05','06','07','08','09','10','11','12','12','13']
# This number is how many nodes upwards on the tree should we grab a list of reads to align to the species level reference
# Right now it's set to 2 so if we have a species assignment we'll pull all reads that were assigned to the family that the species belongs to
ASSEMBLY_NODE_OFFSET = -2
# traverse through every report file (for every sample)
for file_name in glob.glob('*report.tsv'):
base = file_name.split('_')[0]
taxid_to_assemble = []
# go through every line in the report file and pull all species level assignments with above MIN_READ_CUTOFF reads
for line in open(file_name):
line_list = line.split('\t')
if line_list[3] == 'S' and int(line_list[2]) >= MIN_READ_CUTOFF:
lineage = ncbi.get_lineage(line_list[4])
# No eukaryotic assembly and no 'uncultured bacterium' assembly
if 2759 not in lineage and line_list[4] != '77133':
taxid_to_assemble.append(line_list[4])
# for every taxid we assemble a list of reads that are at the given species rank or lower
for taxid in taxid_to_assemble:
taxid_search_list = [str(taxid)]
taxid_search_list = taxid_search_list + ncbi.get_descendant_taxa(taxid, get_descendant_taxa=True)
list_of_reads_to_pull = []
for a_line in open(base + '_assignments.txt'):
a_line_list = a_line.split('\t')
if a_line_list[1] in taxid_search_list:
list_of_reads_to_pull.append(a_line_list[0])
acc_num_list = []
# go through all the sam files and grab the accession numbers that the reads we got in the last loop aligned to
for db_num in db_list:
for sam_line in open(base + '.hf.trimmed.fastq_' + db_num + '.sam'):
sam_line_list = sam_line.split('\t')
# only grab accession numbers that have 'complete genome' in the name to avoid assembling to partial segments
if sam_line_list[0] in list_of_reads_to_pull and 'complete_genome' in sam_line_list[2]:
acc_num_list.append(sam_line_list[2].split('.')[0])
# if we couldn't find any accession numbers going to these reads with 'complete genome' just refuse to build an assembly for this taxid
if len(acc_num_list) == 0:
print('No complete genome reference found, not assembling taxid ' + taxid)
# this break just pops us out of this taxid and moves to the next one for the current sample
break
# get the most common accession number for this taxid (in the event of ties this chooses completely randomly )
most_common_acc_num = max(set(acc_num_list), key=acc_num_list.count)
taxid_lineage = ncbi.get_lineage(taxid)
# Change this number to align reads higher up the tree
taxid_to_pull = taxid_lineage[-2]
# pull all reads from the sample that are assigned at or below the taxonomic level we designated in the above line
subprocess.call('python pull_reads.py ' + base + '_assignments.txt ' + str(taxid_to_pull) + ' --r', shell=True)
print('Searching NCBI for Accession number:' + most_common_acc_num + ' for taxid ' + str(taxid_to_pull))
record = Entrez.read(Entrez.esearch(db='nucleotide', term=most_common_acc_num))
try:
h2 = Entrez.efetch(db='nucleotide', id=record['IdList'][0], rettype='fasta', retmode='text')
except:
print(str(taxid) + ' did not return hits - not assembling')
break
# procedurally generate name for the reference fasta and the bowtie database we'll create
ref_fasta = base + '_' + str(taxid) + '_ref.fasta'
ref_db = base + '_' + str(taxid) + '_db'
g = open(ref_fasta, 'w')
g.write(h2.read())
g.close()
print('building bowtie2 index')
subprocess.call('bowtie2-build ' + ref_fasta + ' ' + ref_db + ' > /dev/null 2>&1 ', shell=True)
print('done with index build aligning...')
subprocess.call('bowtie2 -x ' + ref_db + ' -@ 42 -f -U ' + base + '_assignments' + str(taxid_to_pull) + '.fasta --no-unal > ' + base + '_' + str(taxid) + '.sam', shell=True)
# delete bowtie index, I like to keep the refernce fasta just so it's easy to tell what this code pulled
subprocess.call('rm *_db*.bt2', shell = True)