-
Notifications
You must be signed in to change notification settings - Fork 0
/
Seq_grabber4cluster.R
88 lines (62 loc) · 2.45 KB
/
Seq_grabber4cluster.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
###########################################################################################
##########__Austin Marshall_PhD_student_Dept_of_Biology_Clarkson_University__##############
###########################################################################################
### This script is used to read in the bam file after samtools conversion from the sam file produced by Emu minimap2 step
### We load in the bam file and using a modified script from https://gist.github.com/davetang/6460320 convert to dataframe
### We then use the Emu database reference number to find the species hit of interest and select only those sequences to
### provide a better clustering with NGSpeciesID (less noise allows for higher accuracy)
library(Rsamtools)
library(dplyr)
library(data.table)
library(Biostrings)
library(tgutil)
#read in entire BAM file
bam <- scanBam("S1_emu_aln_sorted.bam")
#names of the BAM fields
names(bam[[1]])
# [1] "qname" "flag" "rname" "strand" "pos" "qwidth" "mapq" "cigar"
# [9] "mrnm" "mpos" "isize" "seq" "qual"
#distribution of BAM flags
table(bam[[1]]$flag)
# 0 4 16
#1472261 775200 1652949
#function for collapsing the list of lists into a single list
#as per the Rsamtools vignette
.unlist <- function (x){
## do.call(c, ...) coerces factor to integer, which is undesired
x1 <- x[[1L]]
if (is.factor(x1)){
structure(unlist(x), class = "factor", levels = levels(x1))
} else {
do.call(c, x)
}
}
#store names of BAM fields
bam_field <- names(bam[[1]])
#go through each BAM field and unlist
list <- lapply(bam_field, function(y) .unlist(lapply(bam, "[[", y)))
#store as data frame
bam_df <- do.call("DataFrame", list)
names(bam_df) <- bam_field
dim(bam_df)
#[1] 3900410 13
S1 = data.frame(bam_df)
filt_S1 = S1 %>%
select(qname, rname, seq, qual)%>%
na_if("") %>%
na.omit()
main_ral <- filt_S1[filt_S1$rname %like% "305:emu_db:16544", ]
colnames(main_ral)[1] = "id"
almost_fastq = main_ral %>%
select(id, seq, qual)
shiz = gsub("^", "@", as.character(almost_fastq$id))
shiz2 = shiz %>%
as.data.frame()
colnames(shiz2)[1] = "id_good"
soclose = cbind(shiz2, almost_fastq) %>%
select(id_good, seq, qual)
colnames(soclose)[1] = "id"
### Need to add the @ symbol to beginning of the read ids before writing
### This final writing of the fastq is what will be loaded into NGSpeciesID
write_fastq(soclose, "S1_main_ralstonia.fastq")
#################################################################