-
Notifications
You must be signed in to change notification settings - Fork 20
/
trim2bRAD_2barcodes_dedup_N2.pl
174 lines (157 loc) · 4.91 KB
/
trim2bRAD_2barcodes_dedup_N2.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/perl
$usage="
trim2bRAD_2barcodes_dedup_N2.pl :
version of trimmer/deduplicator script allowing for Ns instead of 'CC' in the upstream tag
This script does three things:
- Filters 2bRAD fastq reads to leave only those with a 100% matching restriction site,
degenerate 5'-leader, secondary 3'-barcode, and adaptor on the far 3'end,
trims away everything except the IIb-fragment itself;
- Deduplicates: removes all but one read sharing the same 64-fold degenerate
leader, the first 34 bases of the insert sequence, and secondary barcode
(this results in 128-fold dynamic range: 64-fold degeneracy x 2 strand orientations);
- Splits reads into separate files according to secondary barcode.
Writes trimmed fastq files named according to the secondary barcodes detected.
Arguments:
input=[fastq filename]
site=[perl-style pattern for restrictase site]
Default is BcgI: \'.{12}CGA.{6}TGC.{12}|.{12}GCA.{6}TCG.{12}\'
bc2=[perls-style barcode pattern] in-line barcode that immediately follows the RAD fragment,
default \'[ATGC]{4}\'
adaptor=[far-end adaptor sequence] default AGATC?
deduplicate=1 whether to remove PCR duplicates; set to 0 to keep them.
clip=[integer] number of bases to clip off the ends of the reads, default 0
minBCcount=[integer] minimum count per in-line barcode to output a separate file.
Default 100000.
sampleID=[integer] the position of name-deriving string in the file name
if separated by underscores, such as:
for input file Sample_RNA_2DVH_L002_R1.cat.fastq
specifying arg2 as \'3\' would create output
file with a name \'2DVH.trim'
Example:
trim2bRAD_2barcodes_dedup.pl input=Myproject_L8_G3.fastq sampleID=3
";
my $input="";
if ("@ARGV"=~/input=(\S+)/){ $input=$1; }
else { die $usage; }
open INP, $1 or die "cannot open input $1\n";
my $site='.{12}CGA.{6}TGC.{12}|.{12}GCA.{6}TCG.{12}';
if ("@ARGV"=~/site=(\S+)/){ $site=$1;}
my $bcod="[ATGC]{4}";
if ("@ARGV"=~/bc2=(\S+)/){ $bcod=$1;}
my $degen="[ATGC][ATGC][AG][AT][CN][CN]";
my $adap="AGATC?";
if ("@ARGV"=~/adaptor=(\S+)/){ $adap=$1;}
my $clip=0;
if ("@ARGV"=~/clip=(\d+)/){ $clip=$1;}
my $sampleid=100;
if("@ARGV"=~/sampleID=(\d+)/){ $sampleid=$1;}
my $deduplicate=1;
if ("@ARGV"=~/deduplicate=0/){ $deduplicate=0;}
my $trim=0;
my $name="";
my $name2="";
my $seq="";
my $qua="";
my $minBCcount=100000;
if ("@ARGV"=~/minBCcount=(\d+)/){$minBCcount=$1;}
my $ll=3;
my %data={};
my %dups={};
my $counter=0;
my %dupnum={};
my $chk;
my $bcc;
my $read;
while (<INP>) {
chomp;
if ($ll==3 && $_=~/^(\@.+)$/) {
$name2=$1;
$counter++;
#print "$seq:";
if ($seq=~/^($degen)($site)($bcod)$adap/ and $seq!~/N/) {
$read=$2;
if ($deduplicate==0) { $read=$1.$2;}
$bcc=$3;
my $chk1=substr($read,0,34);
$chk=$1.$chk1.$bcc;
#print "$1:$2:$3|check:$chk\n";
if ($dups{$chk} and $deduplicate) {
$dupnum{$bcc}++;
#print "\t\tdup found: $chk\n";
}
else {
$dups{$chk}=1;
my $rd=substr($read,$clip,length($read)-$clip*2);
$qua=substr($qua,$clip,length($read)-$clip*2);
$dline="$name bcd=$bcc\n$rd\n+\n$qua\n";
push @{$data{$bcc}}, $dline ;
}
#print "$dline\n";
}
else {
#print "\n\nNOT FINDING $site:$bcod:$adap\n\n";
}
#print "-----------\n$counter\n$_\n";
$seq="";
$ll=0;
$qua="";
@sites=();
$name=$name2;
}
elsif ($ll==0){
chomp;
$seq=$_;
$ll=1;
}
elsif ($ll==2) {
chomp;
$qua=substr($_,3,length($_));
$ll=3;
}
else { $ll=2;}
}
if ($seq=~/^($degen)($site)($bcod)$adap/ and $seq!~/N/) {
$read=$2;
$bcc=$3;
my $chk1=substr($read,0,34);
$chk=$1.$chk1.$bcc;
#print "$1:$2:$3|check:$chk\n";
if ($dups{$chk}) {
$dupnum{$bcc}++;
#print "\t\tdup found: $chk\n";
}
else {
$dups{$chk}=1;
my $rd=substr($read,$clip,length($read)-$clip*2);
$qua=substr($qua,$clip,length($read)-$clip*2);
$dline="$name bcd=$bcc\n$rd\n+\n$qua\n";
push @{$data{$bcc}}, $dline ;
}
#print "$dline\n";
}
else {
#print "\n\nNOT FINDING $site:$bcod:$adap\n\n";
}
my $outname;
if ($sampleid<100) {
my @parts=split('_',$input);
$outname=$parts[$sampleid-1];
}
else {
$input=~s/\..+//;
$outname=$input;
}
my $totgood=0;
my $totdups=0;
foreach my $bc2 (sort keys %data) {
next if ($bc2=~/HASH/);
next if ($#{$data{$bc2}}<$minBCcount);
print $outname,"_","$bc2: goods: ",$#{$data{$bc2}}+1," ; dups: $dupnum{$bc2}\n";
$totgood=$totgood+$#{$data{$bc2}}+1;
$totdups=$totdups+$dupnum{$bc2};
my $outname1=$outname."_".$bc2.".tr0";
open OUT, ">$outname1" or die "cannot create $outname1\n";
foreach $d (@{$data{$bc2}}) { print {OUT} $d; }
close OUT;
}
print "\n$outname: total goods : $totgood ; total dups : $totdups\n";