forked from GenomicsCoreLeuven/GBSX
-
Notifications
You must be signed in to change notification settings - Fork 0
/
GBSX_digest_v1.0.pl
380 lines (319 loc) · 20.8 KB
/
GBSX_digest_v1.0.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
#!/usr/bin/perl
####################################################################################################################
# This is GBSX v1.0. A toolkit for experimental design and demultiplexing genotyping by sequencing experiments. #
# #
# Copyright (C) 2014 KU Leuven #
# #
# This file is part of GBSX. #
# #
# GBSX is free software: you can redistribute it and/or modify #
# it under the terms of the GNU General Public License as published by #
# the Free Software Foundation, either version 3 of the License, or #
# (at your option) any later version. #
# #
# GBSX is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
# GNU General Public License for more details. #
# #
# You should have received a copy of the GNU General Public License #
# along with GBSX. If not, see <http://www.gnu.org/licenses/>. #
####################################################################################################################
########################################################################################################################################################################
use Getopt::Std;
use Bio::SeqIO;
use Data::Dumper;
use Bio::PrimarySeq;
use Bio::DB::Fasta;
use Bio::Restriction::EnzymeCollection;
use Bio::Restriction::Analysis;
use Bio::Restriction::Enzyme;
########################################################################################################################################################################
my $program_information = "GBSX v1.0";
my $license = "GBSX v1.0 Copyright (C) 2014 KU Leuven\nThis program comes with ABSOLUTELY NO WARRANTY; for details type `perl GBSX_digest_v1.0.pl -w'.\nThis is free software, and you are welcome to redistribute it under certain conditions; type `perl GBSX_digest_v1.0.pl -c' for details.\n";
$Getopt::Std::STANDARD_HELP_VERSION = 1;
my $help_message="\n$license\n####################################################################################\nusage: perl GBSX_digest_v1.0.pl -d \'digest sequence\' -l \'read length\' -f \'file of reference fasta file location(s)\'\n\noptional parameters: \n\t-e \'enzyme name to use\' (default: Enzyme)\n\t-g \'genome name to use in bed file name\' (default: genome)\n\t-n \'minimum size fragments to include\' (default: 100)\n\t-m \'maximum size fragments to use\' (default: 1000)\n\n\t-E \'second enzyme name to use\' (default: Enzyme2)\n\t-D \'digest sequence for a second enzyme\' (default: not declared)\n\t-R \'digest sequence for a third enzyme\' (default: not declared)\n";
########################################################################################################################################################################
########################################################################################################################################################################
#input parameters
sub HELP_MESSAGE {die "$help_message\n";}
sub VERSION_MESSAGE {print "$program_information\n";}
my %opts = ();
getopts('e:d:g:l:f:m:n:E:D:R:cwh');
#die with help message if -h
if ($opt_h){ die "$help_message\n"; }
#die with warrenty message if -w
if ($opt_w){ die "GBSX is distributed in the hope that it will be useful,\nbut WITHOUT ANY WARRANTY; without even the implied warranty of\nMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the\nGNU General Public License for more details.\n"; }
#die with conditions message if -c
if ($opt_c){ die "GBSX is free software: you can redistribute it and/or modify\nit under the terms of the GNU General Public License as published by\nthe Free Software Foundation, either version 3 of the License, or\n(at your option) any later version.\n"; }
#enzyme name (optional)
my $enzyme_name = $opt_e || "Enzyme";
#digest sequence: example "RCATG^Y";
my $digest_seq = $opt_d || "not_defined"; if ($digest_seq ne "not_defined"){$digest_seq=uc($digest_seq);}
#genome name to use for bedfile (optional)
my $bed_genome_name = $opt_g || "genome";
#define read length: assumes PE, only report back in bed first and last number of bps corresponding to this
my $read_length = $opt_l || "not_defined";
#file of reference fasta locations
my $fasta_file_locations = $opt_f || "not_defined";
#max/min fragment sizes (optional, otherwise use defaults)
my $max_size = $opt_m || 1000;
my $min_size = $opt_n || 100;
#enzyme2 name (optional, otherwise if second digest just default to Enzyme2)
my $enzyme2_name = $opt_E || "Enzyme2";
#make sure starts Upper case
$enzyme2_name = ucfirst($enzyme2_name);
#double check this is not the same as Enzyme1
if ($enzyme2_name eq $enzyme_name){die "Please use different enzyme names for enzyme 1 and enzyme 2\n"; }
#digest sequence for a second enzyme: example "RCATG^Y"; (optional, otherwise no second digest)
my $digest2_seq = $opt_D || "not_defined"; if ($digest2_seq ne "not_defined"){$digest2_seq=uc($digest2_seq);}
#double check this is not the same as Enzyme1
if ($digest2_seq eq $digest_seq){die "Please use different enzyme cut sequences for enzyme 1 and enzyme 2\n"; }
#digest sequence for a third enzyme: example "RCATG^Y"; (optional, otherwise no third digest)
my $digest3_seq = $opt_R || "not_defined";
if ($digest3_seq ne "not_defined"){$digest3_seq=uc($digest3_seq);
#double check this is not the same as Enzyme1 or Enzyme3
if ( ($digest3_seq eq $digest2_seq) || ($digest3_seq eq $digest_seq) ){die "Please use different enzyme cut sequences for enzyme 1, enzyme 2, and enzyme 3\n"; }
#we call this Enzyme3, so make sure nothing else uses this name
if ( ($enzyme2_name eq "Enzyme3") || ($enzyme_name eq "Enzyme3") ){ die "Enzyme3 is a reseved name, please use a different enzyme name\n"; }
}
#die if required parameters are missing
if ( ($digest_seq eq "not_defined") || ($read_length eq "not_defined") || ($fasta_file_locations eq "not_defined") ){
my $missing=" ";
if ($digest_seq eq "not_defined"){$missing = $missing."-d (digest sequence) ";}
if ($read_length eq "not_defined"){$missing = $missing."-l (read length) ";}
if ($fasta_file_locations eq "not_defined"){$missing = $missing."-f (reference fasta locations) ";}
die "missing parameter(s):$missing\n";
}
########################################################################################################################################################################
########################################################################################################################################################################
#first basic steps to get ready for the analysis (mostly defining variables)
#using input information, define outfiles:
my $outfile; my $main_bed;
if ($digest2_seq ne "not_defined"){
$outfile = $bed_genome_name.".".$enzyme_name.".".$enzyme2_name.".".$read_length."nt.digest_results";
$main_bed = $bed_genome_name.".".$enzyme_name.".".$enzyme2_name.".".$read_length."nt.digest.bed";
}else{
$outfile = $bed_genome_name.".".$enzyme_name.".".$read_length."nt.digest_results";
$main_bed = $bed_genome_name.".".$enzyme_name.".".$read_length."nt.digest.bed";
}
#get reference fasta files
my @ref_chr;
open (ref_files,$fasta_file_locations)||die "could not open file $fasta_file_locations\n";
while(<ref_files>){
chop;
$_=~s/\n//;$_=~s/\r//;
if ($_ ne ""){
push(@ref_chr,$_);
#current script only works with one sequence per file, add a check for this
my $grep_results = `grep \"^\>\" $_`;
my @grep_array = split(/\n/,$grep_results); my $number_sequences_in_file = @grep_array;
if ($number_sequences_in_file > 1){die "please use only one sequence per fasta file, $_ had $number_sequences_in_file sequences\n";}
}
}
close(ref_files)||die "could not close file $fasta_file_locations\n";
#define enzyme to use
my $rest_enzyme = new Bio::Restriction::Enzyme(-enzyme => $enzyme_name, -seq => $digest_seq);
#optional: define a second enzyme
my $rest_enzyme2; if ($digest2_seq ne "not_defined"){ $rest_enzyme2 = new Bio::Restriction::Enzyme(-enzyme => $enzyme2_name, -seq => $digest2_seq); }
#optional: define a third enzyme
my $rest_enzyme3; if ($digest3_seq ne "not_defined"){ $rest_enzyme3 = new Bio::Restriction::Enzyme(-enzyme => "Enzyme3", -seq => $digest3_seq); }
#define ouput count variables
my $total_cuts=0; my $total_fragments=0; my $total_fragments_between_min_and_max = 0;
my $bin_size=100;#hard coded for now
my $temp_size=$min_size;
until (($temp_size+$bin_size)>$max_size){
my $temp_size_plus_bin = $temp_size+$bin_size;
my $variable_name = "total_fragments_between_".$temp_size."_and_".$temp_size_plus_bin;
$$variable_name=0;
$temp_size=$temp_size+$bin_size;
}
my $total_fragments_less_min = 0;
my %fragments_in_range_per_chr; my %fragments_in_range_per_chr_enzyme1_end; my %fragments_in_range_per_chr_enzyme2_end; my %fragments_in_range_per_chr_enzymeboth_end;
#define ouput count variables, optional for a second enzyme
my $total_cuts_enzyme2=0; my $fragments_both_ends_enzyme1 = 0; my $fragments_both_ends_enzyme2 = 0; my $fragments_end_from_each_enzyme = 0;
#define ouput count variables, optional for a third enzyme
my $fragments_in_range_contain_third_enzyme = 0;
#define ouput count variables, distribution of distances
my $one_kb = 0; my $ten_kb = 0; my $hundred_kb=0; my $one_Mb = 0; my $ten_Mb = 0; my $more_ten_kb=0;
########################################################################################################################################################################
########################################################################################################################################################################
#per chromosome, perform digest and add to totals
#open bed file to print to
open(outbed,">$main_bed")||die "could not open out bedfile $main_bed\n";
foreach my $chr (@ref_chr){
#just get chr name and remove directory structure
my @chr_name_parts =split(/\//,$chr);my $number_parts=@chr_name_parts;
my $chr_name=@chr_name_parts[$number_parts-1];
$chr_name=~s/.fa$//i;$chr_name=~s/.fasta$//i;
$fragments_in_range_per_chr{$chr_name}=0;
$fragments_in_range_per_chr_enzyme1_end{$chr_name}=0;
$fragments_in_range_per_chr_enzyme2_end{$chr_name}=0;
$fragments_in_range_per_chr_enzymeboth_end{$chr_name}=0;
#get fasta file for this chr
my $seqio_object1 = Bio::SeqIO->new(-file => $chr, -format => "fasta");
my $seq = $seqio_object1->next_seq;
my $fastadb = Bio::DB::Fasta->new($chr);
#digest
my $rest_analysis=Bio::Restriction::Analysis->new(-seq=>$seq, -enzymes=>$rest_enzyme);
#just get cut numbers first
my $chr_cuts=$rest_analysis->cuts_by_enzyme($enzyme_name);
$total_cuts=$total_cuts+$chr_cuts;
#optional digest and cut numbers for a second enzyme
my $rest_analysis2;my $chr_cuts2;
if ($digest2_seq ne "not_defined"){
$rest_analysis2=Bio::Restriction::Analysis->new(-seq=>$seq, -enzymes=>$rest_enzyme2);
$chr_cuts2=$rest_analysis2->cuts_by_enzyme($enzyme2_name);
$total_cuts_enzyme2=$total_cuts_enzyme2+$chr_cuts2;
}
#get locations of cuts
my @locations1 = $rest_analysis->positions($enzyme_name);
#cut locations if second enzyme
my %hash_locations1; my %hash_locations2;#use position as keys
my @locations2;my @all_locations;
if ($digest2_seq ne "not_defined"){
@locations2=$rest_analysis2->positions($enzyme2_name);
@all_locations = (@locations1,@locations2);
@all_locations=sort {$a <=> $b} @all_locations;
foreach my $position1 (@locations1){ $hash_locations1{$position1}=1; }
foreach my $position2 (@locations2){ $hash_locations2{$position2}=1; }
}else{
@all_locations=@locations1;
}
#go through all cut locations and check fragment sizes
my @chr_with_directories = split(/\//,$chr); $chr_name=pop @chr_with_directories; $chr_name=~s/.fa//;
my $previous_location=0; my $previous_fragment_in_range_end=0;
#to analyze every fragment, add the end of the chromosome as a position
my $chr_end_position = $seq->length;
push(@all_locations,$chr_end_position);
foreach my $location (@all_locations){
$total_fragments++;
if ($location<$previous_location){die "error in order, $location < $previous_location";}
my $fragment_length = $location-$previous_location;
if (($fragment_length<=$max_size)&&($fragment_length>=$min_size)){
$total_fragments_between_min_and_max++;
$fragments_in_range_per_chr{$chr_name}=$fragments_in_range_per_chr{$chr_name}+1;
#for distribution of distance between sequenced fragments
if ($previous_fragment_in_range_end!=0){
my $distance_this_fragment_to_previous=$previous_location-$previous_fragment_in_range_end;
if ( $distance_this_fragment_to_previous<=1000 ){
$one_kb++;
}elsif( ($distance_this_fragment_to_previous>1000) && ($distance_this_fragment_to_previous<=10000)){
$ten_kb++;
}elsif( ($distance_this_fragment_to_previous>10000) && ($distance_this_fragment_to_previous<=100000)){
$hundred_kb++;
}elsif( ($distance_this_fragment_to_previous>100000) && ($distance_this_fragment_to_previous<=1000000)){
$one_Mb++;
}elsif( ($distance_this_fragment_to_previous>1000000) && ($distance_this_fragment_to_previous<=10000000)){
$ten_Mb++;
}elsif( ($distance_this_fragment_to_previous>10000000) ){
$more_ten_kb++;
}else{
die "error in distances\n";
}
}
$previous_fragment_in_range_end=$location;
#if second enzyme, are ends from one or both enzymes
if ($digest2_seq ne "not_defined"){
if ( (exists($hash_locations1{$previous_location})) && (exists($hash_locations1{$location})) ){
$fragments_both_ends_enzyme1++; #both ends from enzyme1
$fragments_in_range_per_chr_enzyme1_end{$chr_name}=$fragments_in_range_per_chr_enzyme1_end{$chr_name}+1;
}elsif ( (exists($hash_locations2{$previous_location})) && (exists($hash_locations2{$location})) ){
$fragments_both_ends_enzyme2++; #both ends from enzyme2
$fragments_in_range_per_chr_enzyme2_end{$chr_name}=$fragments_in_range_per_chr_enzyme2_end{$chr_name}+1;
}elsif ( (exists($hash_locations1{$previous_location})) && (exists($hash_locations2{$location})) ){
$fragments_end_from_each_enzyme++; #one end from each enzyme
$fragments_in_range_per_chr_enzymeboth_end{$chr_name}=$fragments_in_range_per_chr_enzymeboth_end{$chr_name}+1;
}elsif ( (exists($hash_locations2{$previous_location})) && (exists($hash_locations1{$location})) ){
$fragments_end_from_each_enzyme++; #one end from each enzyme
$fragments_in_range_per_chr_enzymeboth_end{$chr_name}=$fragments_in_range_per_chr_enzymeboth_end{$chr_name}+1;
}
}
#counts on bin sizes
my $temp_size=$min_size;
until (($temp_size+$bin_size)>$max_size){
my $temp_size_plus_bin = $temp_size+$bin_size;
if ( ($fragment_length>=$temp_size)&&($fragment_length<$temp_size_plus_bin) ){
my $variable_name = "total_fragments_between_".$temp_size."_and_".$temp_size_plus_bin;
$$variable_name=$$variable_name+1;
}
$temp_size=$temp_size+$bin_size;
}
#print to bed the potentially sequenced bases
if ($fragment_length <= (2*$read_length)){
print outbed "$chr_name\t$previous_location\t$location\n";
}else{
#print first read
$previous_location_plus=$previous_location+$read_length;
print outbed "$chr_name\t$previous_location\t$previous_location_plus\n";
#print second read
$location_minus=$location-$read_length;
print outbed "$chr_name\t$location_minus\t$location\n";
}
#if third enzyme requested, check if there is a digest site in this fragment
if ($digest3_seq ne "not_defined"){
my $fragment_seq = $fastadb->seq($chr_name, ($previous_location+1) => ($location));
my $frag_seq_obj = Bio::Seq->new( -seq => $fragment_seq );
my $rest_analysis3=Bio::Restriction::Analysis->new(-seq=>$frag_seq_obj, -enzymes=>$rest_enzyme3);
my $frag_cuts3=$rest_analysis3->cuts_by_enzyme("Enzyme3");
if ($frag_cuts3>0){ $fragments_in_range_contain_third_enzyme++ }
}
}elsif ($fragment_length<$min_size){
$total_fragments_less_min++;
}
$previous_location=$location;
}
}
#close bedfile
close(outbed)||die "could not open out bedfile $main_bed\n";
########################################################################################################################################################################
########################################################################################################################################################################
#print to output file general digest info
open(out,">$outfile")||die "could not open outfile $outfile\n";
#print input parameters
print out "input parameters:\n\tenzyme name used: $enzyme_name\n\tenzyme cut sequence: $digest_seq\n";
if ($digest2_seq ne "not_defined"){ print out "\tsecond enzyme name used: $enzyme2_name\n\tsecond enzyme cut sequence: $digest2_seq\n"; }
if ($digest3_seq ne "not_defined"){ print out "\tthird enzyme cut sequence: $digest3_seq\n"; }
print out "\tfor bedfile, genome name used: $bed_genome_name\n\tread length provided: $read_length\nminimum fragment size: $min_size\nmaximum fragment size: $max_size\n\nreference sequence(s) analyzed:\n";
foreach my $ref_to_print_to_output (@ref_chr){ print out "\t$ref_to_print_to_output\n"; }
print out "\n";
#print number of cuts
print out "total cuts for enzyme ($enzyme_name): $total_cuts\n";
if ($digest2_seq ne "not_defined"){ print out "total cuts for second enzyme ($enzyme2_name): $total_cuts_enzyme2\n"; }
#print number of fragments
print out "\nlooked at $total_fragments fragments in total\n";
print out "$total_fragments_between_min_and_max fragments <= $max_size nt and >= $min_size nt\n";
if ($digest3_seq ne "not_defined"){ print out "\t-containing one or more third digest site ($digest3_seq): $fragments_in_range_contain_third_enzyme\n"; }
if ($digest2_seq ne "not_defined"){
print out "\t-with both ends from $enzyme_name: $fragments_both_ends_enzyme1\n";
print out "\t-with both ends from $enzyme2_name: $fragments_both_ends_enzyme2\n";
print out "\t-with an end from each enzyme: $fragments_end_from_each_enzyme\n";
print out "\tfragments in range per reference sequence: (both ends from $enzyme_name / both ends from $enzyme2_name / an end from each enzyme)\n";
}else{
print out "\tfragments in range per reference sequence:\n";
}
#print fragment chromosome distribution
my @chr_names_for_count_print = keys %fragments_in_range_per_chr;
foreach my $chr_name_for_count_print (@chr_names_for_count_print){
if ($digest2_seq ne "not_defined"){
print out "\t\t$chr_name_for_count_print had $fragments_in_range_per_chr{$chr_name_for_count_print} fragments ($fragments_in_range_per_chr_enzyme1_end{$chr_name_for_count_print}/$fragments_in_range_per_chr_enzyme2_end{$chr_name_for_count_print}/$fragments_in_range_per_chr_enzymeboth_end{$chr_name_for_count_print})\n";
}else{
print out "\t\t$chr_name_for_count_print had $fragments_in_range_per_chr{$chr_name_for_count_print} fragments\n";
}
}
#print fragment distance distribution
print out "\tdistances between fragments in range:\n";
print out "\t\t<=1kb $one_kb fragment pairs\n\t\t1kb-10kb $ten_kb fragment pairs\n\t\t10kb-100kb $hundred_kb fragment pairs\n\t\t100kb-1Mb $one_Mb fragment pairs\n";
print out "\t\t1Mb-10Mb $ten_Mb fragment pairs\n\t\t>10Mb $more_ten_kb fragment pairs\n";
#print fragment size distribution
print out "\tfragments less than $min_size nt: $total_fragments_less_min \n\n";
my $temp_size=$min_size;
until (($temp_size+$bin_size)>$max_size){
my $temp_size_plus_bin = $temp_size+$bin_size;
my $variable_name = "total_fragments_between_".$temp_size."_and_".$temp_size_plus_bin;
$temp_size_plus_bin--;
print out "\tfragments $temp_size-$temp_size_plus_bin nt: $$variable_name\n";
$temp_size=$temp_size+$bin_size;
}
print out "\n";
close(out)||die "could not close outfile $outfile\n";