-
Notifications
You must be signed in to change notification settings - Fork 0
/
run-dipcall-poly
executable file
·161 lines (142 loc) · 5.06 KB
/
run-dipcall-poly
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Std;
my $version = "0.3";
my %opts = (t=>8);
getopts("t:d:x:muz:W:ap:", \%opts);
die("Usage: run-dipcall-poly [options] <prefix> <ref.fa> <hap1.fa> <hap2.fa> .. <hap?.fa>
Options:
-t INT number of threads [$opts{t}]
-d FILE unimap/minimap2 index for ref.fa []
-a call on all contigs regardless of naming
-x FILE PAR on chrX; assuming male
-z INT Z-drop [mapper default]
-m use minimap2 for mapping (default)
-u use unimap for mapping
-W FILE repetitive k-mer list; use winnowmap for mapping
-p INT ploidy level (>=2). specify the same number of haplotype fasta.
") if @ARGV < 4;
# test file existence
my $pre = shift(@ARGV);
my $ref = shift(@ARGV);
my @hap = @ARGV;
## YF
# ploidy level
my $ploidy = 2;
if (defined $opts{p}) {
if($opts{p} >= 2){
$ploidy = $opts{p};
} else {
die "ERROR: ploidy needs to be larger than or equals to 2\n";
}
}
die "ERROR: failed to read the reference file '$ref'\n" unless -f $ref;
die "ERROR: please index the reference with 'samtools faidx'\n" unless -f "$ref.fai";
if(scalar(@ARGV) != $ploidy){
die "ERROR: specify the same number of haplotype fasta as given to -p.\n";
}
for(my $i=0; $i < $ploidy; $i++){
my $str = "ERROR: failed to read the haplotype \$hap[@{[$i+1]}]\n";
die $str unless -f $hap[$i];
}
my $is_male = defined($opts{x})? 1 : 0;
if (defined $opts{x}) {
die "ERROR: failed to read PAR\n" unless -f $opts{x};
$is_male = 1;
}
# find the root directory
my $exepath = $0 =~/^\S+\/[^\/\s]+/? $0 : &which($0);
my $root = $0 =~/^(\S+)\/[^\/\s]+/? $1 : undef;
$root = $exepath =~/^(\S+)\/[^\/\s]+/? $1 : undef if !defined($root);
die "ERROR: failed to locate the root directory\n" if !defined($root);
# mapper settings
my $mapper = "minimap2";
my $mm2_opt = q/-xasm5 --cs -t$(N_THREADS)/;
if (defined $opts{u}) {
$mapper = "unimap";
} elsif (defined $opts{W}) {
$mapper = "winnowmap";
$mm2_opt .= " -r2k -W $opts{W}";
}
$mapper = "minimap2" if defined($opts{m});
$mm2_opt .= " -z$opts{z},200" if defined($opts{z});
my $mm2_idx = defined($opts{d})? $opts{d} : $ref;
# write Makefile
my @mak = ();
push(@mak, "ROOT=$root");
push(@mak, "N_THREADS=$opts{t}");
push(@mak, "MAPPER=$mapper");
push(@mak, "MM2_IDX=$mm2_idx");
push(@mak, "MM2_OPT=$mm2_opt");
push(@mak, "REF_FA=$ref");
push(@mak, "");
push(@mak, "all:$pre.dip.bed $pre.dip.vcf.gz", "");
for (my $i=0; $i < $ploidy; $i++){
push(@mak, "$pre.hap@{[$i+1]}.paf.gz:$hap[${i}]");
push(@mak, q{ $(ROOT)/$(MAPPER) -c --paf-no-hit $(MM2_OPT) $(MM2_IDX) $< 2> [email protected] | gzip > $@});
}
push(@mak, "");
for (my $i=0; $i < $ploidy; $i++){
push(@mak, "$pre.hap@{[$i+1]}.sam.gz:$hap[${i}]");
push(@mak, q{ $(ROOT)/$(MAPPER) -a $(MM2_OPT) $(MM2_IDX) $< 2> [email protected] | gzip > $@});
}
push(@mak, "");
for (my $i=0; $i < $ploidy; $i++){
push(@mak, "$pre.hap@{[$i+1]}.bam:$pre.hap@{[$i+1]}.sam.gz");
push(@mak, q{ $(ROOT)/k8 $(ROOT)/dipcall-aux.js samflt $< | $(ROOT)/samtools sort -m4G --threads 4 -o $@ -});
}
push(@mak, "");
my @t_args;
for (my $i=0; $i < $ploidy; $i++){
push(@t_args, "$pre.hap@{[$i+1]}".'.bam');
}
push(@mak, "$pre.pair.vcf.gz:".join(" ", @t_args) );
push(@mak, q{ $(ROOT)/htsbox pileup -q5 -evcf $(REF_FA) $^ | $(ROOT)/htsbox bgzip > $@});
push(@mak, "$pre.dip.vcf.gz:$pre.pair.vcf.gz");
my $pair_opt = defined($opts{a})? "-a" : "";
if ($is_male) {
push(@mak, q{ $(ROOT)/k8 $(ROOT)/dipcall-aux.js vcfpair} . qq{ $pair_opt -p $opts{x} } . q{$< | $(ROOT)/htsbox bgzip > $@});
} else {
push(@mak, q{ $(ROOT)/k8 $(ROOT)/dipcall-aux.js vcfpair} . qq{ $pair_opt } . q{$< | $(ROOT)/htsbox bgzip > $@});
}
push(@mak, "");
for (my $i=0; $i < $ploidy; $i++){
push(@mak, "$pre.hap@{[$i+1]}.var.gz:$pre.hap@{[$i+1]}.paf.gz");
push(@mak, q{ gzip -dc $< | sort -k6,6 -k8,8n | $(ROOT)/k8 $(ROOT)/paftools.js call - 2> [email protected] | gzip > $@});
}
for (my $i=0; $i < $ploidy; $i++){
push(@mak, "$pre.hap@{[$i+1]}.bed:$pre.hap@{[$i+1]}.var.gz");
push(@mak, q{ gzip -dc $< | grep ^R | cut -f2- > $@});
}
@t_args = ();
for (my $i=0; $i < $ploidy; $i++){
push(@t_args, "$pre.hap@{[$i+1]}".'.bed');
}
push(@mak, "$pre.dip.bed:".join(" ", @t_args) );
if ($is_male) {
my $bedtk = '$(ROOT)/bedtk';
my @cmd;
push(@cmd, "$bedtk isec $pre.hap1.bed $pre.hap2.bed | egrep -v '^(chr)?[XY]' > $pre.tmp.bed"); # autosome
push(@cmd, "$bedtk isec $pre.hap1.bed $pre.hap2.bed | $bedtk isec $opts{x} >> $pre.tmp.bed"); # chrX PAR
push(@cmd, "$bedtk sub $pre.hap2.bed $pre.hap1.bed | egrep '^(chr)?X' | $bedtk sub - $opts{x} >> $pre.tmp.bed"); # chrX non-PAR
push(@cmd, "$bedtk sub $pre.hap1.bed $pre.hap2.bed | egrep '^(chr)?Y' >> $pre.tmp.bed"); # chrY non-PAR
push(@cmd, "$bedtk sort $pre.tmp.bed > " . '$@');
push(@mak, "\t" . join("; ", @cmd));
} else {
push(@mak, q{ $(ROOT)/bedtk isec -m $^ > $@});
}
push(@mak, "");
print(join("\n", @mak));
exit(0);
# auxiliary routines
sub which {
my $file = shift;
my $path = (@_)? shift : $ENV{PATH};
return if (!defined($path));
foreach my $x (split(":", $path)) {
$x =~ s/\/$//;
return "$x/$file" if (-x "$x/$file");
}
return;
}