Merge branch 'master' of github.com:ISUgenomics/common_analyses

ISUgenomics · Feb 2, 2017 · f6f8fea · f6f8fea
2 parents 6096678 + c05aea2
commit f6f8fea
Show file tree

Hide file tree

Showing 3 changed files with 58 additions and 9 deletions.
diff --git a/GATK_00_PrepareRef.sh b/GATK_00_PrepareRef.sh
@@ -2,7 +2,7 @@
 # Prepares the Reference Genome for mapping as well as for using it with GATK pipeline
 # You need to supply the referece genome as REF below or as:
 # ./GATK_00_PrepareRef.sh your_genome.fasta
-module load picard_tools
+module load picard
 module load samtools
 module load bwa
 module load bedtools
@@ -11,16 +11,16 @@ module load python
 REF="$1"
 #index genome for (a) picard, (b) samtools and (c) bwa
 parallel <<FIL
-java -Xmx100G -jar $PICARD/picard.jar CreateSequenceDictionary \
+java -Xmx100G -jar $PICARD_HOME/picard.jar CreateSequenceDictionary \
   REFERENCE=${REF} \
   OUTPUT=${REF%.*}.dict
 samtools faidx ${REF}
 bwa index -a bwtsw ${REF}
 FIL
 # Create interval list (here 100 kb intervals)
-python fasta_length.py ${REF} > ${REF%.*}_length.txt
+fasta_length.py ${REF} > ${REF%.*}_length.txt
 bedtools makewindows -w 100000 -g ${REF%.*}_length.txt > ${REF%.*}_100kb_coords.bed
-java -Xmx100G -jar $PICARD/picard.jar BedToIntervalList \
+java -Xmx100G -jar $PICARD_HOME/picard.jar BedToIntervalList \
   INPUT=${REF%.*}_100kb_coords.bed \
   SEQUENCE_DICTIONARY=${REF%.*}.dict \
   OUTPUT=${REF%.*}_100kb_gatk_intervals.list
diff --git a/GATK_01_RunBWA.sh b/GATK_01_RunBWA.sh
@@ -4,26 +4,26 @@
 
 #command genomeModule READ1 READ2
 
-module load picard
+module load picard/2.4.1
 module load java
-module load bwa/0.7.13
+module load bwa/0.7.12
 module load samtools
 module load $1
 
 REF="$GENOMEDIR/$GNAME"
 # this option might be the frequetly changed, hence not it's a variable
-THREADS="8"
+THREADS="16"
 # if the reads are paired then use -p option
 if [ "$#" -eq 3 ]; then
   READ1="$2"
   READ2="$3"
   OUTNAME=$(basename ${READ1%.*} | cut -f 1-2 -d "_")
-  bwa mem -M -t ${THREADS} ${REF} ${READ1} ${READ2} | samtools view -buS - > ${OUTNAME}.bam
+  bwa mem -M -x ont2d -t ${THREADS} ${REF} ${READ1} ${READ2} | samtools view -buS - > ${OUTNAME}.bam
 # if not just use the reads as single reads
 elif [ "$#" -eq 1 ]; then
   READ1="$2"
   OUTNAME=$(basename ${READ1%.*} | cut -f 1-2 -d "_")
-  bwa mem -M -t ${THREADS} ${REF} ${READ1} | samtools view -buS - > ${OUTNAME}.bam
+  bwa mem -M -x ont2d -t ${THREADS} ${REF} ${READ1} | samtools view -buS - > ${OUTNAME}.bam
 # if number of arguments do not match, raise error
 else
   echo "ERROR: INVALID NUMBER OF ARGUMENTS"

diff --git a/fastaSortByName.pl b/fastaSortByName.pl
@@ -0,0 +1,49 @@
+#!/usr/bin/perl -w
+
+my $usage="\nUsage: $0 [-hrg] [fastaFileName1 ...]\n".
+    "  -h: help\n".
+    "  -r: reverse\n" .
+    "  -g: remove gaps '-' from the sequence\n".
+    "Sort FASTA sequences alphabetically by names.  If multiple files are \n".
+    "given, sequences in all files are marged before sorting.  If no \n".
+    "argument is given, it will take STDIN as the input\n";
+
+our($opt_h, $opt_g, $opt_r);
+
+use Bio::SeqIO;
+
+use Getopt::Std;
+getopts('hgr') || die "$usage\n";
+die "$usage\n" if (defined($opt_h));
+
+my $format = "fasta";
+my @seqArr = ();
+
+@ARGV = ('-') unless @ARGV;
+while (my $file = shift) {
+    my $seqio_obj = Bio::SeqIO->new(-file => $file, -format => $format);
+    while (my $seq = $seqio_obj->next_seq()) {
+	push(@seqArr, $seq);
+    }
+}
+
+if (defined($opt_r)) {
+    @seqArr = sort { - ($a->id() cmp $b->id()) } @seqArr;
+} else {
+    @seqArr = sort { $a->id() cmp $b->id() } @seqArr;
+}
+
+
+my $seqOut = Bio::SeqIO->new(-fs => \*STDOUT, -format => $format);
+foreach my $s (@seqArr) {
+    if(defined($opt_g)) {
+	my $tmp = $s->seq();
+	$tmp =~ s/-//g;
+	$s->seq($tmp);
+    }
+    $seqOut->write_seq($s);
+}
+
+exit;
+
+