-
Notifications
You must be signed in to change notification settings - Fork 0
/
run_pipeline.sh
87 lines (73 loc) · 3.12 KB
/
run_pipeline.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env bash
################# RUN_PIPELINE.SH
#
# AUTHOR : Felix VANDERMEEREN
# CONTACT : [email protected]
# DEPENDENCIES : Bash, Conda
#
# DESCRIPTION:
# Wrapper around nf-core/rnavar pipeline
#
# Best practice (strict mode): stop on a failing command, on an unset
# variable, and on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail
## Download "nf-core/rnavar" pipeline
# Nothing is downloaded when the target directory is already present.
PIPE=rnavar
PIPE_VERS=1.0.0
GENOME=GRCh37
TO_PIPE="nf-core-${PIPE}_${PIPE_VERS}"
if ! [[ -d ${TO_PIPE} ]]; then
  download_opts=(
    --revision "${PIPE_VERS}"
    --download-configuration
    --container-system none
    --compress none
  )
  nf-core download "${PIPE}" "${download_opts[@]}"
  # Run pipeline on test data:
  #nextflow run "$TO_PIPE" -profile test,singularity --max_memory 5.GB
  printf '\n'
fi
## Run pipeline on wanted data
# Create SampleSheet:
# One header line, then one row per sample; every row points at the R1/R2
# FASTQ files below 'base_URL' and declares 'reverse' strandedness.
sample_sheet=SampleSheet.csv
base_URL="https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/giab/data_RNAseq/AshkenazimTrio/HG002_NA24385_son/Google_Illumina/mRNA/reads"
{
  printf '%s\n' "sample,fastq_1,fastq_2,strandedness"
  for sample_tag in gm24385 gm26105 gm27730; do
    sample_id="hg002_${sample_tag}.mrna"
    printf '%s,%s,%s,%s\n' \
      "${sample_id}" \
      "${base_URL}/${sample_id}.R1.fastq.gz" \
      "${base_URL}/${sample_id}.R2.fastq.gz" \
      "reverse"
  done
} > "${sample_sheet}"
# Reference information:
# Pipeline arguments are kept in arrays so that each option/value reaches
# nextflow as exactly one word, without relying on unquoted word-splitting.
#
# Alternative for a 1st-time use, letting the pipeline download everything:
#   support=(--genome "$GENOME" --save_reference --read_length 101)
# But then better to specify local support files with '--star_index' and '--fasta'
#
# Instead use only 'chr22' and generate STAR_index on-the-fly:
support=(
  --read_length 150
  --igenomes_ignore
  --fasta chr22.fa
  --gtf "s3://ngi-igenomes/igenomes/Homo_sapiens/Ensembl/${GENOME}/Annotation/Genes/genes.gtf"
)
# Annotation:
# Download VEP cache:
# IDEA: Use 'not refSeq' cache -> no need to later pass special options to VEP ?
# ENH: Use sarek 'download_cache.nf' script to do that ?
#
# The download and extraction only happen once: a stamp file is written after
# a successful extraction, so re-running this script (e.g. with '-resume')
# neither fails on an already existing 'caches' directory nor fetches the
# archive again.
vep_cache_dir=caches
vep_cache_tar=homo_sapiens_refseq_vep_104_GRCh37.tar.gz
vep_cache_stamp="${vep_cache_dir}/.${vep_cache_tar%.tar.gz}.extracted"
if [[ ! -e "${vep_cache_stamp}" ]]; then
  # '--continue' resumes a partially downloaded archive instead of saving a
  # second copy next to it.
  wget --continue "https://ftp.ensembl.org/pub/release-104/variation/indexed_vep_cache/${vep_cache_tar}"
  mkdir -p "${vep_cache_dir}"
  tar xvzf "${vep_cache_tar}" -C "${vep_cache_dir}"
  touch "${vep_cache_stamp}"
fi
# Single quotes are needed around vep_cache_version because the pipeline
# expects a string; they are passed through literally, hence the "'104'".
annot=(
  --annotate_tools vep
  --annotation_cache
  --vep_cache "${vep_cache_dir}"
  --vep_genome "${GENOME}"
  --vep_species homo_sapiens
  --vep_cache_version "'104'"
)
# Use a config file to:
# * Tell GATK BedToIntervalList to '--DROP_MISSING_CONTIGS':
# (see: https://github.com/nf-core/rnavar/issues/55)
#
# * To pass external args to VEP (needed because I use a 'refSeq' cache)
#
# With '--skip_baserecalibration', no need for '--dbsnp' or '--known_indels'
#
# WARN: When using a pre-built StarIndex, must ensure same read_length as my data
# -> GIAB RNAseq FASTQ are 150 bp, whereas iGenomes STARIndex is 100 bp ?
#
# WARN: VEP cache version should be same as VEP version (here v104 -> OK)
#
# The workflow sub-directory is derived from PIPE_VERS ('1.0.0' -> '1_0_0')
# rather than hard-coded, so both stay in sync.
# NOTE(review): assumes the downloaded layout replaces '.' by '_' in the
# revision name - confirm against the nf-core tools version in use.
nextflow run "${TO_PIPE}/${PIPE_VERS//./_}" \
  --input "${sample_sheet}" \
  --outdir rnavar \
  "${support[@]}" \
  --skip_baserecalibration \
  "${annot[@]}" \
  --max_memory 30.GB \
  --max_cpus 8 \
  -c my_process.config \
  -profile singularity \
  -resume