-
Notifications
You must be signed in to change notification settings - Fork 0
/
AssemblyAlignmentGenerate
executable file
·51 lines (41 loc) · 2.2 KB
/
AssemblyAlignmentGenerate
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/bin/bash
# This script generates core-gene alignments from a list of assemblies.
# Created by Mingzhi Lin ([email protected]).
# Usage: bash AssemblyAlignmentGenerate <assembly summary file> <accession list file> <output directory> <output prefix>
# Arguments:
# <assembly summary file> can be downloaded from ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/assembly_summary_refseq.txt
# <accession list file> contains a list of assembly accessions;
# <output directory> is the working space;
# <output prefix> is the result output_prefix.
# Results:
# <output directory>/<output prefix>_core.xmfa stores the final alignments of core genomes.
# This program contains six steps as detailed below.
# ---------------------------------------------------------------------------
# Pipeline driver.
#
# Positional arguments (all four are required):
#   $1  assembly summary file
#   $2  accession list file
#   $3  output directory (working space; created if it does not exist)
#   $4  output prefix for the result files
#
# Exit status: 0 when every step succeeded, 1 on a usage error or as soon as
# any step fails. Each step consumes the output of the previous one, so the
# pipeline stops at the first failure instead of running on partial results.
# ---------------------------------------------------------------------------

# Validate the argument count before touching the filesystem: with fewer than
# four arguments the paths derived below would be empty or rooted at "/".
if [[ $# -lt 4 ]]; then
  echo "Usage: AssemblyAlignmentGenerate <assembly summary file> <accession list file> <output directory> <output prefix>" >&2
  exit 1
fi

assembly_summary_file=$1
accession_list=$2
output_dir=$3
output_prefix=$4

# Runs one pipeline step and aborts the whole script if it fails.
# Arguments: $1 - human-readable step label; remaining args - command to run.
run_step() {
  local step_label=$1
  shift
  if ! "$@"; then
    echo "AssemblyAlignmentGenerate: ${step_label} failed; aborting." >&2
    exit 1
  fi
}

# Create folders for temporary results. -p also creates the output directory
# itself when it is missing and is a no-op for folders that already exist.
mkdir -p -- "${output_dir}/genomes" "${output_dir}/prokka" || exit 1

# Get source script path, so the helper scripts are found regardless of the
# directory this script is invoked from.
ScriptDir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit 1

# Intermediate and final result files shared between steps.
sequences_db="${output_dir}/${output_prefix}_sequences.db"
alignments_csv="${output_dir}/${output_prefix}_alignments.csv"
core_xmfa="${output_dir}/${output_prefix}_core.xmfa"

# Step 1: fetch genomic sequences from NCBI FTP.
run_step "Step 1 (fetch genomes)" \
  python "${ScriptDir}/FetchGenomes.py" \
  "${assembly_summary_file}" "${accession_list}" "${output_dir}/genomes"

# Step 2: use Prokka to re-annotate genomes.
run_step "Step 2 (Prokka annotation)" \
  bash "${ScriptDir}/InvokeProkka.sh" \
  "${accession_list}" "${output_dir}/genomes" "${output_dir}/prokka"

# Step 3: use Roary to generate the pan-genome.
run_step "Step 3 (Roary pan-genome)" \
  bash "${ScriptDir}/InvokeRoary.sh" \
  "${accession_list}" "${output_dir}/prokka" "${output_dir}/roary"

# Step 4: load sequences into sqlite database for querying.
run_step "Step 4 (load sequences)" \
  python "${ScriptDir}/LoadSequences.py" \
  "${accession_list}" "${output_dir}/prokka" "${output_dir}/roary" "${sequences_db}"

# Step 5: use muscle to align protein sequences and back-translate them to
# DNA sequences.
run_step "Step 5 (align sequences)" \
  go run "${ScriptDir}/AlignSequences.go" "${sequences_db}" "${alignments_csv}"

# Step 6: extract alignments of core genome.
run_step "Step 6 (extract core genes)" \
  python "${ScriptDir}/ExtractCoreGenes.py" \
  "${accession_list}" "${alignments_csv}" "${core_xmfa}"