Skip to content

Commit

Permalink
touch up example workflow
Browse files Browse the repository at this point in the history
  • Loading branch information
lrauschning committed Dec 9, 2024
1 parent 731ee3a commit fb5b731
Showing 1 changed file with 30 additions and 20 deletions.
50 changes: 30 additions & 20 deletions example/example_workflow.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
#!/bin/sh

## Download some genomes
# This file serves as an example workflow illustrating how and when to use msyd.
# It is a part of the msyd CI, and should pass so long as your system


## Download three publicly available, high quality A. thaliana genomes

# download the Col-CC assembly
curl -OJX GET "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GCA_028009825.1/download?include_annotation_type=GENOME_FASTA&filename=GCA_028009825.1.zip" -H "Accept: application/zip"
Expand All @@ -15,14 +19,15 @@ curl -OJX GET "https://api.ncbi.nlm.nih.gov/datasets/v2alpha/genome/accession/GC
unzip "./*.zip"
mv ncbi_dataset/data/*/*.fna ./

## Prepare them for running msyd
### Prepare them for running msyd

# rename them to shorter names
## rename them to shorter names
mv GCA_001651475.1_Ler_Assembly_genomic.fna ler.fna
mv GCA_028009825.1_Col-CC_genomic.fna col.fna
mv GCA_902460295.1_Arabidopsis_thaliana_Sha_genomic.fna sha.fna
mv GCA_024498555.1_ASM2449855v1_genomic.fna swe.fna

## filter out small scaffolds
grep -n -P ">" ./*.fna
# col and swe do not require truncating,
# for ler the small scaffolds start at line 1442097
Expand All @@ -33,45 +38,50 @@ head -n 1480076 sha.fna > sha.filtered.fna
mv sha.filtered.fna sha.fna


## Generate inputs for msyd
### Generate inputs for msyd

# generate alignments to col-CC
## generate alignments to col-CC
mv col.fna ref.fna
minimap2 -cx asm5 --eqx ref.fna ler.fna > ler.paf
minimap2 -cx asm5 --eqx ref.fna sha.fna > sha.paf
minimap2 -cx asm5 --eqx ref.fna swe.fna > swe.paf
minimap2 -cx asm10 --eqx ref.fna ler.fna > ler.paf
minimap2 -cx asm10 --eqx ref.fna sha.fna > sha.paf
minimap2 -cx asm10 --eqx ref.fna swe.fna > swe.paf

# run syri on the alignments
## run syri on the alignments
# make sure to pass --cigar and specify appropriate prefixes, so the msyd output is more easily interpretable
syri --nc 5 -F P --cigar --prefix ler -c ler.paf -r ref.fna -q ler.fna --lf ler.syri.log --samplename ler
syri --nc 5 -F P --cigar --prefix sha -c sha.paf -r ref.fna -q sha.fna --lf sha.syri.log --samplename sha
syri --nc 5 -F P --cigar --prefix swe -c swe.paf -r ref.fna -q swe.fna --lf swe.syri.log --samplename swe

## construct genomes.tsv file
# as msyd needs many input files, the paths are stored in a samplesheet
echo "#name\taln\tsyri\tvcf\tseq" > genomes.tsv
for f in *syri.out
do
bs=$(basename $f syri.out)
echo "$bs\t$bs.paf\t${bs}syri.out\t${bs}syri.vcf\t${bs}.fna" >> genomes.tsv
done

# run msyd to call pansynteny
msyd call -i genomes.tsv -o athalianas.pff -m athalianas.vcf -r ref.fna
### run msyd to call multisynteny
msyd call -c 5 -i genomes.tsv -o athalianas.psf -m athalianas.vcf -r ref.fna

### work with the output

## work with the output
## export multisynteny on Chr3 for use in visualization/other software

# CP116282 is the id corresponding to chromosome 3 in Col-CC
msyd view -e "on CP116283.1" -i athalianas.pff -o athalianas-chr3.pff
# CP116283 is the id corresponding to chromosome 3 in Col-CC
# filter for multisynteny on this chromosome
msyd view -e "on CP116283.1" -i athalianas.psf -o athalianas-chr3.psf

# convert to VCF for use in visualization/other software
msyd view -i athalianas-chr3.pff -o athalianas-chr3-syn.vcf
# export to VCF; this could also be done in the command above
msyd view -i athalianas-chr3.psf -o athalianas-chr3-syn.vcf

## download 1001 genome project VCF, filter for vars in pansyntenic regions
## download 1001 genome project VCF, filter for small variants structurally conserved regions

curl https://ftp.ebi.ac.uk/ensemblgenomes/pub/release-56/plants/variation/vcf/arabidopsis_thaliana/arabidopsis_thaliana.vcf.gz -o ensembl_athaliana.vcf.gz
gunzip ensembl_athaliana.vcf.gz

# change from ids to chr numbers, to match vcf nomenclature
sed -e s/CP116280.1/1/ -e s/CP116281.1/2/ -e s/CP116282.1/3/ -e s/CP116283.1/4/ -e s/CP116284.1/5/ athalianas.pff > athalianas-chrnames.pff
sed -e s/CP116280\.1/Chr1/ -e s/CP116281\.1/Chr2/ -e s/CP116282\.1/Chr3/ -e s/CP116283\.1/Chr4/ -e s/CP116284\.1/Chr5/ athalianas.psf > athalianas-chrnames.psf

# filter for variants in pansyntenic regions!
msyd view -i athalianas-chrnames.pff -e "deg >= 3" -o pansynt-vars.vcf --intersect ensembl_athaliana.vcf
# filter for variants in coresyntenic regions!
msyd view -i athalianas-chrnames.psf -e "deg >= 3" -o coresynt-snvs.vcf --intersect ensembl_athaliana.vcf

0 comments on commit fb5b731

Please sign in to comment.