Skip to content

Commit

Permalink
update plots
Browse files Browse the repository at this point in the history
  • Loading branch information
katiesevans committed Jan 4, 2022
1 parent 86a17d5 commit b02dfba
Show file tree
Hide file tree
Showing 5 changed files with 297 additions and 48 deletions.
7 changes: 5 additions & 2 deletions bin/pca_template.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ tracy <- data.table::fread(glue::glue("EIGESTRAT/{ld}/NO_REMOVAL/TracyWidom_stat
cumsum = cumsum(VarExp),
N = paste0("PC", N))
DT::datatable(tracy[1:10,])
# print(DT::datatable(tracy[1:10,]))
tracy[1:10,]
```


Expand Down Expand Up @@ -42,7 +44,8 @@ tracy <- data.table::fread(glue::glue("EIGESTRAT/{ld}/OUTLIER_REMOVAL/TracyWidom
cumsum = cumsum(VarExp),
N = paste0("PC", N))
DT::datatable(tracy[1:10,])
# print(DT::datatable(tracy[1:10,]))
tracy[1:10,]
```

Expand Down
14 changes: 8 additions & 6 deletions bin/pca_template_template2.Rmd
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
```{r, fig.height = 10, fig.width = 10}
pcs <- data.frame(first = c(rep("PC1", 5), rep("PC2", 4), rep("PC3", 3), rep("PC4", 2), rep("PC5", 1)),
second = c(2,3,4,5,6,3,4,5,6,4,5,6,5,6, 6)) %>%
dplyr::mutate(second = paste0("PC", second))
dplyr::mutate(second = paste0("PC", second)) %>%
dplyr::mutate(first = as.character(first),
second = as.character(second))
# highlight divergent strains
if(!is.null(dstrains)) {
Expand Down Expand Up @@ -46,18 +49,17 @@ cowplot::plot_grid(plotlist = plots2, leg)
nodiv <- no_removal
# don't highight strains
plots2 <- lapply(seq(1:nrow(pcs)), FUN = function(i) {
xpc <- pcs$first[i]
ypc <- pcs$second[i]
print(xpc)
xpc <- as.character(pcs$first[i])
ypc <- as.character(pcs$second[i])
ggplot() +
geom_point(data = nodiv, shape=21, alpha=0.8, size=2, aes(x=get(xpc),y=get(ypc)), fill = "grey70")+
theme_bw() +
theme(axis.title = element_text(size=11, color = "black"),
theme(axis.title = element_text(size=11, color = "black"),
axis.text = element_text(size=10, color = "black"),
panel.grid = element_blank(),
legend.position = "none")+
labs(x = glue::glue("{xpc} ({round(tracy$VarExp[which(tracy$N == {xpc})]*100, digits = 2)}%)"),
labs(x = glue::glue("{xpc} ({round(tracy$VarExp[which(tracy$N == {xpc})]*100, digits = 2)}%)"),
y = glue::glue("{ypc} ({round(tracy$VarExp[which(tracy$N == {ypc})]*100, digits = 2)}%)"))
})
cowplot::plot_grid(plotlist = plots2)
Expand Down
7 changes: 4 additions & 3 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ params.vcfanno_config = "${workflow.projectDir}/input_files/ANNOTATION_conf.toml
params.eigen_par_outlier_removal = "${workflow.projectDir}/bin/eigpar"
params.eigen_par_no_removal = "${workflow.projectDir}/bin/eigpar_no_removal"
params.R_libpath = "/projects/b1059/software/R_lib_3.6.0"
params.snps = '--snps-only'


// Note that params.species is set in the config to be c_elegans (default)
Expand Down Expand Up @@ -127,7 +128,7 @@ if (params.help) {
}

// import the pca module
include {extract_ancestor_bed; annotate_small_vcf; vcf_to_ped; vcf_to_eigstrat_files; run_eigenstrat_no_outlier_removal; run_eigenstrat_with_outlier_removal; HTML_report_PCA} from './modules/pca.nf'
include {extract_ancestor_bed; annotate_small_vcf; vcf_to_eigstrat_files; run_eigenstrat_no_outlier_removal; run_eigenstrat_with_outlier_removal; HTML_report_PCA} from './modules/pca.nf'


workflow {
Expand Down Expand Up @@ -219,8 +220,8 @@ workflow {

// run html report
// not functional quite yet...
// run_eigenstrat_no_outlier_removal.out
// .combine(run_eigenstrat_with_outlier_removal.out) | HTML_report_PCA
run_eigenstrat_no_outlier_removal.out
.combine(run_eigenstrat_with_outlier_removal.out) | HTML_report_PCA
}

}
Expand Down
275 changes: 275 additions & 0 deletions modules/pca copy.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,275 @@
/*
==================================
~ > * * < ~
~ ~ > * * < ~ ~
~ ~ ~ > * ANNOTATE VCF * < ~ ~ ~
~ ~ > * * < ~ ~
~ > * * < ~
==================================
*/


/*
------------ Extract ancestor strain from the VCF and make bed file for annotations
*/

process extract_ancestor_bed {

publishDir "${params.output}/ANNOTATE_VCF", mode: 'copy'

cpus 1

input:
tuple file(vcf), file(vcfindex)

output:
tuple file("ANC.bed.gz"), file("ANC.bed.gz.tbi")

"""
bcftools query --samples ${params.anc} -f '%CHROM\\t%POS\\t%END\\t[%TGT]\\n' ${vcf} |\\
awk -F"/" '\$1=\$1' OFS="\\t" |\\
awk '{print \$1, \$2 = \$2 - 1, \$3, \$4}' OFS="\\t" |\\
bgzip > ANC.bed.gz
tabix ANC.bed.gz
echo "ANCESTOR DONE"
"""
}

/*
------------ Annotate small variant VCF
*/

process annotate_small_vcf {

publishDir "${params.output}/ANNOTATE_VCF", mode: 'copy'

conda '/projects/b1059/software/conda_envs/vcffixup'

cpus 1

input:
tuple file(vcf), file(vcfindex), file("ANC.bed.gz"), file("ANC.bed.gz.tbi"), file(pop) //, val(pop), val(maf), val(sm)

output:
tuple file("Ce330_annotated.vcf.gz"), file("Ce330_annotated.vcf.gz.tbi")


"""
# get vcfanno files
cp ${workflow.projectDir}/input_files/annotations/${params.species}/* .
cat ${params.vcfanno_config} | sed 's/species/${params.species}/' > anno_config.toml
bcftools view -S ${pop} ${vcf} -Oz -o population-filtered.vcf.gz
vcfanno anno_config.toml population-filtered.vcf.gz |\\
awk '\$0 ~ "#" || \$0 !~ "Masked" {print}' |\\
vcffixup - |\\
bcftools filter -i N_MISSING=0 -Oz -o Ce330_annotated.vcf.gz
tabix -p vcf Ce330_annotated.vcf.gz
"""
}



/*
------------ Prune VCF, Generate PLINK files
*/

process vcf_to_ped {

tag {"PRUNING VCF FOR ADMIXTURE"}

publishDir "${params.output}/ADMIXTURE/PLINK/", mode: 'copy'

conda '/projects/b1059/software/conda_envs/vcffixup'


input:
tuple file(vcf), file(vcfindex), val(nSM), val(maf), val(samples), val(ld)

output:
tuple file("ce_norm.vcf.gz"), file("ce_norm.vcf.gz.tbi"), val(nSM), val(maf), val(samples), val(ld), file("*.map"), file("*.ped"), file("plink.prune.in")

"""
bcftools norm -m +snps ${vcf} -Oz -o ce_norm.vcf.gz
tabix -p vcf ce_norm.vcf.gz
plink --vcf ce_norm.vcf.gz --snps-only --biallelic-only --maf ${maf} --set-missing-var-ids @:# --indep-pairwise 50 10 ${ld} --allow-extra-chr
plink --vcf ce_norm.vcf.gz --snps-only --biallelic-only --maf ${maf} --set-missing-var-ids @:# --extract plink.prune.in --geno --recode12 --out LD_${ld}_MAF_${maf} --allow-extra-chr
"""

}



/*
======================================
~ > * * < ~
~ ~ > * * < ~ ~
~ ~ ~ > * Run PCA and DAPC * < ~ ~ ~
~ ~ > * * < ~ ~
~ > * * < ~
======================================
*/

/*
------------ Prepare files for EIGENSTRAT
*/

process vcf_to_eigstrat_files {

tag {"PREPARE EIGENSTRAT FILES"}

conda '/projects/b1059/software/conda_envs/vcffixup'

publishDir "${params.output}/EIGESTRAT/LD_${test_ld}/INPUTFILES", mode: 'copy'

input:
tuple file(vcf), file(vcfindex), val("test_ld")

output:
tuple file("eigenstrat_input.ped"), file("eigenstrat_input.pedsnp"), file("eigenstrat_input.pedind"), file("plink.prune.in"), \
file ("markers.txt"), file ("sorted_samples.txt"), file ("PCA.vcf.gz"), file ("PCA.vcf.gz.tbi"), val(test_ld)


"""
bcftools view --regions I,II,III,IV,V,X ${vcf} |\\
bcftools norm -m +snps -Oz -o ce_norm.vcf.gz
tabix -p vcf ce_norm.vcf.gz
plink --vcf ce_norm.vcf.gz --snps-only --biallelic-only --set-missing-var-ids @:# --indep-pairwise 50 10 ${test_ld} --allow-extra-chr
plink --vcf ce_norm.vcf.gz --snps-only --biallelic-only --set-missing-var-ids @:# --extract plink.prune.in --geno --recode12 --out eigenstrat_input --allow-extra-chr
awk -F":" '\$1=\$1' OFS="\\t" plink.prune.in | \\
sort -k1,1d -k2,2n > markers.txt
bcftools query -l ce_norm.vcf.gz |\\
sort > sorted_samples.txt
bcftools view -v snps -S sorted_samples.txt -R markers.txt ce_norm.vcf.gz -Oz -o PCA.vcf.gz
tabix -p vcf PCA.vcf.gz
bcftools view -v snps -S sorted_samples.txt -R markers.txt ce_norm.vcf.gz |\\
bcftools query -f '%CHROM\\t%CHROM:%POS\\t%cM\\t%POS\\t%REF\\t%ALT\\n' |\\
sed 's/^III/3/g' |\\
sed 's/^II/2/g' |\\
sed 's/^IV/4/g' |\\
sed 's/^I/1/g' |\\
sed 's/^V/5/g' > eigenstrat_input.pedsnp
cut -f-6 -d' ' eigenstrat_input.ped |\\
awk '{print 1, \$2, \$3, \$3, \$5, 1}' > eigenstrat_input.pedind
echo "rerun"
"""

}


/*
------------ Run EIGENSTRAT without removing outlier strains
*/

process run_eigenstrat_no_outlier_removal {

publishDir "${params.output}/EIGESTRAT/LD_${test_ld}/NO_REMOVAL/", mode: 'copy'

conda '/projects/b1059/software/conda_envs/vcffixup'

input:
tuple file("eigenstrat_input.ped"), file("eigenstrat_input.pedsnp"), file("eigenstrat_input.pedind"), file("plink.prune.in"), \
file ("markers.txt"), file ("sorted_samples.txt"), file ("PCA.vcf.gz"), file ("PCA.vcf.gz.tbi"), val(test_ld), file(eigenparameters)

output:
tuple file("eigenstrat_no_removal.evac"), file("eigenstrat_no_removal.eval"), file("logfile_no_removal.txt"), \
file("eigenstrat_no_removal_relatedness"), file("eigenstrat_no_removal_relatedness.id"), file("TracyWidom_statistics_no_removal.tsv")


"""
smartpca -p ${eigenparameters} > logfile_no_removal.txt
sed -n -e '/Tracy/,\$p' logfile_no_removal.txt |\
sed -e '/kurt/,\$d' |\
awk '\$0 !~ "##" && \$0 !~ "#" {print}' |\
sed -e "s/[[:space:]]\\+/ /g" |\
sed 's/^ //g' |\
awk 'BEGIN{print "N", "eigenvalue", "difference", "twstat", "p-value", "effect.n"}; {print}' OFS="\\t" |\
awk -F" " '\$1=\$1' OFS="\\t" > TracyWidom_statistics_no_removal.tsv
"""

}

/*
------------ Run EIGENSTRAT with removing outlier strains
*/

process run_eigenstrat_with_outlier_removal {

conda '/projects/b1059/software/conda_envs/vcffixup'

publishDir "${params.output}/EIGESTRAT/LD_${test_ld}/OUTLIER_REMOVAL/", mode: 'copy'

input:
tuple file("eigenstrat_input.ped"), file("eigenstrat_input.pedsnp"), file("eigenstrat_input.pedind"), file("plink.prune.in"), \
file ("markers.txt"), file ("sorted_samples.txt"), file ("PCA.vcf.gz"), file ("PCA.vcf.gz.tbi"), val(test_ld), file(eigenparameters)

output:
tuple file("eigenstrat_outliers_removed.evac"), file("eigenstrat_outliers_removed.eval"), file("logfile_outlier.txt"), \
file("eigenstrat_outliers_removed_relatedness"), file("eigenstrat_outliers_removed_relatedness.id"), file("TracyWidom_statistics_outlier_removal.tsv")


"""
smartpca -p ${eigenparameters} > logfile_outlier.txt
sed -n -e '/Tracy/,\$p' logfile_outlier.txt |\
sed -e '/kurt/,\$d' |\
awk '\$0 !~ "##" && \$0 !~ "#" {print}' |\
sed -e "s/[[:space:]]\\+/ /g" |\
sed 's/^ //g' |\
awk 'BEGIN{print "N", "eigenvalue", "difference", "twstat", "p-value", "effect.n"}; {print}' OFS="\\t" |\
awk -F" " '\$1=\$1' OFS="\\t" > TracyWidom_statistics_outlier_removal.tsv
"""

}


/*
------------ Run HTML report for PCA analysis
*/

process HTML_report_PCA {

conda '/projects/b1059/software/conda_envs/cegwas2-nf_env'

publishDir "${params.output}/", mode: 'copy'


input:
tuple file("eigenstrat_no_removal.evac"), file("eigenstrat_no_removal.eval"), file("logfile_no_removal.txt"), \
file("eigenstrat_no_removal_relatedness"), file("eigenstrat_no_removal_relatedness.id"), file("TracyWidom_statistics_no_removal.tsv"), \
file("eigenstrat_outliers_removed.evac"), file("eigenstrat_outliers_removed.eval"), file("logfile_outlier.txt"), \
file("eigenstrat_outliers_removed_relatedness"), file("eigenstrat_outliers_removed_relatedness.id"), file("TracyWidom_statistics_outlier_removal.tsv")


output:
tuple file("pca*.Rmd"), file("*.html")


"""
cp ${workflow.projectDir}/bin/pca*.Rmd .
echo ".libPaths(c(\\"${params.R_libpath}\\", .libPaths() ))" > .Rprofile
Rscript -e "rmarkdown::render('pca_report.Rmd', knit_root_dir='${workflow.launchDir}/${params.output}')"
"""


}


Loading

0 comments on commit b02dfba

Please sign in to comment.