update plots

AndersenLab · Jan 4, 2022 · b02dfba · b02dfba
1 parent 86a17d5
commit b02dfba
Show file tree

Hide file tree

Showing 5 changed files with 297 additions and 48 deletions.
diff --git a/bin/pca_template.Rmd b/bin/pca_template.Rmd
@@ -13,7 +13,9 @@ tracy <- data.table::fread(glue::glue("EIGESTRAT/{ld}/NO_REMOVAL/TracyWidom_stat
                 cumsum = cumsum(VarExp),
                 N = paste0("PC", N))
 
-DT::datatable(tracy[1:10,])
+# print(DT::datatable(tracy[1:10,]))
+tracy[1:10,]
+
 ```
 
 
@@ -42,7 +44,8 @@ tracy <- data.table::fread(glue::glue("EIGESTRAT/{ld}/OUTLIER_REMOVAL/TracyWidom
                 cumsum = cumsum(VarExp),
                 N = paste0("PC", N))
 
-DT::datatable(tracy[1:10,])
+# print(DT::datatable(tracy[1:10,]))
+tracy[1:10,]
 
 ```
 

diff --git a/bin/pca_template_template2.Rmd b/bin/pca_template_template2.Rmd
@@ -1,7 +1,10 @@
 ```{r, fig.height = 10, fig.width = 10}
+
 pcs <- data.frame(first = c(rep("PC1", 5), rep("PC2", 4), rep("PC3", 3), rep("PC4", 2), rep("PC5", 1)),
                   second = c(2,3,4,5,6,3,4,5,6,4,5,6,5,6, 6)) %>%
-  dplyr::mutate(second = paste0("PC", second))
+  dplyr::mutate(second = paste0("PC", second)) %>%
+  dplyr::mutate(first = as.character(first),
+                second = as.character(second))
 
 # highlight divergent strains
 if(!is.null(dstrains)) {
@@ -46,18 +49,17 @@ cowplot::plot_grid(plotlist = plots2, leg)
   nodiv <- no_removal
   # don't highight strains
   plots2 <- lapply(seq(1:nrow(pcs)), FUN = function(i) {
-    xpc <- pcs$first[i]
-    ypc <- pcs$second[i]
-    print(xpc)
+    xpc <- as.character(pcs$first[i])
+    ypc <- as.character(pcs$second[i])
     
     ggplot() +
       geom_point(data = nodiv, shape=21, alpha=0.8, size=2, aes(x=get(xpc),y=get(ypc)), fill = "grey70")+
       theme_bw() +
-      theme(axis.title = element_text(size=11, color = "black"), 
+      theme(axis.title = element_text(size=11, color = "black"),
             axis.text = element_text(size=10, color = "black"),
             panel.grid = element_blank(),
             legend.position = "none")+
-    labs(x = glue::glue("{xpc} ({round(tracy$VarExp[which(tracy$N == {xpc})]*100, digits = 2)}%)"), 
+    labs(x = glue::glue("{xpc} ({round(tracy$VarExp[which(tracy$N == {xpc})]*100, digits = 2)}%)"),
          y = glue::glue("{ypc} ({round(tracy$VarExp[which(tracy$N == {ypc})]*100, digits = 2)}%)"))
   })
   cowplot::plot_grid(plotlist = plots2)

diff --git a/main.nf b/main.nf
@@ -45,6 +45,7 @@ params.vcfanno_config = "${workflow.projectDir}/input_files/ANNOTATION_conf.toml
 params.eigen_par_outlier_removal = "${workflow.projectDir}/bin/eigpar"
 params.eigen_par_no_removal = "${workflow.projectDir}/bin/eigpar_no_removal"
 params.R_libpath = "/projects/b1059/software/R_lib_3.6.0"
+params.snps = '--snps-only'
 
 
 // Note that params.species is set in the config to be c_elegans (default)
@@ -127,7 +128,7 @@ if (params.help) {
 }
 
 // import the pca module
-include {extract_ancestor_bed; annotate_small_vcf; vcf_to_ped; vcf_to_eigstrat_files; run_eigenstrat_no_outlier_removal; run_eigenstrat_with_outlier_removal; HTML_report_PCA} from './modules/pca.nf'
+include {extract_ancestor_bed; annotate_small_vcf; vcf_to_eigstrat_files; run_eigenstrat_no_outlier_removal; run_eigenstrat_with_outlier_removal; HTML_report_PCA} from './modules/pca.nf'
 
 
 workflow { 
@@ -219,8 +220,8 @@ workflow {
 
         // run html report
         // not functional quite yet...
-        // run_eigenstrat_no_outlier_removal.out
-        //     .combine(run_eigenstrat_with_outlier_removal.out) | HTML_report_PCA
+         run_eigenstrat_no_outlier_removal.out
+             .combine(run_eigenstrat_with_outlier_removal.out) | HTML_report_PCA
     }
 
 }

diff --git a/modules/pca copy.nf b/modules/pca copy.nf
@@ -0,0 +1,275 @@
+/*
+==================================
+~ > *                        * < ~
+~ ~ > *                    * < ~ ~
+~ ~ ~ > *  ANNOTATE VCF  * < ~ ~ ~
+~ ~ > *                    * < ~ ~
+~ > *                        * < ~
+==================================
+*/
+
+
+/*
+------------ Extract ancestor strain from the VCF and make bed file for annotations 
+*/
+
+process extract_ancestor_bed {
+
+    publishDir "${params.output}/ANNOTATE_VCF", mode: 'copy'
+
+    cpus 1
+
+    input:
+      tuple file(vcf), file(vcfindex)
+
+    output:
+      tuple file("ANC.bed.gz"), file("ANC.bed.gz.tbi")
+
+      """
+        bcftools query --samples ${params.anc} -f '%CHROM\\t%POS\\t%END\\t[%TGT]\\n' ${vcf} |\\
+        awk -F"/" '\$1=\$1' OFS="\\t" |\\
+        awk '{print \$1, \$2 = \$2 - 1, \$3, \$4}' OFS="\\t" |\\
+        bgzip > ANC.bed.gz
+
+        tabix ANC.bed.gz
+        echo "ANCESTOR DONE"
+      """
+}
+
+/*
+------------ Annotate small variant VCF  
+*/
+
+process annotate_small_vcf {
+
+    publishDir "${params.output}/ANNOTATE_VCF", mode: 'copy'
+
+    conda '/projects/b1059/software/conda_envs/vcffixup'
+
+    cpus 1
+
+    input:
+      tuple file(vcf), file(vcfindex), file("ANC.bed.gz"), file("ANC.bed.gz.tbi"), file(pop) //, val(pop), val(maf), val(sm)
+
+    output:
+      tuple file("Ce330_annotated.vcf.gz"), file("Ce330_annotated.vcf.gz.tbi")
+
+
+      """
+        # get vcfanno files
+        cp ${workflow.projectDir}/input_files/annotations/${params.species}/* .
+        cat ${params.vcfanno_config} | sed 's/species/${params.species}/' > anno_config.toml
+
+        bcftools view -S ${pop} ${vcf} -Oz -o population-filtered.vcf.gz 
+
+        vcfanno anno_config.toml population-filtered.vcf.gz |\\
+        awk '\$0 ~ "#" || \$0 !~ "Masked" {print}' |\\
+        vcffixup - |\\
+        bcftools filter -i N_MISSING=0 -Oz -o Ce330_annotated.vcf.gz
+
+        tabix -p vcf Ce330_annotated.vcf.gz
+      """
+}
+
+
+
+/*
+------------ Prune VCF, Generate PLINK files
+*/
+
+process vcf_to_ped {
+
+  tag {"PRUNING VCF FOR ADMIXTURE"}
+
+  publishDir "${params.output}/ADMIXTURE/PLINK/", mode: 'copy'
+
+  conda '/projects/b1059/software/conda_envs/vcffixup'
+
+
+  input:
+    tuple file(vcf), file(vcfindex), val(nSM), val(maf), val(samples), val(ld)
+
+  output:
+    tuple file("ce_norm.vcf.gz"), file("ce_norm.vcf.gz.tbi"), val(nSM), val(maf), val(samples), val(ld), file("*.map"), file("*.ped"), file("plink.prune.in")
+
+    """
+    bcftools norm -m +snps ${vcf} -Oz -o ce_norm.vcf.gz
+    tabix -p vcf ce_norm.vcf.gz
+
+    plink --vcf ce_norm.vcf.gz --snps-only --biallelic-only --maf ${maf} --set-missing-var-ids @:# --indep-pairwise 50 10 ${ld} --allow-extra-chr 
+    
+    plink --vcf ce_norm.vcf.gz --snps-only --biallelic-only --maf ${maf} --set-missing-var-ids @:# --extract plink.prune.in --geno --recode12 --out LD_${ld}_MAF_${maf} --allow-extra-chr 
+    """
+
+}
+
+
+
+/*
+======================================
+~ > *                            * < ~
+~ ~ > *                        * < ~ ~
+~ ~ ~ > *  Run PCA and DAPC  * < ~ ~ ~
+~ ~ > *                        * < ~ ~
+~ > *                            * < ~
+======================================
+*/
+
+/*
+------------ Prepare files for EIGENSTRAT
+*/
+
+process vcf_to_eigstrat_files {
+
+  tag {"PREPARE EIGENSTRAT FILES"}
+
+  conda '/projects/b1059/software/conda_envs/vcffixup'
+
+  publishDir "${params.output}/EIGESTRAT/LD_${test_ld}/INPUTFILES", mode: 'copy'
+
+  input:
+    tuple file(vcf), file(vcfindex), val("test_ld")
+
+  output:
+    tuple file("eigenstrat_input.ped"), file("eigenstrat_input.pedsnp"), file("eigenstrat_input.pedind"), file("plink.prune.in"), \
+    file ("markers.txt"), file ("sorted_samples.txt"), file ("PCA.vcf.gz"), file ("PCA.vcf.gz.tbi"), val(test_ld)
+
+
+    """
+
+    bcftools view --regions I,II,III,IV,V,X ${vcf} |\\
+    bcftools norm -m +snps -Oz -o ce_norm.vcf.gz
+
+    tabix -p vcf ce_norm.vcf.gz
+
+    plink --vcf ce_norm.vcf.gz --snps-only --biallelic-only --set-missing-var-ids @:# --indep-pairwise 50 10 ${test_ld} --allow-extra-chr 
+
+    plink --vcf ce_norm.vcf.gz --snps-only --biallelic-only --set-missing-var-ids @:# --extract plink.prune.in --geno --recode12 --out eigenstrat_input --allow-extra-chr
+
+    awk -F":" '\$1=\$1' OFS="\\t" plink.prune.in | \\
+    sort -k1,1d -k2,2n > markers.txt
+
+    bcftools query -l ce_norm.vcf.gz |\\
+    sort > sorted_samples.txt 
+
+    bcftools view -v snps -S sorted_samples.txt -R markers.txt ce_norm.vcf.gz -Oz -o PCA.vcf.gz
+    
+    tabix -p vcf PCA.vcf.gz
+
+    bcftools view -v snps -S sorted_samples.txt -R markers.txt ce_norm.vcf.gz |\\
+    bcftools query -f '%CHROM\\t%CHROM:%POS\\t%cM\\t%POS\\t%REF\\t%ALT\\n' |\\
+    sed 's/^III/3/g' |\\
+    sed 's/^II/2/g' |\\
+    sed 's/^IV/4/g' |\\
+    sed 's/^I/1/g' |\\
+    sed 's/^V/5/g' > eigenstrat_input.pedsnp      
+
+    cut -f-6 -d' ' eigenstrat_input.ped |\\
+    awk '{print 1, \$2, \$3, \$3, \$5, 1}'  > eigenstrat_input.pedind
+
+    echo "rerun"
+    """
+
+}
+
+
+/*
+------------ Run EIGENSTRAT without removing outlier strains
+*/
+
+process run_eigenstrat_no_outlier_removal {
+
+  publishDir "${params.output}/EIGESTRAT/LD_${test_ld}/NO_REMOVAL/", mode: 'copy'
+
+  conda '/projects/b1059/software/conda_envs/vcffixup'
+
+  input:
+    tuple file("eigenstrat_input.ped"), file("eigenstrat_input.pedsnp"), file("eigenstrat_input.pedind"), file("plink.prune.in"), \
+    file ("markers.txt"), file ("sorted_samples.txt"), file ("PCA.vcf.gz"), file ("PCA.vcf.gz.tbi"), val(test_ld), file(eigenparameters)
+
+  output:
+    tuple file("eigenstrat_no_removal.evac"), file("eigenstrat_no_removal.eval"), file("logfile_no_removal.txt"), \
+    file("eigenstrat_no_removal_relatedness"), file("eigenstrat_no_removal_relatedness.id"), file("TracyWidom_statistics_no_removal.tsv")
+
+
+    """
+
+    smartpca -p ${eigenparameters} > logfile_no_removal.txt
+
+    sed -n -e '/Tracy/,\$p' logfile_no_removal.txt |\
+    sed -e '/kurt/,\$d' |\
+    awk '\$0 !~ "##" && \$0 !~ "#" {print}' |\
+    sed -e "s/[[:space:]]\\+/ /g" |\
+    sed 's/^ //g' |\
+    awk 'BEGIN{print "N", "eigenvalue", "difference", "twstat", "p-value", "effect.n"}; {print}' OFS="\\t" |\
+    awk -F" " '\$1=\$1' OFS="\\t" > TracyWidom_statistics_no_removal.tsv
+    """
+
+}
+
+/*
+------------ Run EIGENSTRAT with removing outlier strains
+*/
+
+process run_eigenstrat_with_outlier_removal {
+
+  conda '/projects/b1059/software/conda_envs/vcffixup'
+
+  publishDir "${params.output}/EIGESTRAT/LD_${test_ld}/OUTLIER_REMOVAL/", mode: 'copy'
+
+  input:
+    tuple file("eigenstrat_input.ped"), file("eigenstrat_input.pedsnp"), file("eigenstrat_input.pedind"), file("plink.prune.in"), \
+    file ("markers.txt"), file ("sorted_samples.txt"), file ("PCA.vcf.gz"), file ("PCA.vcf.gz.tbi"), val(test_ld), file(eigenparameters)
+
+  output:
+    tuple file("eigenstrat_outliers_removed.evac"), file("eigenstrat_outliers_removed.eval"), file("logfile_outlier.txt"), \
+    file("eigenstrat_outliers_removed_relatedness"), file("eigenstrat_outliers_removed_relatedness.id"), file("TracyWidom_statistics_outlier_removal.tsv")
+
+
+    """
+    smartpca -p ${eigenparameters} > logfile_outlier.txt
+
+    sed -n -e '/Tracy/,\$p' logfile_outlier.txt |\
+    sed -e '/kurt/,\$d' |\
+    awk '\$0 !~ "##" && \$0 !~ "#" {print}' |\
+    sed -e "s/[[:space:]]\\+/ /g" |\
+    sed 's/^ //g' |\
+    awk 'BEGIN{print "N", "eigenvalue", "difference", "twstat", "p-value", "effect.n"}; {print}' OFS="\\t" |\
+    awk -F" " '\$1=\$1' OFS="\\t" > TracyWidom_statistics_outlier_removal.tsv
+    """
+
+}
+
+
+/*
+------------ Run HTML report for PCA analysis
+*/
+
+process HTML_report_PCA {
+
+  conda '/projects/b1059/software/conda_envs/cegwas2-nf_env'
+
+  publishDir "${params.output}/", mode: 'copy'
+
+
+  input:
+  tuple file("eigenstrat_no_removal.evac"), file("eigenstrat_no_removal.eval"), file("logfile_no_removal.txt"), \
+    file("eigenstrat_no_removal_relatedness"), file("eigenstrat_no_removal_relatedness.id"), file("TracyWidom_statistics_no_removal.tsv"), \
+    file("eigenstrat_outliers_removed.evac"), file("eigenstrat_outliers_removed.eval"), file("logfile_outlier.txt"), \
+    file("eigenstrat_outliers_removed_relatedness"), file("eigenstrat_outliers_removed_relatedness.id"), file("TracyWidom_statistics_outlier_removal.tsv")
+
+
+  output:
+   tuple file("pca*.Rmd"), file("*.html")
+
+
+  """
+  cp ${workflow.projectDir}/bin/pca*.Rmd . 
+  echo ".libPaths(c(\\"${params.R_libpath}\\", .libPaths() ))" > .Rprofile
+  Rscript -e "rmarkdown::render('pca_report.Rmd', knit_root_dir='${workflow.launchDir}/${params.output}')"
+  """
+
+
+}
+
+