Merge pull request #7 from AndersenLab/add_docker

Add docker
AndersenLab · Mar 7, 2022 · eae9267 · eae9267
2 parents 3978f61 + 205be40
commit eae9267
Show file tree

Hide file tree

Showing 16 changed files with 361 additions and 51 deletions.
diff --git a/.github/workflows/build_docker.yml b/.github/workflows/build_docker.yml
@@ -0,0 +1,33 @@
+# Build dockerfile on change
+name: Build Docker (env/concordance.Dockerfile)
+
+on:
+  push:
+    paths:
+    - 'env/concordance.Dockerfile'
+    - '.github/workflows/build_docker.yml'
+    - 'env/conda.yml'
+  pull_request:
+    paths:
+    - 'env/concordance.Dockerfile'
+    - '.github/workflows/build_docker.yml'
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+
+    # Build Tools
+    - name: Build and Publish
+      uses: elgohr/Publish-Docker-Github-Action@master
+      with:
+        name: andersenlab/concordance
+        tag: "${{ steps.current-time.formattedTime }}"
+        username: ${{ secrets.KSE_DOCKER_USER }}
+        password: ${{ secrets.KSE_DOCKER_PASS }}
+        snapshot: true
+        dockerfile: concordance.Dockerfile
+        workdir: "env"
+        tags: "latest"
+        cache: true
diff --git a/.gitignore b/.gitignore
@@ -9,7 +9,6 @@ __pycache__/
 
 # Distribution / packaging
 .Python
-env/
 build/
 develop-eggs/
 dist/
@@ -81,7 +80,7 @@ celerybeat-schedule
 
 # virtualenv
 venv/
-ENV/
+
 
 # Spyder project settings
 .spyderproject

diff --git a/bin/concordance_report.Rmd b/bin/concordance_report.Rmd
@@ -0,0 +1,233 @@
+---
+date: "`r Sys.Date()`"
+output: 
+  html_document: 
+    theme: yeti
+editor_options: 
+  chunk_output_type: console
+---
+
+
+```{r setup, echo=F, warning=FALSE, message=FALSE}
+library(tidyverse)
+library(knitr)
+library(DT)
+library(plotly)
+library(glue)
+library(patchwork)
+library(ComplexHeatmap)
+library(circlize)
+
+opts_chunk$set(echo=FALSE, warning=FALSE, message=FALSE, print=FALSE, verbose=TRUE)
+
+presentation <- theme(axis.text.x = element_text(size=10, face="bold", color="black"),
+                      axis.text.y = element_text(size=10, face="bold", color="black"),
+                      axis.title.x = element_text(size=14, face="bold", color="black", vjust=-1),
+                      axis.title.y = element_text(size=14, face="bold", color="black", vjust=2),
+                      strip.text.x = element_text(size=12, face="bold", color="black"),
+                      strip.text.y = element_text(size=12, face="bold", color="black"),
+                      panel.spacing = unit(0.50, "lines"),
+                      plot.margin  = unit(c(0.8,0.8,0.8,0.8), "cm"),
+                      plot.title   = element_text(size=24, face="bold",vjust=2),
+                      legend.position="none")
+
+wd <- "~/Dropbox/AndersenLab/CeNDR/Data_processing/20210121/all_reports/concordance/"
+
+
+```
+
+
+```{r}
+isotype <- read.delim(glue::glue("{wd}/20210115_isotype_assignment.tsv"), stringsAsFactors=FALSE)
+
+## this skips the 20201015 release, so previous release was 20200815 with 399 isotypes
+prev_isotype <- isotype %>%
+    dplyr::filter(strain == "ECA1206" | release != 20210121) # manually add ECA1206 to "old" isotypes because it is the clean version of an old isotype ECA2678
+new_strain <- isotype %>%
+    dplyr::filter(release == 20210121, strain != "ECA1206") 
+new_isotype <- new_strain %>%
+    dplyr::filter(!isotype %in% prev_isotype$isotype)
+
+# old isotype
+old_iso <- readr::read_tsv("~/Dropbox/AndersenLab/CeNDR/Data_processing/20200815/all_reports/concordance/20200810_isotype_assignment.tsv")
+
+p_i <- nrow(dplyr::select(prev_isotype, isotype) %>% unique)
+p_s <- nrow(dplyr::select(prev_isotype, strain))
+
+n_i <- nrow(dplyr::select(new_isotype, isotype) %>% unique)
+n_s <- nrow(dplyr::select(new_strain, strain))
+
+
+```
+
+## Overview
+
+Concordance analysis allows us to group strains that are genetically almost identical into an isotype. The following table summarizes the number of isotypes from previous and current releases, and the number of strains that belong to those isotypes. 
+
+|                 | __Isotypes__ | __Strains Included__ | __Strains with WGS data__ | __Strains with RAD-seq data__ |
+|-----------------|----------|---------|----------|---------|
+| __Isotypes from Previous Release__ | `r p_i`* | `r p_s  + 140` | `r p_s` | 140 |
+| __New Isotypes from Current Release__    | `r n_i` | `r n_s` | `r n_s` | 0 |
+| __Total__        | `r sum(p_i, n_i)` | `r sum(p_s, n_s, 140)` | `r sum(p_s, n_s)` | 140 |
+*Four strains were reduced to a single isotype group so this number was reduced from 403 to 400 (see below for details)
+
+<br>
+
+## Concordance score distribution and cutoff
+
+We examined the pairwise concordance scores of all strains. Concordance values for every pair of strains were calculated as the number of shared variant sites divided by the total number of variants called for each pair. If the concordance score was more than 0.9997, the strain pair is grouped into the same isotype.
+
+```{r load, fig.width=10, fig.height=4}
+cutoff <- 0.9997
+
+gtcheck <- readr::read_tsv(glue::glue("{wd}/gtcheck.tsv")) %>%
+    dplyr::mutate(concordance = 1-(discordance/sites)) %>%
+    dplyr::mutate(isotype = concordance > cutoff) 
+
+p1 <- ggplot(gtcheck) +
+  geom_histogram(aes(x=concordance, fill = isotype), binwidth = 0.00025) +
+  scale_fill_manual(values = c("#808080", "#0080FF")) + 
+    labs(x = "Concordance", y = "Number of Comparisons") +
+    theme_bw() +
+    presentation 
+
+
+p2 <- ggplot() +
+    geom_histogram(data=gtcheck, aes(x=concordance, fill = isotype), binwidth = 0.000025) +     
+    geom_rect(aes(xmax=1.0, xmin=cutoff, ymin=-Inf, ymax=Inf), fill="blue", alpha=0.2) +
+    geom_text(aes(x = Inf, y = Inf, label="the same isotype"),
+              vjust = 3, hjust = 1.4, colour="blue", size = 5) + 
+    scale_fill_manual(values = c("#808080", "#0080FF")) +
+    scale_x_continuous(limits = c(0.999, 1.0)) +
+    labs(x = "Concordance", y = "Number of Comparisons") +
+    theme_bw() +
+    presentation 
+
+p1+p2
+```
+
+<br>
+
+## Search for concordance for strain pairs
+
+Strain comparisons are listed in the table below. Only concordance scores > 0.999 are shown.
+
+```{r}
+
+# edit ECA2649 group by hand
+gtcheck %>%
+    dplyr::select(i, j, sites, discordance, concordance, isotype) %>% 
+    dplyr::filter(concordance > 0.999) %>% 
+    dplyr::mutate(isotype = ifelse(i == "ECA2649" & j %in% paste0("ECA", c(2657, 2608, 2585, 2672, 2552, 2548)), FALSE, isotype),
+                  isotype = ifelse(j == "ECA2649" & j %in% paste0("ECA", c(2657, 2608, 2585, 2672, 2552, 2548)), FALSE, isotype)) %>% 
+    DT::datatable(colnames = c("Strain", "Strain", "Total variants", "Diff variants", "Concordance", "Is isotype"),
+              rownames = F,
+              options = list(pageLength = 5, columnDefs = list(list(targets = c(2, 3, 4, 5), searchable = FALSE)))) # scroller view may change how filter look. be careful
+```
+
+<br>
+
+* ECA2649 had high concordance to two distinct isotype groups. However, upon investigating the relationships to each group, we chose to manually place ECA2649 with isotype ECA2551, not ECA2672.
+
+```{r, fig.width = 8, fig.height = 6}
+group_ECA2649 <- filter(isotype, isotype %in% c("ECA2551","ECA2672"))
+    
+df1 <- gtcheck %>%
+    dplyr::select(i, j, concordance) %>% 
+    dplyr::filter(i %in% group_ECA2649$strain & j %in% group_ECA2649$strain)
+
+df2 <- df1 %>% 
+    dplyr::rename(i=j, j=i)
+
+df <- dplyr::bind_rows(df1, df2) %>% 
+    unique() %>% 
+    tidyr::spread(j, concordance) 
+
+# make heatmap of concordance
+m = dplyr::select(df, -i) %>% 
+    as.matrix()
+rownames(m) = df$i
+
+col_fun = colorRamp2(c(min(m, na.rm=T), max(m, na.rm=T)), c("royalblue1", "indianred1"))
+
+ComplexHeatmap::Heatmap(m, col=col_fun, rect_gp = gpar(col = "white", lwd = 2), cell_fun = function(j, i, x, y, width, height, fill) {
+        grid.text(sprintf("%.4f", m[i, j]), x, y, gp = gpar(fontsize = 8)) 
+})
+```
+
+
+## Changes from previous release
+
+<br>
+
+* This release used only SNVs for isotype assignment.
+
+* Four strains (ECA2677, ECA2678, ECA2679, and ECA2686) were removed because they were frozen as "dirty" strains and have now been cleaned, frozen, and re-sequenced. Because these four strains were isotype reference strains, a new isotype reference strain was assigned. It appears that the other six strains in these isotype groups changed isotypes, but they remain in the same group as before with a new, clean isotype reference strain. Details can be found below.
+
+<br>
+
+| __Dirty Strain (Old)__ | __Clean Strain (New)__ | __Previous Isotype__ | __New Isotype__ | __Other Strains in Isotype Group__ |
+|-----------------|----------|---------|----------|---------|
+| ECA2677 | ECA1202 | ECA2677 | ECA1202 | ECA1201 |
+| ECA2678 | ECA1206 | ECA2678 | ECA1206 | ECA1973, ECA1979, ECA1983 |
+| ECA2679 | ECA1211 | ECA2679 | ECA1212 | ECA1209, ECA1211 |
+| ECA2686 | NA* | ECA2686 | ECA1243 | NA |
+*Clean strain for ECA2686 is ECA2803 but has not been sequenced yet.
+
+
+```{r}
+
+## no need to run heatmap on these strains
+
+# we also removed ECA1719 because it was a dirty strain, but this was not a previous strain so no need to report. Also ECA2796 is "N2" (npr-1 check)
+
+```
+
+<br>
+
+* Strains ECA1465, ECA1467, ECA1493, ECA1515 were each their own isotype in 20200815 release. They were grouped into the same isotype in this release, which resulted in a reduce of count of previous isotypes from 403 to 400. Below is their pairwise concordance value in this release (top) and in 20200815 release (bottom).
+
+<br>
+
+```{r, fig.height=4}
+group_ECA1493 <- filter(isotype, isotype %in% c("ECA1465", "ECA1467", "ECA1493", "ECA1515"))
+    
+df1 <- select(gtcheck, i, j, concordance) %>% filter(i %in% group_ECA1493$strain & j %in% group_ECA1493$strain)
+
+df2 <- df1 %>% rename(i=j, j=i)
+
+df = bind_rows(df1, df2) %>% unique() %>% spread(j, concordance) 
+
+# make heatmap of concordance
+m = select(df, -i) %>% as.matrix()
+rownames(m) = df$i
+
+col_fun = colorRamp2(c(min(m, na.rm=T), max(m, na.rm=T)), c("royalblue1", "indianred1"))
+
+Heatmap(m, col=col_fun, rect_gp = gpar(col = "white", lwd = 2), cell_fun = function(j, i, x, y, width, height, fill) {
+        grid.text(sprintf("%.4f", m[i, j]), x, y, gp = gpar(fontsize = 10)) 
+})
+```
+
+<br>
+
+```{r, fig.height=4}
+gtcheck_old <- readr::read_tsv("~/Dropbox/AndersenLab/CeNDR/Data_processing/20200815/4.concordance/gtcheck.tsv") %>%
+    dplyr::mutate(concordance = 1-(discordance/sites))
+
+df1 <- select(gtcheck_old, i, j, concordance) %>% filter(i %in% group_ECA1493$strain & j %in% group_ECA1493$strain)
+
+df2 <- df1 %>% rename(i=j, j=i)
+
+df = bind_rows(df1, df2) %>% unique() %>% spread(j, concordance) 
+
+# make heatmap of concordance
+m = select(df, -i) %>% as.matrix()
+rownames(m) = df$i
+
+col_fun = colorRamp2(c(min(m, na.rm=T), max(m, na.rm=T)), c("yellow", "indianred1"))
+
+Heatmap(m, col=col_fun, rect_gp = gpar(col = "white", lwd = 2), cell_fun = function(j, i, x, y, width, height, fill) {
+        grid.text(sprintf("%.4f", m[i, j]), x, y, gp = gpar(fontsize = 10)) 
+})
+```
diff --git a/bin/fq_concordance.R b/bin/fq_concordance.R
@@ -1,4 +1,7 @@
-library(tidyverse)
+# library(tidyverse)
+library(dplyr)
+library(readr)
+library(tidyr)
 
 gt_dict = list("0/0" = 0, "1/1" = 1)
 

diff --git a/bin/merge_groups_info.R b/bin/merge_groups_info.R
@@ -1,5 +1,8 @@
 #!/usr/bin/env Rscript
-library(tidyverse)
+# library(tidyverse)
+library(readr)
+library(dplyr)
+library(tidyr)
 # Read data from Rscript input
 args <- commandArgs(trailingOnly=TRUE)
 

diff --git a/bin/plot_pairwise.R b/bin/plot_pairwise.R
@@ -1,4 +1,7 @@
-library(tidyverse)
+# library(tidyverse)
+library(dplyr)
+library(readr)
+library(stringr)
 library(ggplot2)
 
 args <- commandArgs(trailingOnly=TRUE)

diff --git a/bin/process_concordance.R b/bin/process_concordance.R
@@ -1,6 +1,9 @@
 #!/usr/bin/env Rscript
 library(ggplot2)
-library(tidyverse)
+library(dplyr)
+library(readr)
+library(tidyr)
+# library(tidyverse)
 
 # Used in calculating isotypes
 stack_list <- function(x) {

diff --git a/bin/process_concordance_report.Rmd b/bin/process_concordance_report.Rmd
@@ -18,7 +18,11 @@ output: html_document
 ```{r global_options, include=FALSE}
 knitr::opts_chunk$set(warning=FALSE, message=FALSE, echo=FALSE)
 
-library(tidyverse)
+# library(tidyverse)
+library(ggplot2)
+library(dplyr)
+library(readr)
+library(tidyr)
 library(magrittr)
 library(DT)
 library(plotly)

diff --git a/bin/process_strain_pairwise.R b/bin/process_strain_pairwise.R
@@ -1,6 +1,9 @@
 #!/usr/bin/env Rscript
 library(ggplot2)
-library(tidyverse)
+library(readr)
+library(dplyr)
+library(tidyr)
+# library(tidyverse)
 # Read data from Rscript input
 args <- commandArgs(trailingOnly=TRUE)
 

diff --git a/bin/process_trees.R b/bin/process_trees.R
@@ -1,7 +1,9 @@
 library(ape)
 library(ggmap)
 library(phyloseq)
-library(tidyverse)
+library(dplyr)
+library(ggplot2)
+# library(tidyverse)
 
 contig = commandArgs(trailingOnly=TRUE)[[1]]
 tree <- ape::read.tree(paste0(contig,".tree"))

diff --git a/conf/quest.config b/conf/quest.config
@@ -2,11 +2,11 @@
 // Configuration for Quest (slurm)
 
 
-process {
+// process {
 
-    conda = "/projects/b1059/software/conda_envs/concordance-nf_env"
-    module = 'R/3.6.0'
-}
+//     conda = "/projects/b1059/software/conda_envs/concordance-nf_env"
+//     module = 'R/3.6.0'
+// }
 
 params {