diff --git a/quibl_analysis/quibl_analysis_solenopsis_50samples.Rmd b/quibl_analysis/quibl_analysis_solenopsis_50samples.Rmd new file mode 100644 index 0000000..e3cebaa --- /dev/null +++ b/quibl_analysis/quibl_analysis_solenopsis_50samples.Rmd @@ -0,0 +1,525 @@ +--- +title: "`QuIBL` introgression analysis of _Solenopsis_ species (50 samples)" +author: "Federico López-Osorio" +date: '`r Sys.Date()`' +output: + html_document: + toc: yes + pdf_document: + fig_caption: yes + toc: yes +editor_options: + chunk_output_type: console +geometry: margin = 1cm +--- + +```{r setup, echo = FALSE} +knitr::opts_chunk$set( + echo = TRUE, + message = FALSE, + warning = FALSE, + cache.lazy = TRUE, + include = TRUE, + out.height = "\textheight", + out.width = "\textwidth" +) +``` + +# Load libraries +```{r} +load_cran_pkgs <- function(pkg) { + new_pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])] + if (length(new_pkg)) { + install.packages(new_pkg, dependencies = TRUE) + } + sapply(pkg, require, character.only = TRUE) +} + +# Install packages available on CRAN +cran_pkgs <- c( + "devtools", "tidyverse", "RColorBrewer", "here", "styler", + "ggrepel", "magrittr", "ape", "hash", "ggpubr", "egg", "ggpubr" +) +load_cran_pkgs(cran_pkgs) + +# Install packages available on Bioconductor +load_bioconductor_pkgs <- function(pkg) { + new_pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])] + if (length(new_pkg)) { + BiocManager::install(new_pkg, version = "3.13") + } + sapply(pkg, require, character.only = TRUE) +} + +bioconductor_pkgs <- c("ggtree") + +load_bioconductor_pkgs(bioconductor_pkgs) + +# Install "quiblR" +devtools::install_github("nbedelman/quiblR") +library(quiblR) +``` + +# Set a custom `ggplot` theme +```{r} +# Set a custom ggplot theme +# devtools::install_github("cttobin/ggthemr") +library(ggthemr) +custom_palette <- c( + "#9E9E9E", "#59BDEF", "#EBCC2A", "#E1AF00", "#F27C8D", "#7B73D1", "#7B9FE0", + "#F5CC7F", "#66C2A5", "#28A7B6", "#F2CB57", "#F2A057", "#F2845C" +) +# ggthemr::colour_plot(custom_palette) + +custom_theme <- ggthemr::define_palette( + swatch = custom_palette, + gradient = c(lower = "#59BDEF", upper = "#FF6B5A") +) +ggthemr::ggthemr(custom_theme) +``` + +The analysis below follows the procedure documented in the `quiblR` package +(https://github.com/nbedelman/quiblR). + +# Load data +`quiblR` requires three input files: + +* A species tree +* QuIBL output +* A list of gene trees + +```{r} +species_tree <- quiblR::read_speciesTree( + here::here("data", "Astral.10SNP-genes.chr1-15.nwk") +) + +quibl_output <- quiblR::read_csv_quibl( + here::here("data", "chr16nr.50samples.quibl.out.csv") +) + +original_trees <- ape::read.tree( + here::here("data", "chr16nr.50samples.raxml.rooted.bestTree") +) + +length(original_trees) +# [1] 107 +``` + +# Replace underscores with dashes in sample names +```{r} +# get tip names from one of the subset trees +subset_tips <- c(original_trees[[1]]$tip.label) + +# subset the large species tree to include only the tips of interest +species_tree <- ape::keep.tip(species_tree, subset_tips) + +# replace underscores with dashes +# QuIBL and quiblR use underscores to separate triplets +species_tree$tip.label <- gsub( + "(SRR.+?)_", "\\1-", species_tree$tip.label +) + +quibl_output <- quibl_output %>% + dplyr::mutate_at(c("triplet", "outgroup"), + stringr::str_replace_all, + pattern = "(SRR.+?)_", replacement = "\\1-" + ) + +original_trees <- lapply(original_trees, function(x) { + x$tip.label <- gsub("(SRR.+?)_", "\\1-", x$tip.label) + return(x) +}) + +class(original_trees) <- "multiPhylo" + +# plot tree including the subset of tips +# ape::plot.phylo(species_tree) +pruned_tree <- ggtree(species_tree, + size = 1, color = "#767676", ladderize = FALSE +) + + ggtree::geom_tiplab(size = 4, offset = .1, color = "#767676") + + xlim(0, 28) + +pruned_tree + +ggsave(pruned_tree, + filename = "Astral.10SNP-genes.chr1-15.pruned50samples.pdf", + width = 8, + height = 8, + units = "in", + dpi = "retina" +) +``` + +# Get big-picture results +QuIBL assumes that we have a rooted species tree. In this case, we have rooted +the tree by fixing "gem-1-bigB-m-majorityallele" as the outgroup. + +QuIBL discards all triplets that include the overall outgroup, since all loci +are forced to have the same topology ("gem-1-bigB-m-majorityallele" as the +outgroup). + +We only accept the two-distribution model if "BIC2Dist" is at least 10 units +lower than "BIC1Dist" (`quiblR::testSignificance()`). + +```{r} +ape::is.rooted(species_tree) + +# species_tree <- root(species_tree, +# outgroup = "gem-1-bigB-m-majorityallele", resolve.root = TRUE +# ) + +# check tip labels +species_tree$tip.label %>% sort() +original_trees[[1]]$tip.label %>% sort() +quibl_output$outgroup %>% + sort() %>% + unique() + +# sanity check: compare tip labels +setdiff(species_tree$tip.label, original_trees[[1]]$tip.label) +setdiff(species_tree$tip.label, quibl_output$outgroup %>% sort() %>% unique()) + +# identify topologies concordant with the species tree and topologies which +# represent either ILS or introgression +# identify significant differences between the two models +total_trees <- sum(quibl_output$count) / length(unique(quibl_output$triplet)) + +quibl_output <- dplyr::mutate(quibl_output, + isDiscordant = as.integer(!apply(quibl_output, 1, + quiblR::isSpeciesTree, + sTree = species_tree + )), + isSignificant = as.integer(apply(quibl_output, 1, + quiblR::testSignificance, + threshold = 10 + )), + totalIntrogressionFraction = (mixprop2 * count * isDiscordant) / total_trees +) + +head(quibl_output) + +# save the results including discordant and significant triplets +quibl_output_sig <- quibl_output %>% + dplyr::filter(isDiscordant == 1 & isSignificant == 1) %>% + dplyr::arrange(desc(totalIntrogressionFraction)) + +quibl_output_sig$isSignificant %>% length() +# [1] 461 + +write.csv(quibl_output_sig, + file = here::here("chr16nr_50samples_quiblr_introgression_sig.csv"), + row.names = FALSE +) +``` + +# Get summary of results +`quiblR`'s `getIntrogressionSummary()` function returns a data frame with the +average introgression fraction for each pair of taxa (samples). +Specifically, the function goes triplet by triplet, ignores topologies +concordant with the species tree, records `mixprop2 * count / total trees`, +and averages that value across all tests that include each pair of species. + +```{r} +introgression_summary <- quiblR::getIntrogressionSummary( + quibl_output, species_tree +) + +introgression_summary <- introgression_summary %>% + dplyr::arrange(desc(value)) + +# save the introgression summary +write.csv(introgression_summary, + file = here::here("chr16nr_50samples_quiblr_introgression_summary.csv"), + row.names = FALSE +) + +head(introgression_summary) +``` + +# Make a heatmap of the average introgression fractions +```{r} +# plot heatmap of average introgression fractions + +# create a function to plot heatmaps of average introgression fractions +plot_heatmap <- function(df) { + ggplot(data = df, aes(x = tax1, y = tax2, fill = value)) + + geom_tile(color = "white") + + scale_fill_gradient2( + low = "#3B9AB2", high = "#F21A00", mid = "#EBCC2A", na.value = "grey50", + midpoint = max(introgression_summary$value) / 2, + limit = c(0, max(introgression_summary$value)), + name = "Average introgression fraction" + ) + + geom_abline(slope = 1, intercept = 0, color = "grey50") + + geom_vline( + xintercept = seq(1.5, nrow(introgression_summary) + 0.5, 1), + alpha = 0.6 + ) + + geom_hline( + yintercept = seq(1.5, nrow(introgression_summary) + 0.5, 1), + alpha = 0.6 + ) + + labs(x = "", y = "") + + # scale_x_discrete(position = "top") + + theme( + panel.grid = element_blank(), + # legend.position = "none", + legend.position = "top", + legend.title.align = 0.5, + legend.box = "vertical", + legend.margin = margin(), + text = element_text(color = "#767676"), + # legend.title = element_text(size = 12), + # legend.text = element_text(size = 10), + axis.text = element_text(color = "#767676"), + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1), + plot.margin = margin(6, 6, 0, 0) + ) +} + +summary_matrix <- plot_heatmap(introgression_summary) +summary_matrix + +species_tree_subset <- ggtree(quiblR::extractTripletTree( + species_tree, + unique(introgression_summary$tax1) +), +size = 1, color = "#767676", ladderize = FALSE, +layout = "rect", branch.length = "none" +) + + theme(plot.margin = margin(0, 0, 0, 0)) +# ggtree::geom_tiplab(size = 4, offset = .5, color = "#767676") + +# xlim(0, 32) +# species_tree_subset + +species_tree_subset_down <- ggtree(quiblR::extractTripletTree( + species_tree, + unique(introgression_summary$tax1) +), +size = 1, color = "#767676", ladderize = FALSE, +layout = "rect", branch.length = "none" +) + + coord_flip() + + theme(plot.margin = margin(0, 0, 0, 0)) +# ggtree::geom_tiplab( +# angle = 90, vjust = 0.5, hjust = 0, size = 4, offset = .5, +# color = "#767676" +# ) + +# xlim(0, 46) +# species_tree_subset_down +# geom_tiplab(angle = 90, vjust = 0.5, hjust = 1) + +empty <- ggplot() + + theme_void() + +introgression_heatmap <- egg::ggarrange( + species_tree_subset, summary_matrix, empty, species_tree_subset_down, + ncol = 2, nrow = 2, heights = c(2, 1), widths = c(1, 2) +) + +ggsave(introgression_heatmap, + filename = "chr16nr_50samples_introgression_heatmap.pdf", + width = 14, + height = 14, + units = "in", + dpi = "retina" +) + +# introgression_heatmap <- ggpubr::ggarrange( +# speciesTreeSubset, summaryMatrix, NULL, speciesTreeSubset_down, +# ncol = 2, nrow = 2, align = "hv", heights = c(2, 1), widths = c(1, 2), +# common.legend = TRUE +# ) + +# plot heatmap with samples sorted according to species and supergene haplotype +unique_tax <- introgression_summary %>% + dplyr::pull(tax1) %>% + unique() %>% + as.character() + +ordered_samples <- as.data.frame(unique_tax) %>% + dplyr::rename(sample = unique_tax) %>% + dplyr::mutate( + species = case_when( + grepl( + "-inv-|AR104|AR163|AR179|Km466|Cox1|AL-139|AL-150|U44|Pal1", + sample + ) ~ "invicta", + grepl( + "-ric-|U13|U93|U94|AR55|AR56|AR119|AR169|AR172", + sample + ) ~ "richteri", + # grepl("-mac-|U14|AR33", sample) ~ "macdonaghi", + grepl("-mac-|U14|AR33", sample) ~ "invicta", + grepl("-sae-|Copa2|Par1|Par2|USP3", sample) ~ "saevissima", + grepl("AR223|Lad4", sample) ~ "pusillignis" + ), + genotype = case_when( + grepl("bigB", sample) ~ "bigB", + grepl("littleb", sample) ~ "littleb" + ) + ) %>% + dplyr::arrange(species, genotype) + +introgression_summary_ordered <- introgression_summary + +# reorder samples according to species and supergene form +introgression_summary_ordered$tax1 <- factor( + introgression_summary_ordered$tax1, + levels = rev(ordered_samples$sample) + # levels = rev(ordered_samples$sample) +) + +introgression_summary_ordered$tax2 <- factor( + introgression_summary_ordered$tax2, + levels = ordered_samples$sample + # levels = rev(ordered_samples$sample) +) + +plot_heatmap(introgression_summary_ordered) + + geom_rug( + data = ordered_samples, + aes(x = sample, color = species), + length = unit(0.01, "npc"), size = 3, sides = "b", + inherit.aes = FALSE + ) + + geom_rug( + data = ordered_samples, + aes(y = sample, color = species), + length = unit(0.01, "npc"), size = 3, sides = "l", + inherit.aes = FALSE + ) + + scale_x_discrete(expand = expansion(mult = c(0.03, 0))) + + scale_y_discrete(expand = expansion(mult = c(0.03, 0))) + + guides(color = guide_legend(title = "Species")) + + ggthemr::scale_colour_ggthemr_d() + +ggsave( + filename = "chr16nr_50samples_introgression_heatmap_ordered.pdf", + width = 10, + height = 10, + units = "in", + dpi = "retina" +) +``` + +# Examine triplets of interest +```{r} +# select triplet(s) of interest from the list of significant results +quibl_output_sig %>% + dplyr::arrange(desc(totalIntrogressionFraction)) %>% + head(n = 20) + +target_triplet <- + "U14-1-littleb-p_SRR9008135-ric-76-Ros-littleb_SRR9008263-inv-180-For-bigB" + +quibl_output %>% + dplyr::filter(stringr::str_detect(triplet, target_triplet)) + +list_target_triplets <- list( + "U14-1-littleb-p_SRR9008135-ric-76-Ros-littleb_SRR9008263-inv-180-For-bigB", + "U14-1-littleb-p_SRR9008135-ric-76-Ros-littleb_SRR7028246-AL-150-bigB-m", + "U14-1-littleb-p_SRR9008189-ric-105-Bol-littleb_SRR9008263-inv-180-For-bigB", + "U14-1-littleb-p_SRR9008189-ric-105-Bol-littleb_SRR7028246-AL-150-bigB-m" +) +``` + +`getPerLocusStats()` produces a data frame including the triplet sub-tree, +the outgroup, internal branch length, and introgression probability. +```{r} +triplet_perlocus_stats <- quiblR::getPerLocusStats( + quiblOutput = quibl_output, + trip = target_triplet, + treeList = original_trees, + overallOut = "gem-1-bigB-m-majorityallele" +) + +head(triplet_perlocus_stats) + +triplet_perlocus_stats %>% + ggplot(aes(x = introProb)) + + geom_histogram(fill = "grey50", bins = 80) + +multiple_triplets_perlocus_stats <- list() + +for (i in list_target_triplets) { + multiple_triplets_perlocus_stats[[i]] <- quiblR::getPerLocusStats( + quiblOutput = quibl_output, + trip = i, + treeList = original_trees, + overallOut = "gem-1-bigB-m-majorityallele" + ) +} +# +# plot_histo <- function(data) { +# ggplot(data, aes(x = introProb)) + +# geom_histogram(fill = "grey50", bins = 80) +# } +# +# histo_triplets <- lapply(multiple_triplets_perlocus_stats, plot_histo) +# do.call(gridExtra::grid.arrange, c(histo_triplets, ncol = 2)) +``` + +Look at the internal branch lengths for introgression topologies. +```{r} +subset(triplet_perlocus_stats, out == "SRR9008263-inv-180-For-bigB") %>% str() + +plot_density_blens <- function(df, outgroup, target_triplet) { + ggplot( + data = subset( + df, out == outgroup + ), + aes(x = branchLength) + ) + + geom_density(fill = "grey50", color = "grey50", alpha = 0.6) + + labs(title = target_triplet, x = "Branch Length", y = "Density") + + theme( + plot.title = element_text(hjust = 0.5, size = 14, face = "plain"), + axis.title = element_text(size = 14), + axis.text = element_text(size = 12) + ) + + xlim(0, 0.005) +} + +names(multiple_triplets_perlocus_stats) +# [1] "U14-1-littleb-p_SRR9008135-ric-76-Ros-littleb_SRR9008263-inv-180-For-bigB" +# [2] "U14-1-littleb-p_SRR9008135-ric-76-Ros-littleb_SRR7028246-AL-150-bigB-m" +# [3] "U14-1-littleb-p_SRR9008189-ric-105-Bol-littleb_SRR9008263-inv-180-For-bigB" +# [4] "U14-1-littleb-p_SRR9008189-ric-105-Bol-littleb_SRR7028246-AL-150-bigB-m" + +pd1 <- plot_density_blens( + multiple_triplets_perlocus_stats[[1]], + "SRR9008263-inv-180-For-bigB", + names(multiple_triplets_perlocus_stats)[1] +) + +pd2 <- plot_density_blens( + multiple_triplets_perlocus_stats[[2]], + "SRR7028246-AL-150-bigB-m", + names(multiple_triplets_perlocus_stats)[2] +) + +pd3 <- plot_density_blens( + multiple_triplets_perlocus_stats[[3]], + "SRR9008263-inv-180-For-bigB", + names(multiple_triplets_perlocus_stats)[3] +) + +pd4 <- plot_density_blens( + multiple_triplets_perlocus_stats[[4]], + "SRR7028246-AL-150-bigB-m", + names(multiple_triplets_perlocus_stats)[4] +) + +density_triplets <- gridExtra::grid.arrange(pd1, pd2, pd3, pd4, ncol = 2) + +ggsave( + plot = density_triplets, + filename = "chr16nr_50samples_select_triplets_blengths_density.pdf", + width = 8, + height = 8, + units = "in", + dpi = "retina" +) +```