From 7a663892a9fe0567cad8ba14b987be2fe4f14327 Mon Sep 17 00:00:00 2001 From: roddypr Date: Mon, 6 Dec 2021 10:56:48 +0000 Subject: [PATCH] Analysis of shared polymorphisms between invicta and richteri --- .../README.md | 156 +++++ .../allele_freq1.r | 318 +++++++++ .../allele_freq2.r | 318 +++++++++ .../allele_freq3.r | 318 +++++++++ .../allele_freq4.r | 318 +++++++++ .../allele_freq5.r | 318 +++++++++ .../shared_private.rmd | 656 ++++++++++++++++++ 7 files changed, 2402 insertions(+) create mode 100644 Private and shared alleles/Polymorphisms in invicta and richteri/README.md create mode 100644 Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq1.r create mode 100644 Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq2.r create mode 100644 Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq3.r create mode 100644 Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq4.r create mode 100644 Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq5.r create mode 100644 Private and shared alleles/Polymorphisms in invicta and richteri/shared_private.rmd diff --git a/Private and shared alleles/Polymorphisms in invicta and richteri/README.md b/Private and shared alleles/Polymorphisms in invicta and richteri/README.md new file mode 100644 index 0000000..e09d6f9 --- /dev/null +++ b/Private and shared alleles/Polymorphisms in invicta and richteri/README.md @@ -0,0 +1,156 @@ +# BBBAA-BBABA test: alleles shared between SB and Sb of different _S. invicta_ and _S. richteri_ + +Rodrigo Pracana, QMUL 2021 + +## Samples + +We get the samples from the twisst analysis + +```sh + +mkdir tmp +mkdir results +mkdir input + +ln -sfr /data/archive/archive-SBCS-WurmLab/2021-solenopsis_introgression/2021-08-04-twisst/results/twisst_samples_low_missingness input/samples + +``` + +Then choose the same number of samples per species: + +```sh + + +# species in analysis +analysis_species <- c( + "invicta", + "richteri", + "saevissima" +) + +# Get twisst sample names (invicta, richteri, saevissima) +sample_table <- read.table("input/samples") +colnames(sample_table) <- c("sample", "species") + +sample_table$variant <- gsub(".*_", "", sample_table$species) +sample_table$variant <- gsub("SB", "bb", sample_table$variant) +sample_table$variant <- gsub("Sb", "lb", sample_table$variant) +sample_table$variant[!sample_table$variant %in% c("lb", "bb")] <- "outgroup" + +sample_table$species <- gsub("invicta/macdonaghi", "invicta", sample_table$species) +sample_table$species <- gsub("_S[Bb]", "", sample_table$species) + +sample_table <- sample_table[sample_table$species %in% analysis_species, ] + +sample_table$species_variant <- paste(sample_table$species, + sample_table$variant, + sep = "_") + +table(sample_table$species_variant) +# invicta_bb invicta_lb richteri_bb richteri_lb +# 134 59 28 32 +# saevissima_outgroup +# 11 + +# Remove North American samples +north_american_samples <- grep("^F[1-8]", sample_table$sample, value = TRUE) + +sample_table <- sample_table[!sample_table$sample %in% north_american_samples, ] + +# Vector of samples +invicta_bb <- sample_table$sample[sample_table$species_variant == "invicta_bb"] +invicta_lb <- sample_table$sample[sample_table$species_variant == "invicta_lb"] +richteri_bb <- sample_table$sample[sample_table$species_variant == "richteri_bb"] +richteri_lb <- sample_table$sample[sample_table$species_variant == "richteri_lb"] +saevissima <- sample_table$sample[sample_table$species_variant == "saevissima_outgroup"] + +# The smallest sample size is richteri_bb, with 28 samples +invicta_bb <- invicta_bb[sample(length(invicta_bb), 28)] +invicta_lb <- invicta_lb[sample(length(invicta_bb), 28)] +richteri_lb <- richteri_lb[sample(length(invicta_bb), 28)] + +all_samples <- c(invicta_bb, invicta_lb, richteri_bb, richteri_lb, saevissima) + +sample_table_chose <- sample_table[sample_table$sample %in% all_samples, ] +write.table(sample_table_chose, file="results/samples", quote=FALSE) + +``` + +## VCF input + +```sh + +ln -sf /data/archive/archive-SBCS-WurmLab/rpracana/projects/2020-introgression/2021-04-introgression/results/gt.vcf.gz input/all.vcf.gz +ln -sf /data/archive/archive-SBCS-WurmLab/rpracana/projects/2020-introgression/2021-04-introgression/results/gt.vcf.gz.tbi input/all.vcf.gz.tbi + +``` + +Include bi-allelic only: + +```sh + +module load bcftools/1.13 +bcftools view --max-alleles 2 --exclude-types indels -O z \ + input/all.vcf.gz > tmp/biallelic_snps.vcf.gz +bcftools index -t tmp/biallelic_snps.vcf.gz + +bcftools index --nrecords input/all.vcf.gz +# 2650251 +bcftools index --nrecords tmp/biallelic_snps.vcf.gz +# 2600879 + +``` + +## Supergene location + +```sh + +ln -sf /data/archive/archive-SBCS-WurmLab/rpracana/projects/south_social_chrom_div/south_nf/analysis/2018-05-11-linkage-map/results/linkage_map_supergene.txt input + +``` + +## GFF + +```sh + +ln -sfr /data/archive/archive-SBCS-WurmLab/db/genomic/S_invicta/Si_gnGA/annotation/SolInvGFF.corrected.mRNACDSintron.gff input/gnga.gff + +ln -sfr /data/archive/archive-SBCS-WurmLab/db/genomic/S_invicta/Si_gnGA/assembly/gnG_20161213.fa input/ref.fa + +``` + +## R + +The following script loads the VCF for _S. invicta_, _S. richteri_, and the outgroup _S. saevissima_. For each site, the script identifies the ancestral and the derived allele and measures the derived allele frequency of each population (the SB and the Sb samples, respectively, of each species). The script is divided into five parts to parallelise the process. + +```sh + +module load R/4.0.2 + +Rscript allele_freq1.r 1> tmp/allele_freq1.r.out 2> tmp/allele_freq1.r.err & +Rscript allele_freq2.r 1> tmp/allele_freq2.r.out 2> tmp/allele_freq2.r.err & +Rscript allele_freq3.r 1> tmp/allele_freq3.r.out 2> tmp/allele_freq3.r.err & +Rscript allele_freq4.r 1> tmp/allele_freq4.r.out 2> tmp/allele_freq4.r.err & +Rscript allele_freq5.r 1> tmp/allele_freq5.r.out 2> tmp/allele_freq5.r.err & + +``` + +The results are added to the directory `results/by_scaff`. + +```sh + +head -n1 results/by_scaff/private_and_shared_per_site_10.csv > results/snp_frequencies.csv +tail -n +2 --quiet results/by_scaff/private_and_shared_per_site_* >> results/snp_frequencies.csv + +wc -l results/snp_frequencies.csv +# 807657 results/snp_frequencies.csv + +grep -c supergene results/snp_frequencies.csv +# 39210 + +``` + + +## Analyse + +We use the rmarkdown script `shared_private.Rmd` to diff --git a/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq1.r b/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq1.r new file mode 100644 index 0000000..5d8f2ae --- /dev/null +++ b/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq1.r @@ -0,0 +1,318 @@ +library(GenomicRanges) +library(VariantAnnotation) +library(GenomicFeatures) + +# VCF file +vcf_path <- "tmp/biallelic_snps.vcf.gz" + +sample_table <- read.table("results/samples", header = TRUE) + +invicta_bb <- sample_table$sample[sample_table$species_variant == "invicta_bb"] +invicta_lb <- sample_table$sample[sample_table$species_variant == "invicta_lb"] +richteri_bb <- sample_table$sample[sample_table$species_variant == "richteri_bb"] +richteri_lb <- sample_table$sample[sample_table$species_variant == "richteri_lb"] +saevissima <- sample_table$sample[sample_table$species_variant == "saevissima_outgroup"] +all_samples <- c(invicta_bb, invicta_lb, richteri_bb, richteri_lb, saevissima) + +dir.create("results/by_scaff", recursive=TRUE) + +# ---------------------------------------------------------------------------- # +# Get supergene/non supergene region +# ---------------------------------------------------------------------------- # + +regions <- read.table("input/linkage_map_supergene.txt", header = TRUE) +regions_gr <- regions[, c("scaffold", "scaffold_start", "scaffold_end", "orientation", "chr", "region")] +colnames(regions_gr) <- c( + "chr", "start", "end", "orientation", "chr_name", "region" +) +regions_gr <- GRanges(regions_gr) + +# ---------------------------------------------------------------------------- # +# Read genome fasta and gff file, to predict coding changes +# ---------------------------------------------------------------------------- # +txdb <- makeTxDbFromGFF("input/gnga.gff", format = "gff3") +gnga <- readDNAStringSet("input/ref.fa") +names(gnga) <- gsub(" .*", "", names(gnga)) + +# ---------------------------------------------------------------------------- # +# Read VCF bit by bit +# ---------------------------------------------------------------------------- # +scaffs_in_vcf <- headerTabix("tmp/biallelic_snps.vcf.gz")$seqnames +for (i in 1:100) { + + if (as.character(seqnames(regions_gr[i])) %in% scaffs_in_vcf) { + # ---------------------------------------------------------------------------- # + # Read VCF (VariantAnnotation) + # ---------------------------------------------------------------------------- # + + param <- ScanVcfParam(which = regions_gr[i], + geno = "GT", + samples = all_samples) + + vcf <- readVcf(vcf_path, row.names=TRUE, param=param) + + # ---------------------------------------------------------------------------- # + # Remove weird sites + # ---------------------------------------------------------------------------- # + acceptable_gt <- c("0|0", "1|1", ".|.") + bi_allelic <- apply(geno(vcf)$GT, 1, function(gt_row) all(gt_row %in% acceptable_gt)) + vcf <- vcf[bi_allelic,] + + # keep only sites with a single alternative - too slow + # now done with bcftools in advance + # vcf <- vcf[sapply(rowRanges(vcf)$ALT, length) == 1, ] + + # ---------------------------------------------------------------------------- # + # Recode GT + # ---------------------------------------------------------------------------- # + + geno(vcf)$GT[geno(vcf)$GT == "0|0"] <- "0" + geno(vcf)$GT[geno(vcf)$GT == "1|1"] <- "1" + geno(vcf)$GT[geno(vcf)$GT == ".|."] <- NA + + # ---------------------------------------------------------------------------- # + # No missing + # ---------------------------------------------------------------------------- # + # Analysis will only work if most individuals in all populations carry the SNP + + missing_per_row <- function(gt_row, max_missing) { + # max_missing should be a proportion + stopifnot(max_missing < 1, max_missing >= 0) + max_missing <- max_missing * length(gt_row) + + # retuns TRUE is the number of missing genopypes is smaller than max_missing + return(sum(is.na(gt_row)) < max_missing) + } + + missing_per_pop <- function(gt, pop, max_missing) { + gt <- gt[,pop] + # returns vector of TRUE or FALSE + not_missing <- apply(gt, 1, function(row) missing_per_row(row, max_missing)) + } + + max_missing <- 0.20 # that's 5.6 in 28 + + # filter by missing + not_missing_pop <- missing_per_pop(geno(vcf)$GT, invicta_bb, max_missing) & + missing_per_pop(geno(vcf)$GT, invicta_lb, max_missing) & + missing_per_pop(geno(vcf)$GT, richteri_bb, max_missing) & + missing_per_pop(geno(vcf)$GT, richteri_lb, max_missing) & + missing_per_pop(geno(vcf)$GT, saevissima, max_missing) + + # sum(not_missing_pop) + # 11979 in 12222 + vcf <- vcf[not_missing_pop, ] + + # ---------------------------------------------------------------------------- # + # Needs to be polymorphic + # ---------------------------------------------------------------------------- # + + alternative_frequency <- function(gt_row) { + + # Deal with missing data + if (all(is.na(gt_row))) { + print(gt_row) + stop("Error: all samples are NA") + } + gt_row <- gt_row[!is.na(gt_row)] + + # Calculate frquency of alternative (0) allele + aaf <- sum(gt_row == 1) / length(gt_row) + + } + + af <- apply(geno(vcf)$GT, 1, function(row) alternative_frequency(row)) + + vcf <- vcf[(af != 0) & (af != 1)] + + # ---------------------------------------------------------------------------- # + # Remove sites that are polymorphic in the outgroup species + # ---------------------------------------------------------------------------- # + + af_by_pop <- function(gt, pop) { + apply(gt[,pop], 1, function(row) alternative_frequency(row)) + } + + af_outgroup <- af_by_pop(gt = geno(vcf)$GT, pop = saevissima) + + # sum((af_outgroup == 0) & (af_outgroup == 1)) + # [1] 3836 + + vcf <- vcf[(af_outgroup == 0) | (af_outgroup == 1), ] + af_outgroup <- af_outgroup[(af_outgroup == 0) | (af_outgroup == 1)] + + # ---------------------------------------------------------------------------- # + # Predict coding + # ---------------------------------------------------------------------------- # + if (nrow(vcf) > 0) { + code_prediction <- predictCoding(vcf, txdb, seqSource = gnga) + + # Remove SNPs that have a coding effect in multiple transcripts + duplicated_SNPs <- names(code_prediction)[duplicated(names(code_prediction))] + + vcf <- vcf[!rownames(vcf) %in% duplicated_SNPs] + + code_prediction <- code_prediction[!names(code_prediction) %in% duplicated_SNPs] + } + + if (nrow(vcf) > 0) { + + # ---------------------------------------------------------------------------- # + # Recode SNPs as numeric + # ---------------------------------------------------------------------------- # + + gt_data <- rowRanges(vcf) + gt <- t(apply(geno(vcf)$GT, 1, as.numeric)) + colnames(gt) <- colnames(geno(vcf)$GT) + + # ---------------------------------------------------------------------------- # + # Recode SNPs + # ---------------------------------------------------------------------------- # + # 0 is the allele carried by the outgroup + # 1 is the other allele + + recode_by_outgroup <- function(gt_row, outgroup) { + + no_missing <- gt_row[!is.na(gt_row)] + + if (!any(no_missing %in% c(0,1))) { + stop("The only allowed genotypes are 1 and 0") + } + + if (all(no_missing == no_missing[1])) { + stop("Site needs to be polymorphic") + } + + outgroup_gt <- gt_row[outgroup] + if (all(is.na(outgroup_gt))) { + print(outgroup_gt) + stop("Error: all outgroup samples are NA") + } + + outgroup_gt <- outgroup_gt[!is.na(outgroup_gt)] + + if (any(outgroup_gt != outgroup_gt[1])) { + print(outgroup_gt) + stop("Error: outgroup cannot be polymorphic") + } + + # The following is necessary in case there is missing data + + # If the first element of outgroup is the alternative allele (1), + # Then 1 -> 0 and 0 -> 1 + if (outgroup_gt[1] == 1) { + + gt_row <- c(1,0)[match(gt_row,c(0,1))] + + } + + return(gt_row) + + } + + recoded_gt <- t(apply(gt, 1, function(row) recode_by_outgroup(row, outgroup = saevissima))) + colnames(recoded_gt) <- colnames(gt) + + gt <- recoded_gt + + # ---------------------------------------------------------------------------- # + # Allele frequency of each population + # ---------------------------------------------------------------------------- # + + allele_frequencies_ma <- cbind( + invicta_bb = af_by_pop(gt = gt, pop = invicta_bb), + invicta_lb = af_by_pop(gt = gt, pop = invicta_lb), + richteri_bb = af_by_pop(gt = gt, pop = richteri_bb), + richteri_lb = af_by_pop(gt = gt, pop = richteri_lb) + ) + + # ---------------------------------------------------------------------------- # + # Private to SB + # ---------------------------------------------------------------------------- # + + private_alleles <- function(freq_matrix, pop) { + pop_freq <- freq_matrix[, pop, drop = FALSE] + other_freq <- freq_matrix[, !colnames(freq_matrix) %in% pop, drop = FALSE] + + # Present in population 1 and no other population + # The sum of frequencies will be greater than 0 if the allele is present + # in at least one of the other populations + (apply(pop_freq, 1, sum) > 0) & (apply(other_freq, 1, sum) == 0) + + } + + private_to_invicta_bb <- private_alleles(allele_frequencies_ma, pop="invicta_bb") + private_to_richteri_bb <- private_alleles(allele_frequencies_ma, pop="richteri_bb") + + # ---------------------------------------------------------------------------- # + # Shared between clades + # ---------------------------------------------------------------------------- # + shared_between_clades <- function(freq_matrix, pop1, pop2) { + pop1_freq <- freq_matrix[, pop1, drop = FALSE] + pop2_freqs <- freq_matrix[, pop2, drop = FALSE] + + # Present in population 1 and at least one of the other populations + # The sum of frequencies will be greater than 0 if the allele is present + # in at least one of the other populations + (apply(pop1_freq, 1, sum) > 0) & (apply(pop2_freqs, 1, sum) > 0) + } + + shared_lb_invicta_bb <- shared_between_clades( + allele_frequencies_ma, + pop1 = "invicta_bb", + pop2 = c("invicta_lb", "richteri_lb") + ) & + private_alleles(allele_frequencies_ma, c("invicta_bb", "invicta_lb", "richteri_lb")) + + shared_lb_richteri_bb <- shared_between_clades( + allele_frequencies_ma, + pop1 = "richteri_bb", + pop2 = c("invicta_lb", "richteri_lb") + ) & + private_alleles(allele_frequencies_ma, c("richteri_bb", "invicta_lb", "richteri_lb")) + + # ---------------------------------------------------------------------------- # + # Prepare end data frame + # ---------------------------------------------------------------------------- # + df_to_write <- as.data.frame(rowRanges(vcf)) + + df_to_write$paramRangeID <- NULL + df_to_write$ALT <- sapply(unlist(df_to_write$ALT), as.character) + + code_prediction <- as.data.frame(code_prediction) + code_prediction <- code_prediction[, -c(1:9)] + + tables_match <- match(rownames(df_to_write), rownames(code_prediction)) + df_to_write <- cbind(df_to_write, code_prediction[tables_match,]) + + # Add frequencies + # Recalculate for outgroup because VCF may have been filtered + # But use the genotypes in VCF and not the recoded GT + df_to_write$af_saevissima_outgroup <- af_by_pop(gt = geno(vcf)$GT, + pop = saevissima) + df_to_write$derived_allele <- ifelse(df_to_write$af_saevissima_outgroup == 0, + "REF", "ALT") + + colnames(allele_frequencies_ma) <- paste0("daf_", colnames(allele_frequencies_ma)) + df_to_write <- cbind(df_to_write, allele_frequencies_ma) + + # Add measurement of each private/shared + df_to_write$private_to_invicta_bb <- private_to_invicta_bb + df_to_write$private_to_richteri_bb <- private_to_richteri_bb + + df_to_write$shared_lb_invicta_bb <- shared_lb_invicta_bb + df_to_write$shared_lb_richteri_bb <- shared_lb_richteri_bb + + # Add region + df_to_write$chr <- regions_gr$chr_name[i] + df_to_write$region <- regions_gr$region[i] + + # ---------------------------------------------------------------------------- # + out_path <- paste("results/by_scaff/private_and_shared_per_site_", i, ".csv", sep = "") + + write.csv(df_to_write, file = out_path, + row.names=FALSE, quote=FALSE) + # ---------------------------------------------------------------------------- # + } + } +} diff --git a/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq2.r b/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq2.r new file mode 100644 index 0000000..d6ed56d --- /dev/null +++ b/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq2.r @@ -0,0 +1,318 @@ +library(GenomicRanges) +library(VariantAnnotation) +library(GenomicFeatures) + +# VCF file +vcf_path <- "tmp/biallelic_snps.vcf.gz" + +sample_table <- read.table("results/samples", header = TRUE) + +invicta_bb <- sample_table$sample[sample_table$species_variant == "invicta_bb"] +invicta_lb <- sample_table$sample[sample_table$species_variant == "invicta_lb"] +richteri_bb <- sample_table$sample[sample_table$species_variant == "richteri_bb"] +richteri_lb <- sample_table$sample[sample_table$species_variant == "richteri_lb"] +saevissima <- sample_table$sample[sample_table$species_variant == "saevissima_outgroup"] +all_samples <- c(invicta_bb, invicta_lb, richteri_bb, richteri_lb, saevissima) + +dir.create("results/by_scaff", recursive=TRUE) + +# ---------------------------------------------------------------------------- # +# Get supergene/non supergene region +# ---------------------------------------------------------------------------- # + +regions <- read.table("input/linkage_map_supergene.txt", header = TRUE) +regions_gr <- regions[, c("scaffold", "scaffold_start", "scaffold_end", "orientation", "chr", "region")] +colnames(regions_gr) <- c( + "chr", "start", "end", "orientation", "chr_name", "region" +) +regions_gr <- GRanges(regions_gr) + +# ---------------------------------------------------------------------------- # +# Read genome fasta and gff file, to predict coding changes +# ---------------------------------------------------------------------------- # +txdb <- makeTxDbFromGFF("input/gnga.gff", format = "gff3") +gnga <- readDNAStringSet("input/ref.fa") +names(gnga) <- gsub(" .*", "", names(gnga)) + +# ---------------------------------------------------------------------------- # +# Read VCF bit by bit +# ---------------------------------------------------------------------------- # +scaffs_in_vcf <- headerTabix("tmp/biallelic_snps.vcf.gz")$seqnames +for (i in 101:200) { + + if (as.character(seqnames(regions_gr[i])) %in% scaffs_in_vcf) { + # ---------------------------------------------------------------------------- # + # Read VCF (VariantAnnotation) + # ---------------------------------------------------------------------------- # + + param <- ScanVcfParam(which = regions_gr[i], + geno = "GT", + samples = all_samples) + + vcf <- readVcf(vcf_path, row.names=TRUE, param=param) + + # ---------------------------------------------------------------------------- # + # Remove weird sites + # ---------------------------------------------------------------------------- # + acceptable_gt <- c("0|0", "1|1", ".|.") + bi_allelic <- apply(geno(vcf)$GT, 1, function(gt_row) all(gt_row %in% acceptable_gt)) + vcf <- vcf[bi_allelic,] + + # keep only sites with a single alternative - too slow + # now done with bcftools in advance + # vcf <- vcf[sapply(rowRanges(vcf)$ALT, length) == 1, ] + + # ---------------------------------------------------------------------------- # + # Recode GT + # ---------------------------------------------------------------------------- # + + geno(vcf)$GT[geno(vcf)$GT == "0|0"] <- "0" + geno(vcf)$GT[geno(vcf)$GT == "1|1"] <- "1" + geno(vcf)$GT[geno(vcf)$GT == ".|."] <- NA + + # ---------------------------------------------------------------------------- # + # No missing + # ---------------------------------------------------------------------------- # + # Analysis will only work if most individuals in all populations carry the SNP + + missing_per_row <- function(gt_row, max_missing) { + # max_missing should be a proportion + stopifnot(max_missing < 1, max_missing >= 0) + max_missing <- max_missing * length(gt_row) + + # retuns TRUE is the number of missing genopypes is smaller than max_missing + return(sum(is.na(gt_row)) < max_missing) + } + + missing_per_pop <- function(gt, pop, max_missing) { + gt <- gt[,pop] + # returns vector of TRUE or FALSE + not_missing <- apply(gt, 1, function(row) missing_per_row(row, max_missing)) + } + + max_missing <- 0.20 # that's 5.6 in 28 + + # filter by missing + not_missing_pop <- missing_per_pop(geno(vcf)$GT, invicta_bb, max_missing) & + missing_per_pop(geno(vcf)$GT, invicta_lb, max_missing) & + missing_per_pop(geno(vcf)$GT, richteri_bb, max_missing) & + missing_per_pop(geno(vcf)$GT, richteri_lb, max_missing) & + missing_per_pop(geno(vcf)$GT, saevissima, max_missing) + + # sum(not_missing_pop) + # 11979 in 12222 + vcf <- vcf[not_missing_pop, ] + + # ---------------------------------------------------------------------------- # + # Needs to be polymorphic + # ---------------------------------------------------------------------------- # + + alternative_frequency <- function(gt_row) { + + # Deal with missing data + if (all(is.na(gt_row))) { + print(gt_row) + stop("Error: all samples are NA") + } + gt_row <- gt_row[!is.na(gt_row)] + + # Calculate frquency of alternative (0) allele + aaf <- sum(gt_row == 1) / length(gt_row) + + } + + af <- apply(geno(vcf)$GT, 1, function(row) alternative_frequency(row)) + + vcf <- vcf[(af != 0) & (af != 1)] + + # ---------------------------------------------------------------------------- # + # Remove sites that are polymorphic in the outgroup species + # ---------------------------------------------------------------------------- # + + af_by_pop <- function(gt, pop) { + apply(gt[,pop], 1, function(row) alternative_frequency(row)) + } + + af_outgroup <- af_by_pop(gt = geno(vcf)$GT, pop = saevissima) + + # sum((af_outgroup == 0) & (af_outgroup == 1)) + # [1] 3836 + + vcf <- vcf[(af_outgroup == 0) | (af_outgroup == 1), ] + af_outgroup <- af_outgroup[(af_outgroup == 0) | (af_outgroup == 1)] + + # ---------------------------------------------------------------------------- # + # Predict coding + # ---------------------------------------------------------------------------- # + if (nrow(vcf) > 0) { + code_prediction <- predictCoding(vcf, txdb, seqSource = gnga) + + # Remove SNPs that have a coding effect in multiple transcripts + duplicated_SNPs <- names(code_prediction)[duplicated(names(code_prediction))] + + vcf <- vcf[!rownames(vcf) %in% duplicated_SNPs] + + code_prediction <- code_prediction[!names(code_prediction) %in% duplicated_SNPs] + } + + if (nrow(vcf) > 0) { + + # ---------------------------------------------------------------------------- # + # Recode SNPs as numeric + # ---------------------------------------------------------------------------- # + + gt_data <- rowRanges(vcf) + gt <- t(apply(geno(vcf)$GT, 1, as.numeric)) + colnames(gt) <- colnames(geno(vcf)$GT) + + # ---------------------------------------------------------------------------- # + # Recode SNPs + # ---------------------------------------------------------------------------- # + # 0 is the allele carried by the outgroup + # 1 is the other allele + + recode_by_outgroup <- function(gt_row, outgroup) { + + no_missing <- gt_row[!is.na(gt_row)] + + if (!any(no_missing %in% c(0,1))) { + stop("The only allowed genotypes are 1 and 0") + } + + if (all(no_missing == no_missing[1])) { + stop("Site needs to be polymorphic") + } + + outgroup_gt <- gt_row[outgroup] + if (all(is.na(outgroup_gt))) { + print(outgroup_gt) + stop("Error: all outgroup samples are NA") + } + + outgroup_gt <- outgroup_gt[!is.na(outgroup_gt)] + + if (any(outgroup_gt != outgroup_gt[1])) { + print(outgroup_gt) + stop("Error: outgroup cannot be polymorphic") + } + + # The following is necessary in case there is missing data + + # If the first element of outgroup is the alternative allele (1), + # Then 1 -> 0 and 0 -> 1 + if (outgroup_gt[1] == 1) { + + gt_row <- c(1,0)[match(gt_row,c(0,1))] + + } + + return(gt_row) + + } + + recoded_gt <- t(apply(gt, 1, function(row) recode_by_outgroup(row, outgroup = saevissima))) + colnames(recoded_gt) <- colnames(gt) + + gt <- recoded_gt + + # ---------------------------------------------------------------------------- # + # Allele frequency of each population + # ---------------------------------------------------------------------------- # + + allele_frequencies_ma <- cbind( + invicta_bb = af_by_pop(gt = gt, pop = invicta_bb), + invicta_lb = af_by_pop(gt = gt, pop = invicta_lb), + richteri_bb = af_by_pop(gt = gt, pop = richteri_bb), + richteri_lb = af_by_pop(gt = gt, pop = richteri_lb) + ) + + # ---------------------------------------------------------------------------- # + # Private to SB + # ---------------------------------------------------------------------------- # + + private_alleles <- function(freq_matrix, pop) { + pop_freq <- freq_matrix[, pop, drop = FALSE] + other_freq <- freq_matrix[, !colnames(freq_matrix) %in% pop, drop = FALSE] + + # Present in population 1 and no other population + # The sum of frequencies will be greater than 0 if the allele is present + # in at least one of the other populations + (apply(pop_freq, 1, sum) > 0) & (apply(other_freq, 1, sum) == 0) + + } + + private_to_invicta_bb <- private_alleles(allele_frequencies_ma, pop="invicta_bb") + private_to_richteri_bb <- private_alleles(allele_frequencies_ma, pop="richteri_bb") + + # ---------------------------------------------------------------------------- # + # Shared between clades + # ---------------------------------------------------------------------------- # + shared_between_clades <- function(freq_matrix, pop1, pop2) { + pop1_freq <- freq_matrix[, pop1, drop = FALSE] + pop2_freqs <- freq_matrix[, pop2, drop = FALSE] + + # Present in population 1 and at least one of the other populations + # The sum of frequencies will be greater than 0 if the allele is present + # in at least one of the other populations + (apply(pop1_freq, 1, sum) > 0) & (apply(pop2_freqs, 1, sum) > 0) + } + + shared_lb_invicta_bb <- shared_between_clades( + allele_frequencies_ma, + pop1 = "invicta_bb", + pop2 = c("invicta_lb", "richteri_lb") + ) & + private_alleles(allele_frequencies_ma, c("invicta_bb", "invicta_lb", "richteri_lb")) + + shared_lb_richteri_bb <- shared_between_clades( + allele_frequencies_ma, + pop1 = "richteri_bb", + pop2 = c("invicta_lb", "richteri_lb") + ) & + private_alleles(allele_frequencies_ma, c("richteri_bb", "invicta_lb", "richteri_lb")) + + # ---------------------------------------------------------------------------- # + # Prepare end data frame + # ---------------------------------------------------------------------------- # + df_to_write <- as.data.frame(rowRanges(vcf)) + + df_to_write$paramRangeID <- NULL + df_to_write$ALT <- sapply(unlist(df_to_write$ALT), as.character) + + code_prediction <- as.data.frame(code_prediction) + code_prediction <- code_prediction[, -c(1:9)] + + tables_match <- match(rownames(df_to_write), rownames(code_prediction)) + df_to_write <- cbind(df_to_write, code_prediction[tables_match,]) + + # Add frequencies + # Recalculate for outgroup because VCF may have been filtered + # But use the genotypes in VCF and not the recoded GT + df_to_write$af_saevissima_outgroup <- af_by_pop(gt = geno(vcf)$GT, + pop = saevissima) + df_to_write$derived_allele <- ifelse(df_to_write$af_saevissima_outgroup == 0, + "REF", "ALT") + + colnames(allele_frequencies_ma) <- paste0("daf_", colnames(allele_frequencies_ma)) + df_to_write <- cbind(df_to_write, allele_frequencies_ma) + + # Add measurement of each private/shared + df_to_write$private_to_invicta_bb <- private_to_invicta_bb + df_to_write$private_to_richteri_bb <- private_to_richteri_bb + + df_to_write$shared_lb_invicta_bb <- shared_lb_invicta_bb + df_to_write$shared_lb_richteri_bb <- shared_lb_richteri_bb + + # Add region + df_to_write$chr <- regions_gr$chr_name[i] + df_to_write$region <- regions_gr$region[i] + + # ---------------------------------------------------------------------------- # + out_path <- paste("results/by_scaff/private_and_shared_per_site_", i, ".csv", sep = "") + + write.csv(df_to_write, file = out_path, + row.names=FALSE, quote=FALSE) + # ---------------------------------------------------------------------------- # + } + } +} diff --git a/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq3.r b/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq3.r new file mode 100644 index 0000000..55afeaf --- /dev/null +++ b/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq3.r @@ -0,0 +1,318 @@ +library(GenomicRanges) +library(VariantAnnotation) +library(GenomicFeatures) + +# VCF file +vcf_path <- "tmp/biallelic_snps.vcf.gz" + +sample_table <- read.table("results/samples", header = TRUE) + +invicta_bb <- sample_table$sample[sample_table$species_variant == "invicta_bb"] +invicta_lb <- sample_table$sample[sample_table$species_variant == "invicta_lb"] +richteri_bb <- sample_table$sample[sample_table$species_variant == "richteri_bb"] +richteri_lb <- sample_table$sample[sample_table$species_variant == "richteri_lb"] +saevissima <- sample_table$sample[sample_table$species_variant == "saevissima_outgroup"] +all_samples <- c(invicta_bb, invicta_lb, richteri_bb, richteri_lb, saevissima) + +dir.create("results/by_scaff", recursive=TRUE) + +# ---------------------------------------------------------------------------- # +# Get supergene/non supergene region +# ---------------------------------------------------------------------------- # + +regions <- read.table("input/linkage_map_supergene.txt", header = TRUE) +regions_gr <- regions[, c("scaffold", "scaffold_start", "scaffold_end", "orientation", "chr", "region")] +colnames(regions_gr) <- c( + "chr", "start", "end", "orientation", "chr_name", "region" +) +regions_gr <- GRanges(regions_gr) + +# ---------------------------------------------------------------------------- # +# Read genome fasta and gff file, to predict coding changes +# ---------------------------------------------------------------------------- # +txdb <- makeTxDbFromGFF("input/gnga.gff", format = "gff3") +gnga <- readDNAStringSet("input/ref.fa") +names(gnga) <- gsub(" .*", "", names(gnga)) + +# ---------------------------------------------------------------------------- # +# Read VCF bit by bit +# ---------------------------------------------------------------------------- # +scaffs_in_vcf <- headerTabix("tmp/biallelic_snps.vcf.gz")$seqnames +for (i in 201:300) { + + if (as.character(seqnames(regions_gr[i])) %in% scaffs_in_vcf) { + # ---------------------------------------------------------------------------- # + # Read VCF (VariantAnnotation) + # ---------------------------------------------------------------------------- # + + param <- ScanVcfParam(which = regions_gr[i], + geno = "GT", + samples = all_samples) + + vcf <- readVcf(vcf_path, row.names=TRUE, param=param) + + # ---------------------------------------------------------------------------- # + # Remove weird sites + # ---------------------------------------------------------------------------- # + acceptable_gt <- c("0|0", "1|1", ".|.") + bi_allelic <- apply(geno(vcf)$GT, 1, function(gt_row) all(gt_row %in% acceptable_gt)) + vcf <- vcf[bi_allelic,] + + # keep only sites with a single alternative - too slow + # now done with bcftools in advance + # vcf <- vcf[sapply(rowRanges(vcf)$ALT, length) == 1, ] + + # ---------------------------------------------------------------------------- # + # Recode GT + # ---------------------------------------------------------------------------- # + + geno(vcf)$GT[geno(vcf)$GT == "0|0"] <- "0" + geno(vcf)$GT[geno(vcf)$GT == "1|1"] <- "1" + geno(vcf)$GT[geno(vcf)$GT == ".|."] <- NA + + # ---------------------------------------------------------------------------- # + # No missing + # ---------------------------------------------------------------------------- # + # Analysis will only work if most individuals in all populations carry the SNP + + missing_per_row <- function(gt_row, max_missing) { + # max_missing should be a proportion + stopifnot(max_missing < 1, max_missing >= 0) + max_missing <- max_missing * length(gt_row) + + # retuns TRUE is the number of missing genopypes is smaller than max_missing + return(sum(is.na(gt_row)) < max_missing) + } + + missing_per_pop <- function(gt, pop, max_missing) { + gt <- gt[,pop] + # returns vector of TRUE or FALSE + not_missing <- apply(gt, 1, function(row) missing_per_row(row, max_missing)) + } + + max_missing <- 0.20 # that's 5.6 in 28 + + # filter by missing + not_missing_pop <- missing_per_pop(geno(vcf)$GT, invicta_bb, max_missing) & + missing_per_pop(geno(vcf)$GT, invicta_lb, max_missing) & + missing_per_pop(geno(vcf)$GT, richteri_bb, max_missing) & + missing_per_pop(geno(vcf)$GT, richteri_lb, max_missing) & + missing_per_pop(geno(vcf)$GT, saevissima, max_missing) + + # sum(not_missing_pop) + # 11979 in 12222 + vcf <- vcf[not_missing_pop, ] + + # ---------------------------------------------------------------------------- # + # Needs to be polymorphic + # ---------------------------------------------------------------------------- # + + alternative_frequency <- function(gt_row) { + + # Deal with missing data + if (all(is.na(gt_row))) { + print(gt_row) + stop("Error: all samples are NA") + } + gt_row <- gt_row[!is.na(gt_row)] + + # Calculate frquency of alternative (0) allele + aaf <- sum(gt_row == 1) / length(gt_row) + + } + + af <- apply(geno(vcf)$GT, 1, function(row) alternative_frequency(row)) + + vcf <- vcf[(af != 0) & (af != 1)] + + # ---------------------------------------------------------------------------- # + # Remove sites that are polymorphic in the outgroup species + # ---------------------------------------------------------------------------- # + + af_by_pop <- function(gt, pop) { + apply(gt[,pop], 1, function(row) alternative_frequency(row)) + } + + af_outgroup <- af_by_pop(gt = geno(vcf)$GT, pop = saevissima) + + # sum((af_outgroup == 0) & (af_outgroup == 1)) + # [1] 3836 + + vcf <- vcf[(af_outgroup == 0) | (af_outgroup == 1), ] + af_outgroup <- af_outgroup[(af_outgroup == 0) | (af_outgroup == 1)] + + # ---------------------------------------------------------------------------- # + # Predict coding + # ---------------------------------------------------------------------------- # + if (nrow(vcf) > 0) { + code_prediction <- predictCoding(vcf, txdb, seqSource = gnga) + + # Remove SNPs that have a coding effect in multiple transcripts + duplicated_SNPs <- names(code_prediction)[duplicated(names(code_prediction))] + + vcf <- vcf[!rownames(vcf) %in% duplicated_SNPs] + + code_prediction <- code_prediction[!names(code_prediction) %in% duplicated_SNPs] + } + + if (nrow(vcf) > 0) { + + # ---------------------------------------------------------------------------- # + # Recode SNPs as numeric + # ---------------------------------------------------------------------------- # + + gt_data <- rowRanges(vcf) + gt <- t(apply(geno(vcf)$GT, 1, as.numeric)) + colnames(gt) <- colnames(geno(vcf)$GT) + + # ---------------------------------------------------------------------------- # + # Recode SNPs + # ---------------------------------------------------------------------------- # + # 0 is the allele carried by the outgroup + # 1 is the other allele + + recode_by_outgroup <- function(gt_row, outgroup) { + + no_missing <- gt_row[!is.na(gt_row)] + + if (!any(no_missing %in% c(0,1))) { + stop("The only allowed genotypes are 1 and 0") + } + + if (all(no_missing == no_missing[1])) { + stop("Site needs to be polymorphic") + } + + outgroup_gt <- gt_row[outgroup] + if (all(is.na(outgroup_gt))) { + print(outgroup_gt) + stop("Error: all outgroup samples are NA") + } + + outgroup_gt <- outgroup_gt[!is.na(outgroup_gt)] + + if (any(outgroup_gt != outgroup_gt[1])) { + print(outgroup_gt) + stop("Error: outgroup cannot be polymorphic") + } + + # The following is necessary in case there is missing data + + # If the first element of outgroup is the alternative allele (1), + # Then 1 -> 0 and 0 -> 1 + if (outgroup_gt[1] == 1) { + + gt_row <- c(1,0)[match(gt_row,c(0,1))] + + } + + return(gt_row) + + } + + recoded_gt <- t(apply(gt, 1, function(row) recode_by_outgroup(row, outgroup = saevissima))) + colnames(recoded_gt) <- colnames(gt) + + gt <- recoded_gt + + # ---------------------------------------------------------------------------- # + # Allele frequency of each population + # ---------------------------------------------------------------------------- # + + allele_frequencies_ma <- cbind( + invicta_bb = af_by_pop(gt = gt, pop = invicta_bb), + invicta_lb = af_by_pop(gt = gt, pop = invicta_lb), + richteri_bb = af_by_pop(gt = gt, pop = richteri_bb), + richteri_lb = af_by_pop(gt = gt, pop = richteri_lb) + ) + + # ---------------------------------------------------------------------------- # + # Private to SB + # ---------------------------------------------------------------------------- # + + private_alleles <- function(freq_matrix, pop) { + pop_freq <- freq_matrix[, pop, drop = FALSE] + other_freq <- freq_matrix[, !colnames(freq_matrix) %in% pop, drop = FALSE] + + # Present in population 1 and no other population + # The sum of frequencies will be greater than 0 if the allele is present + # in at least one of the other populations + (apply(pop_freq, 1, sum) > 0) & (apply(other_freq, 1, sum) == 0) + + } + + private_to_invicta_bb <- private_alleles(allele_frequencies_ma, pop="invicta_bb") + private_to_richteri_bb <- private_alleles(allele_frequencies_ma, pop="richteri_bb") + + # ---------------------------------------------------------------------------- # + # Shared between clades + # ---------------------------------------------------------------------------- # + shared_between_clades <- function(freq_matrix, pop1, pop2) { + pop1_freq <- freq_matrix[, pop1, drop = FALSE] + pop2_freqs <- freq_matrix[, pop2, drop = FALSE] + + # Present in population 1 and at least one of the other populations + # The sum of frequencies will be greater than 0 if the allele is present + # in at least one of the other populations + (apply(pop1_freq, 1, sum) > 0) & (apply(pop2_freqs, 1, sum) > 0) + } + + shared_lb_invicta_bb <- shared_between_clades( + allele_frequencies_ma, + pop1 = "invicta_bb", + pop2 = c("invicta_lb", "richteri_lb") + ) & + private_alleles(allele_frequencies_ma, c("invicta_bb", "invicta_lb", "richteri_lb")) + + shared_lb_richteri_bb <- shared_between_clades( + allele_frequencies_ma, + pop1 = "richteri_bb", + pop2 = c("invicta_lb", "richteri_lb") + ) & + private_alleles(allele_frequencies_ma, c("richteri_bb", "invicta_lb", "richteri_lb")) + + # ---------------------------------------------------------------------------- # + # Prepare end data frame + # ---------------------------------------------------------------------------- # + df_to_write <- as.data.frame(rowRanges(vcf)) + + df_to_write$paramRangeID <- NULL + df_to_write$ALT <- sapply(unlist(df_to_write$ALT), as.character) + + code_prediction <- as.data.frame(code_prediction) + code_prediction <- code_prediction[, -c(1:9)] + + tables_match <- match(rownames(df_to_write), rownames(code_prediction)) + df_to_write <- cbind(df_to_write, code_prediction[tables_match,]) + + # Add frequencies + # Recalculate for outgroup because VCF may have been filtered + # But use the genotypes in VCF and not the recoded GT + df_to_write$af_saevissima_outgroup <- af_by_pop(gt = geno(vcf)$GT, + pop = saevissima) + df_to_write$derived_allele <- ifelse(df_to_write$af_saevissima_outgroup == 0, + "REF", "ALT") + + colnames(allele_frequencies_ma) <- paste0("daf_", colnames(allele_frequencies_ma)) + df_to_write <- cbind(df_to_write, allele_frequencies_ma) + + # Add measurement of each private/shared + df_to_write$private_to_invicta_bb <- private_to_invicta_bb + df_to_write$private_to_richteri_bb <- private_to_richteri_bb + + df_to_write$shared_lb_invicta_bb <- shared_lb_invicta_bb + df_to_write$shared_lb_richteri_bb <- shared_lb_richteri_bb + + # Add region + df_to_write$chr <- regions_gr$chr_name[i] + df_to_write$region <- regions_gr$region[i] + + # ---------------------------------------------------------------------------- # + out_path <- paste("results/by_scaff/private_and_shared_per_site_", i, ".csv", sep = "") + + write.csv(df_to_write, file = out_path, + row.names=FALSE, quote=FALSE) + # ---------------------------------------------------------------------------- # + } + } +} diff --git a/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq4.r b/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq4.r new file mode 100644 index 0000000..e63ad27 --- /dev/null +++ b/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq4.r @@ -0,0 +1,318 @@ +library(GenomicRanges) +library(VariantAnnotation) +library(GenomicFeatures) + +# VCF file +vcf_path <- "tmp/biallelic_snps.vcf.gz" + +sample_table <- read.table("results/samples", header = TRUE) + +invicta_bb <- sample_table$sample[sample_table$species_variant == "invicta_bb"] +invicta_lb <- sample_table$sample[sample_table$species_variant == "invicta_lb"] +richteri_bb <- sample_table$sample[sample_table$species_variant == "richteri_bb"] +richteri_lb <- sample_table$sample[sample_table$species_variant == "richteri_lb"] +saevissima <- sample_table$sample[sample_table$species_variant == "saevissima_outgroup"] +all_samples <- c(invicta_bb, invicta_lb, richteri_bb, richteri_lb, saevissima) + +dir.create("results/by_scaff", recursive=TRUE) + +# ---------------------------------------------------------------------------- # +# Get supergene/non supergene region +# ---------------------------------------------------------------------------- # + +regions <- read.table("input/linkage_map_supergene.txt", header = TRUE) +regions_gr <- regions[, c("scaffold", "scaffold_start", "scaffold_end", "orientation", "chr", "region")] +colnames(regions_gr) <- c( + "chr", "start", "end", "orientation", "chr_name", "region" +) +regions_gr <- GRanges(regions_gr) + +# ---------------------------------------------------------------------------- # +# Read genome fasta and gff file, to predict coding changes +# ---------------------------------------------------------------------------- # +txdb <- makeTxDbFromGFF("input/gnga.gff", format = "gff3") +gnga <- readDNAStringSet("input/ref.fa") +names(gnga) <- gsub(" .*", "", names(gnga)) + +# ---------------------------------------------------------------------------- # +# Read VCF bit by bit +# ---------------------------------------------------------------------------- # +scaffs_in_vcf <- headerTabix("tmp/biallelic_snps.vcf.gz")$seqnames +for (i in 301:400) { + + if (as.character(seqnames(regions_gr[i])) %in% scaffs_in_vcf) { + # ---------------------------------------------------------------------------- # + # Read VCF (VariantAnnotation) + # ---------------------------------------------------------------------------- # + + param <- ScanVcfParam(which = regions_gr[i], + geno = "GT", + samples = all_samples) + + vcf <- readVcf(vcf_path, row.names=TRUE, param=param) + + # ---------------------------------------------------------------------------- # + # Remove weird sites + # ---------------------------------------------------------------------------- # + acceptable_gt <- c("0|0", "1|1", ".|.") + bi_allelic <- apply(geno(vcf)$GT, 1, function(gt_row) all(gt_row %in% acceptable_gt)) + vcf <- vcf[bi_allelic,] + + # keep only sites with a single alternative - too slow + # now done with bcftools in advance + # vcf <- vcf[sapply(rowRanges(vcf)$ALT, length) == 1, ] + + # ---------------------------------------------------------------------------- # + # Recode GT + # ---------------------------------------------------------------------------- # + + geno(vcf)$GT[geno(vcf)$GT == "0|0"] <- "0" + geno(vcf)$GT[geno(vcf)$GT == "1|1"] <- "1" + geno(vcf)$GT[geno(vcf)$GT == ".|."] <- NA + + # ---------------------------------------------------------------------------- # + # No missing + # ---------------------------------------------------------------------------- # + # Analysis will only work if most individuals in all populations carry the SNP + + missing_per_row <- function(gt_row, max_missing) { + # max_missing should be a proportion + stopifnot(max_missing < 1, max_missing >= 0) + max_missing <- max_missing * length(gt_row) + + # retuns TRUE is the number of missing genopypes is smaller than max_missing + return(sum(is.na(gt_row)) < max_missing) + } + + missing_per_pop <- function(gt, pop, max_missing) { + gt <- gt[,pop] + # returns vector of TRUE or FALSE + not_missing <- apply(gt, 1, function(row) missing_per_row(row, max_missing)) + } + + max_missing <- 0.20 # that's 5.6 in 28 + + # filter by missing + not_missing_pop <- missing_per_pop(geno(vcf)$GT, invicta_bb, max_missing) & + missing_per_pop(geno(vcf)$GT, invicta_lb, max_missing) & + missing_per_pop(geno(vcf)$GT, richteri_bb, max_missing) & + missing_per_pop(geno(vcf)$GT, richteri_lb, max_missing) & + missing_per_pop(geno(vcf)$GT, saevissima, max_missing) + + # sum(not_missing_pop) + # 11979 in 12222 + vcf <- vcf[not_missing_pop, ] + + # ---------------------------------------------------------------------------- # + # Needs to be polymorphic + # ---------------------------------------------------------------------------- # + + alternative_frequency <- function(gt_row) { + + # Deal with missing data + if (all(is.na(gt_row))) { + print(gt_row) + stop("Error: all samples are NA") + } + gt_row <- gt_row[!is.na(gt_row)] + + # Calculate frquency of alternative (0) allele + aaf <- sum(gt_row == 1) / length(gt_row) + + } + + af <- apply(geno(vcf)$GT, 1, function(row) alternative_frequency(row)) + + vcf <- vcf[(af != 0) & (af != 1)] + + # ---------------------------------------------------------------------------- # + # Remove sites that are polymorphic in the outgroup species + # ---------------------------------------------------------------------------- # + + af_by_pop <- function(gt, pop) { + apply(gt[,pop], 1, function(row) alternative_frequency(row)) + } + + af_outgroup <- af_by_pop(gt = geno(vcf)$GT, pop = saevissima) + + # sum((af_outgroup == 0) & (af_outgroup == 1)) + # [1] 3836 + + vcf <- vcf[(af_outgroup == 0) | (af_outgroup == 1), ] + af_outgroup <- af_outgroup[(af_outgroup == 0) | (af_outgroup == 1)] + + # ---------------------------------------------------------------------------- # + # Predict coding + # ---------------------------------------------------------------------------- # + if (nrow(vcf) > 0) { + code_prediction <- predictCoding(vcf, txdb, seqSource = gnga) + + # Remove SNPs that have a coding effect in multiple transcripts + duplicated_SNPs <- names(code_prediction)[duplicated(names(code_prediction))] + + vcf <- vcf[!rownames(vcf) %in% duplicated_SNPs] + + code_prediction <- code_prediction[!names(code_prediction) %in% duplicated_SNPs] + } + + if (nrow(vcf) > 0) { + + # ---------------------------------------------------------------------------- # + # Recode SNPs as numeric + # ---------------------------------------------------------------------------- # + + gt_data <- rowRanges(vcf) + gt <- t(apply(geno(vcf)$GT, 1, as.numeric)) + colnames(gt) <- colnames(geno(vcf)$GT) + + # ---------------------------------------------------------------------------- # + # Recode SNPs + # ---------------------------------------------------------------------------- # + # 0 is the allele carried by the outgroup + # 1 is the other allele + + recode_by_outgroup <- function(gt_row, outgroup) { + + no_missing <- gt_row[!is.na(gt_row)] + + if (!any(no_missing %in% c(0,1))) { + stop("The only allowed genotypes are 1 and 0") + } + + if (all(no_missing == no_missing[1])) { + stop("Site needs to be polymorphic") + } + + outgroup_gt <- gt_row[outgroup] + if (all(is.na(outgroup_gt))) { + print(outgroup_gt) + stop("Error: all outgroup samples are NA") + } + + outgroup_gt <- outgroup_gt[!is.na(outgroup_gt)] + + if (any(outgroup_gt != outgroup_gt[1])) { + print(outgroup_gt) + stop("Error: outgroup cannot be polymorphic") + } + + # The following is necessary in case there is missing data + + # If the first element of outgroup is the alternative allele (1), + # Then 1 -> 0 and 0 -> 1 + if (outgroup_gt[1] == 1) { + + gt_row <- c(1,0)[match(gt_row,c(0,1))] + + } + + return(gt_row) + + } + + recoded_gt <- t(apply(gt, 1, function(row) recode_by_outgroup(row, outgroup = saevissima))) + colnames(recoded_gt) <- colnames(gt) + + gt <- recoded_gt + + # ---------------------------------------------------------------------------- # + # Allele frequency of each population + # ---------------------------------------------------------------------------- # + + allele_frequencies_ma <- cbind( + invicta_bb = af_by_pop(gt = gt, pop = invicta_bb), + invicta_lb = af_by_pop(gt = gt, pop = invicta_lb), + richteri_bb = af_by_pop(gt = gt, pop = richteri_bb), + richteri_lb = af_by_pop(gt = gt, pop = richteri_lb) + ) + + # ---------------------------------------------------------------------------- # + # Private to SB + # ---------------------------------------------------------------------------- # + + private_alleles <- function(freq_matrix, pop) { + pop_freq <- freq_matrix[, pop, drop = FALSE] + other_freq <- freq_matrix[, !colnames(freq_matrix) %in% pop, drop = FALSE] + + # Present in population 1 and no other population + # The sum of frequencies will be greater than 0 if the allele is present + # in at least one of the other populations + (apply(pop_freq, 1, sum) > 0) & (apply(other_freq, 1, sum) == 0) + + } + + private_to_invicta_bb <- private_alleles(allele_frequencies_ma, pop="invicta_bb") + private_to_richteri_bb <- private_alleles(allele_frequencies_ma, pop="richteri_bb") + + # ---------------------------------------------------------------------------- # + # Shared between clades + # ---------------------------------------------------------------------------- # + shared_between_clades <- function(freq_matrix, pop1, pop2) { + pop1_freq <- freq_matrix[, pop1, drop = FALSE] + pop2_freqs <- freq_matrix[, pop2, drop = FALSE] + + # Present in population 1 and at least one of the other populations + # The sum of frequencies will be greater than 0 if the allele is present + # in at least one of the other populations + (apply(pop1_freq, 1, sum) > 0) & (apply(pop2_freqs, 1, sum) > 0) + } + + shared_lb_invicta_bb <- shared_between_clades( + allele_frequencies_ma, + pop1 = "invicta_bb", + pop2 = c("invicta_lb", "richteri_lb") + ) & + private_alleles(allele_frequencies_ma, c("invicta_bb", "invicta_lb", "richteri_lb")) + + shared_lb_richteri_bb <- shared_between_clades( + allele_frequencies_ma, + pop1 = "richteri_bb", + pop2 = c("invicta_lb", "richteri_lb") + ) & + private_alleles(allele_frequencies_ma, c("richteri_bb", "invicta_lb", "richteri_lb")) + + # ---------------------------------------------------------------------------- # + # Prepare end data frame + # ---------------------------------------------------------------------------- # + df_to_write <- as.data.frame(rowRanges(vcf)) + + df_to_write$paramRangeID <- NULL + df_to_write$ALT <- sapply(unlist(df_to_write$ALT), as.character) + + code_prediction <- as.data.frame(code_prediction) + code_prediction <- code_prediction[, -c(1:9)] + + tables_match <- match(rownames(df_to_write), rownames(code_prediction)) + df_to_write <- cbind(df_to_write, code_prediction[tables_match,]) + + # Add frequencies + # Recalculate for outgroup because VCF may have been filtered + # But use the genotypes in VCF and not the recoded GT + df_to_write$af_saevissima_outgroup <- af_by_pop(gt = geno(vcf)$GT, + pop = saevissima) + df_to_write$derived_allele <- ifelse(df_to_write$af_saevissima_outgroup == 0, + "REF", "ALT") + + colnames(allele_frequencies_ma) <- paste0("daf_", colnames(allele_frequencies_ma)) + df_to_write <- cbind(df_to_write, allele_frequencies_ma) + + # Add measurement of each private/shared + df_to_write$private_to_invicta_bb <- private_to_invicta_bb + df_to_write$private_to_richteri_bb <- private_to_richteri_bb + + df_to_write$shared_lb_invicta_bb <- shared_lb_invicta_bb + df_to_write$shared_lb_richteri_bb <- shared_lb_richteri_bb + + # Add region + df_to_write$chr <- regions_gr$chr_name[i] + df_to_write$region <- regions_gr$region[i] + + # ---------------------------------------------------------------------------- # + out_path <- paste("results/by_scaff/private_and_shared_per_site_", i, ".csv", sep = "") + + write.csv(df_to_write, file = out_path, + row.names=FALSE, quote=FALSE) + # ---------------------------------------------------------------------------- # + } + } +} diff --git a/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq5.r b/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq5.r new file mode 100644 index 0000000..55ddaa7 --- /dev/null +++ b/Private and shared alleles/Polymorphisms in invicta and richteri/allele_freq5.r @@ -0,0 +1,318 @@ +library(GenomicRanges) +library(VariantAnnotation) +library(GenomicFeatures) + +# VCF file +vcf_path <- "tmp/biallelic_snps.vcf.gz" + +sample_table <- read.table("results/samples", header = TRUE) + +invicta_bb <- sample_table$sample[sample_table$species_variant == "invicta_bb"] +invicta_lb <- sample_table$sample[sample_table$species_variant == "invicta_lb"] +richteri_bb <- sample_table$sample[sample_table$species_variant == "richteri_bb"] +richteri_lb <- sample_table$sample[sample_table$species_variant == "richteri_lb"] +saevissima <- sample_table$sample[sample_table$species_variant == "saevissima_outgroup"] +all_samples <- c(invicta_bb, invicta_lb, richteri_bb, richteri_lb, saevissima) + +dir.create("results/by_scaff", recursive=TRUE) + +# ---------------------------------------------------------------------------- # +# Get supergene/non supergene region +# ---------------------------------------------------------------------------- # + +regions <- read.table("input/linkage_map_supergene.txt", header = TRUE) +regions_gr <- regions[, c("scaffold", "scaffold_start", "scaffold_end", "orientation", "chr", "region")] +colnames(regions_gr) <- c( + "chr", "start", "end", "orientation", "chr_name", "region" +) +regions_gr <- GRanges(regions_gr) + +# ---------------------------------------------------------------------------- # +# Read genome fasta and gff file, to predict coding changes +# ---------------------------------------------------------------------------- # +txdb <- makeTxDbFromGFF("input/gnga.gff", format = "gff3") +gnga <- readDNAStringSet("input/ref.fa") +names(gnga) <- gsub(" .*", "", names(gnga)) + +# ---------------------------------------------------------------------------- # +# Read VCF bit by bit +# ---------------------------------------------------------------------------- # +scaffs_in_vcf <- headerTabix("tmp/biallelic_snps.vcf.gz")$seqnames +for (i in 401:length(regions_gr)) { + + if (as.character(seqnames(regions_gr[i])) %in% scaffs_in_vcf) { + # ---------------------------------------------------------------------------- # + # Read VCF (VariantAnnotation) + # ---------------------------------------------------------------------------- # + + param <- ScanVcfParam(which = regions_gr[i], + geno = "GT", + samples = all_samples) + + vcf <- readVcf(vcf_path, row.names=TRUE, param=param) + + # ---------------------------------------------------------------------------- # + # Remove weird sites + # ---------------------------------------------------------------------------- # + acceptable_gt <- c("0|0", "1|1", ".|.") + bi_allelic <- apply(geno(vcf)$GT, 1, function(gt_row) all(gt_row %in% acceptable_gt)) + vcf <- vcf[bi_allelic,] + + # keep only sites with a single alternative - too slow + # now done with bcftools in advance + # vcf <- vcf[sapply(rowRanges(vcf)$ALT, length) == 1, ] + + # ---------------------------------------------------------------------------- # + # Recode GT + # ---------------------------------------------------------------------------- # + + geno(vcf)$GT[geno(vcf)$GT == "0|0"] <- "0" + geno(vcf)$GT[geno(vcf)$GT == "1|1"] <- "1" + geno(vcf)$GT[geno(vcf)$GT == ".|."] <- NA + + # ---------------------------------------------------------------------------- # + # No missing + # ---------------------------------------------------------------------------- # + # Analysis will only work if most individuals in all populations carry the SNP + + missing_per_row <- function(gt_row, max_missing) { + # max_missing should be a proportion + stopifnot(max_missing < 1, max_missing >= 0) + max_missing <- max_missing * length(gt_row) + + # retuns TRUE is the number of missing genopypes is smaller than max_missing + return(sum(is.na(gt_row)) < max_missing) + } + + missing_per_pop <- function(gt, pop, max_missing) { + gt <- gt[,pop] + # returns vector of TRUE or FALSE + not_missing <- apply(gt, 1, function(row) missing_per_row(row, max_missing)) + } + + max_missing <- 0.20 # that's 5.6 in 28 + + # filter by missing + not_missing_pop <- missing_per_pop(geno(vcf)$GT, invicta_bb, max_missing) & + missing_per_pop(geno(vcf)$GT, invicta_lb, max_missing) & + missing_per_pop(geno(vcf)$GT, richteri_bb, max_missing) & + missing_per_pop(geno(vcf)$GT, richteri_lb, max_missing) & + missing_per_pop(geno(vcf)$GT, saevissima, max_missing) + + # sum(not_missing_pop) + # 11979 in 12222 + vcf <- vcf[not_missing_pop, ] + + # ---------------------------------------------------------------------------- # + # Needs to be polymorphic + # ---------------------------------------------------------------------------- # + + alternative_frequency <- function(gt_row) { + + # Deal with missing data + if (all(is.na(gt_row))) { + print(gt_row) + stop("Error: all samples are NA") + } + gt_row <- gt_row[!is.na(gt_row)] + + # Calculate frquency of alternative (0) allele + aaf <- sum(gt_row == 1) / length(gt_row) + + } + + af <- apply(geno(vcf)$GT, 1, function(row) alternative_frequency(row)) + + vcf <- vcf[(af != 0) & (af != 1)] + + # ---------------------------------------------------------------------------- # + # Remove sites that are polymorphic in the outgroup species + # ---------------------------------------------------------------------------- # + + af_by_pop <- function(gt, pop) { + apply(gt[,pop], 1, function(row) alternative_frequency(row)) + } + + af_outgroup <- af_by_pop(gt = geno(vcf)$GT, pop = saevissima) + + # sum((af_outgroup == 0) & (af_outgroup == 1)) + # [1] 3836 + + vcf <- vcf[(af_outgroup == 0) | (af_outgroup == 1), ] + af_outgroup <- af_outgroup[(af_outgroup == 0) | (af_outgroup == 1)] + + # ---------------------------------------------------------------------------- # + # Predict coding + # ---------------------------------------------------------------------------- # + if (nrow(vcf) > 0) { + code_prediction <- predictCoding(vcf, txdb, seqSource = gnga) + + # Remove SNPs that have a coding effect in multiple transcripts + duplicated_SNPs <- names(code_prediction)[duplicated(names(code_prediction))] + + vcf <- vcf[!rownames(vcf) %in% duplicated_SNPs] + + code_prediction <- code_prediction[!names(code_prediction) %in% duplicated_SNPs] + } + + if (nrow(vcf) > 0) { + + # ---------------------------------------------------------------------------- # + # Recode SNPs as numeric + # ---------------------------------------------------------------------------- # + + gt_data <- rowRanges(vcf) + gt <- t(apply(geno(vcf)$GT, 1, as.numeric)) + colnames(gt) <- colnames(geno(vcf)$GT) + + # ---------------------------------------------------------------------------- # + # Recode SNPs + # ---------------------------------------------------------------------------- # + # 0 is the allele carried by the outgroup + # 1 is the other allele + + recode_by_outgroup <- function(gt_row, outgroup) { + + no_missing <- gt_row[!is.na(gt_row)] + + if (!any(no_missing %in% c(0,1))) { + stop("The only allowed genotypes are 1 and 0") + } + + if (all(no_missing == no_missing[1])) { + stop("Site needs to be polymorphic") + } + + outgroup_gt <- gt_row[outgroup] + if (all(is.na(outgroup_gt))) { + print(outgroup_gt) + stop("Error: all outgroup samples are NA") + } + + outgroup_gt <- outgroup_gt[!is.na(outgroup_gt)] + + if (any(outgroup_gt != outgroup_gt[1])) { + print(outgroup_gt) + stop("Error: outgroup cannot be polymorphic") + } + + # The following is necessary in case there is missing data + + # If the first element of outgroup is the alternative allele (1), + # Then 1 -> 0 and 0 -> 1 + if (outgroup_gt[1] == 1) { + + gt_row <- c(1,0)[match(gt_row,c(0,1))] + + } + + return(gt_row) + + } + + recoded_gt <- t(apply(gt, 1, function(row) recode_by_outgroup(row, outgroup = saevissima))) + colnames(recoded_gt) <- colnames(gt) + + gt <- recoded_gt + + # ---------------------------------------------------------------------------- # + # Allele frequency of each population + # ---------------------------------------------------------------------------- # + + allele_frequencies_ma <- cbind( + invicta_bb = af_by_pop(gt = gt, pop = invicta_bb), + invicta_lb = af_by_pop(gt = gt, pop = invicta_lb), + richteri_bb = af_by_pop(gt = gt, pop = richteri_bb), + richteri_lb = af_by_pop(gt = gt, pop = richteri_lb) + ) + + # ---------------------------------------------------------------------------- # + # Private to SB + # ---------------------------------------------------------------------------- # + + private_alleles <- function(freq_matrix, pop) { + pop_freq <- freq_matrix[, pop, drop = FALSE] + other_freq <- freq_matrix[, !colnames(freq_matrix) %in% pop, drop = FALSE] + + # Present in population 1 and no other population + # The sum of frequencies will be greater than 0 if the allele is present + # in at least one of the other populations + (apply(pop_freq, 1, sum) > 0) & (apply(other_freq, 1, sum) == 0) + + } + + private_to_invicta_bb <- private_alleles(allele_frequencies_ma, pop="invicta_bb") + private_to_richteri_bb <- private_alleles(allele_frequencies_ma, pop="richteri_bb") + + # ---------------------------------------------------------------------------- # + # Shared between clades + # ---------------------------------------------------------------------------- # + shared_between_clades <- function(freq_matrix, pop1, pop2) { + pop1_freq <- freq_matrix[, pop1, drop = FALSE] + pop2_freqs <- freq_matrix[, pop2, drop = FALSE] + + # Present in population 1 and at least one of the other populations + # The sum of frequencies will be greater than 0 if the allele is present + # in at least one of the other populations + (apply(pop1_freq, 1, sum) > 0) & (apply(pop2_freqs, 1, sum) > 0) + } + + shared_lb_invicta_bb <- shared_between_clades( + allele_frequencies_ma, + pop1 = "invicta_bb", + pop2 = c("invicta_lb", "richteri_lb") + ) & + private_alleles(allele_frequencies_ma, c("invicta_bb", "invicta_lb", "richteri_lb")) + + shared_lb_richteri_bb <- shared_between_clades( + allele_frequencies_ma, + pop1 = "richteri_bb", + pop2 = c("invicta_lb", "richteri_lb") + ) & + private_alleles(allele_frequencies_ma, c("richteri_bb", "invicta_lb", "richteri_lb")) + + # ---------------------------------------------------------------------------- # + # Prepare end data frame + # ---------------------------------------------------------------------------- # + df_to_write <- as.data.frame(rowRanges(vcf)) + + df_to_write$paramRangeID <- NULL + df_to_write$ALT <- sapply(unlist(df_to_write$ALT), as.character) + + code_prediction <- as.data.frame(code_prediction) + code_prediction <- code_prediction[, -c(1:9)] + + tables_match <- match(rownames(df_to_write), rownames(code_prediction)) + df_to_write <- cbind(df_to_write, code_prediction[tables_match,]) + + # Add frequencies + # Recalculate for outgroup because VCF may have been filtered + # But use the genotypes in VCF and not the recoded GT + df_to_write$af_saevissima_outgroup <- af_by_pop(gt = geno(vcf)$GT, + pop = saevissima) + df_to_write$derived_allele <- ifelse(df_to_write$af_saevissima_outgroup == 0, + "REF", "ALT") + + colnames(allele_frequencies_ma) <- paste0("daf_", colnames(allele_frequencies_ma)) + df_to_write <- cbind(df_to_write, allele_frequencies_ma) + + # Add measurement of each private/shared + df_to_write$private_to_invicta_bb <- private_to_invicta_bb + df_to_write$private_to_richteri_bb <- private_to_richteri_bb + + df_to_write$shared_lb_invicta_bb <- shared_lb_invicta_bb + df_to_write$shared_lb_richteri_bb <- shared_lb_richteri_bb + + # Add region + df_to_write$chr <- regions_gr$chr_name[i] + df_to_write$region <- regions_gr$region[i] + + # ---------------------------------------------------------------------------- # + out_path <- paste("results/by_scaff/private_and_shared_per_site_", i, ".csv", sep = "") + + write.csv(df_to_write, file = out_path, + row.names=FALSE, quote=FALSE) + # ---------------------------------------------------------------------------- # + } + } +} diff --git a/Private and shared alleles/Polymorphisms in invicta and richteri/shared_private.rmd b/Private and shared alleles/Polymorphisms in invicta and richteri/shared_private.rmd new file mode 100644 index 0000000..2145e65 --- /dev/null +++ b/Private and shared alleles/Polymorphisms in invicta and richteri/shared_private.rmd @@ -0,0 +1,656 @@ +--- +title: "Comparing ILS and introgression scenarios using SNPs" +author: "Rodrigo Pracana, QMUL" +date: "09/11/2021" +output: html_document +--- + +```{r setup, include=FALSE, cache = FALSE} + + +make_sci <- function(x) { + sprintf("%.1fx10^{%d}", + x/10^floor(log10(abs(x))), + floor(log10(abs(x)))) +} + +pretty_number_inner <- function(x) { + if (is.na(x)) { + x <- NA + } else if (x == 0) { + x <- as.character(x) + } else if (x < 1e-4) { + x <- as.character(make_sci(x)) + } else if (x < 1e-2) { + x <- as.character(round(x, 4)) + } else if (x < 1e4) { + x <- as.character(round(x, 2)) + } else { + x <- as.character(make_sci(x)) + } + return(x) +} + +pretty_numbers <- function(x) { + sapply(x, pretty_number_inner) +} + +knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) + + +library(tidyverse) +library(patchwork) +library(broom) + +dir.create(recursive = T, "results/fig") + +``` + +```{r} + +snp_freq <- read.csv("results/snp_frequencies.csv") + +snp_freq$CONSEQUENCE[is.na(snp_freq$CONSEQUENCE)] <- "not protein-coding" + +``` + +## Introduction + +We are studying `r nrow(snp_freq)` SNP sites. + +```{r} + +snp_freq %>% + group_by(region) %>% count + +snp_freq %>% + group_by(region, CONSEQUENCE) %>% + summarise(n = n()) %>% + mutate(percent = round(100 * n / sum(n))) + +``` + +Within a species, the allele frequency of SNPs in the Sb samples and SB samples should be highly correlated, except in the supergene. This is seen in _S. invicta_: + +```{r} + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous", "not protein-coding")) %>% + ggplot() + + geom_point(aes(x = daf_invicta_bb, y = daf_invicta_lb), alpha = 0.5) + + facet_grid(cols = vars(CONSEQUENCE), rows = vars(region)) + + theme_bw() + +``` + +It is also seen in _S. richteri_: + +```{r} + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous", "not protein-coding")) %>% + ggplot() + + geom_point(aes(x = daf_richteri_bb, y = daf_richteri_lb), alpha = 0.5) + + facet_grid(cols = vars(CONSEQUENCE), rows = vars(region)) + + theme_bw() + +``` + +When we compare between species, we see the following: + +```{r} + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous", "not protein-coding")) %>% + ggplot() + + geom_point(aes(x = daf_invicta_bb, y = daf_richteri_bb), alpha = 0.5) + + facet_grid(cols = vars(CONSEQUENCE), rows = vars(region)) + + theme_bw() + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous", "not protein-coding")) %>% + ggplot() + + geom_point(aes(x = daf_invicta_lb, y = daf_richteri_lb), alpha = 0.5) + + facet_grid(cols = vars(CONSEQUENCE), rows = vars(region)) + + theme_bw() + + +``` + +## Private to SB + +```{r} + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "not protein-coding")) %>% + pivot_longer(c(private_to_invicta_bb, private_to_richteri_bb)) %>% + group_by(region, CONSEQUENCE, name) %>% + summarize(number = sum(value), + percentage = round(100* sum(value)/n()), + n = n()) + +``` + +```{r} + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous")) %>% + pivot_longer(c(private_to_invicta_bb, private_to_richteri_bb)) %>% + group_by(region, CONSEQUENCE, name, TXID) %>% + summarize(number = sum(value)) %>% + pivot_wider(names_from = name, values_from = number) %>% + ggplot() + + geom_point(aes(x = private_to_invicta_bb, y =private_to_richteri_bb), alpha = 0.5) + + facet_grid(cols = vars(CONSEQUENCE), rows = vars(region)) + + theme_bw() + + coord_fixed(xlim = c(0, 20), ylim = c(0,20)) + +``` + +## Shared between SB and the two Sb + +```{r} + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "not protein-coding")) %>% + pivot_longer(c(shared_lb_invicta_bb, shared_lb_richteri_bb)) %>% + group_by(region, CONSEQUENCE, name) %>% + summarize(number = sum(value), + percentage = round(100* sum(value)/n()), + n = n()) + +``` + +```{r} + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous")) %>% + pivot_longer(c(shared_lb_invicta_bb, shared_lb_richteri_bb)) %>% + group_by(region, CONSEQUENCE, name, TXID) %>% + summarize(number = sum(value)) %>% + pivot_wider(names_from = name, values_from = number) %>% + ggplot() + + geom_point(aes(x = shared_lb_invicta_bb, y =shared_lb_richteri_bb), alpha = 0.5) + + facet_grid(cols = vars(CONSEQUENCE), rows = vars(region)) + + theme_bw() + + coord_fixed(xlim = c(0, 20), ylim = c(0,20)) + +``` + +## Private to each lineage + +```{r} +private_alleles <- function(freq_matrix, pop) { + pop_freq <- freq_matrix[, pop, drop = FALSE] + other_freq <- freq_matrix[, !colnames(freq_matrix) %in% pop, drop = FALSE] + + # Present in population 1 and no other population + # The sum of frequencies will be greater than 0 if the allele is present + # in at least one of the other populations + (apply(pop_freq, 1, sum) > 0) & (apply(other_freq, 1, sum) == 0) + +} + +private_to_invicta_bb <- private_alleles(allele_frequencies_ma, pop="invicta_bb") +private_to_richteri_bb <- private_alleles(allele_frequencies_ma, pop="richteri_bb") + +allele_frequencies_ma <- snp_freq[, c("daf_invicta_bb", "daf_invicta_lb", "daf_richteri_bb", "daf_richteri_lb")] +allele_frequencies_ma <- as.matrix(allele_frequencies_ma) + + + +snp_freq$private_to_richteri <- snp_freq$daf_invicta_bb == 0 & snp_freq$daf_invicta_lb == 0 & +(snp_freq$daf_richteri_bb > 0 | snp_freq$daf_richteri_lb > 0 ) + +snp_freq$private_to_invicta <- snp_freq$daf_richteri_bb == 0 & snp_freq$daf_richteri_lb == 0 & +(snp_freq$daf_invicta_bb > 0 | snp_freq$daf_invicta_lb > 0 ) + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "not protein-coding")) %>% + pivot_longer(c(private_to_invicta, private_to_richteri)) %>% + group_by(region, CONSEQUENCE, name) %>% + summarize(number = sum(value), + percentage = round(100* sum(value)/n()), + n = n()) + +snp_freq$private_to_richteri_bb <- + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_lb == 0 & + snp_freq$daf_richteri_bb > 0 + +snp_freq$private_to_richteri_lb <- + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_richteri_lb > 0 + + +snp_freq$private_to_invicta_bb <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_lb == 0 & + snp_freq$daf_invicta_bb > 0 + +snp_freq$private_to_invicta_lb <- + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_richteri_lb == 0 & + snp_freq$daf_invicta_lb > 0 + + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "not protein-coding")) %>% + pivot_longer(c(private_to_richteri_bb, private_to_richteri_lb, + private_to_invicta_bb, private_to_invicta_lb)) %>% + group_by(region, CONSEQUENCE, name) %>% + summarize(number = sum(value), + percentage = round(100* sum(value)/n()), + n = n()) + +``` + +## Fixed differences between SB and Sb + +```{r} + +snp_freq$fixed_difference <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb == 1 & + snp_freq$daf_richteri_lb == 1 + +snp_freq %>% + group_by(region) %>% + summarize(number = sum(fixed_difference), + percentage = round(100* sum(fixed_difference)/n()), + n = n()) + +snp_freq %>% + group_by(region, CONSEQUENCE) %>% + summarize(number = sum(fixed_difference), + percentage = round(100* sum(fixed_difference)/n()), + n = n()) + +``` + +## Private alleles present in the Sb of both species + +```{r} + +snp_freq$in_both_Sb <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb > 0 & + snp_freq$daf_richteri_lb > 0 + +snp_freq %>% + group_by(region, CONSEQUENCE) %>% + summarize(number = sum(in_both_Sb), + percentage = round(100* sum(in_both_Sb)/n()), + n = n()) + +``` + +## Private alleles present in the SB of both species + +As expected, there are very few of these (they occur because of incomplete lineage sorting): + +```{r} + +snp_freq$in_both_Sb <- + snp_freq$daf_richteri_bb > 0 & + snp_freq$daf_invicta_bb > 0 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_lb == 0 + +snp_freq %>% + group_by(region, CONSEQUENCE) %>% + summarize(number = sum(in_both_Sb), + percentage = round(100* sum(in_both_Sb)/n()), + n = n()) + +``` + +```{r} + + +snp_freq %>% filter(region == "supergene") %>% + ggplot(aes(x = daf_invicta_bb, y = daf_richteri_bb)) + + geom_point(alpha = 0.5) + + facet_grid(cols=vars(in_both_Sb), rows = vars(CONSEQUENCE)) + + +``` + +## Alleles private to Sb, but not necessarily present in both species + +```{r} + +snp_freq$private_to_Sb <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb == 0 & + (snp_freq$daf_invicta_lb > 0 | snp_freq$daf_richteri_lb > 0) + +snp_freq %>% + group_by(region, CONSEQUENCE) %>% + summarize(number = sum(private_to_Sb), + percentage = round(100* sum(private_to_Sb)/n()), + n = n()) + +``` + +## Alleles present in Sb (but not necessarily private to Sb) + +```{r} + +snp_freq$present_in_Sb <- + (snp_freq$daf_invicta_lb > 0 | snp_freq$daf_richteri_lb > 0) + +snp_freq %>% + group_by(region, CONSEQUENCE) %>% + summarize(number = sum(present_in_Sb), + percentage = round(100* sum(present_in_Sb)/n()), + n = n()) + +``` + + +## Alleles present in SB (but not necessarily private to SB) + +```{r} + +snp_freq$present_in_SB <- + (snp_freq$daf_invicta_bb > 0 | snp_freq$daf_richteri_bb > 0) + +snp_freq %>% + group_by(region, CONSEQUENCE) %>% + summarize(number = sum(present_in_SB), + percentage = round(100* sum(present_in_SB)/n()), + n = n()) + +``` + +## Private but not fixed per lineage + +```{r} + +snp_freq$polymorphic_invicta_bb_only <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb > 0 & snp_freq$daf_invicta_bb < 1 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_lb == 0 + +snp_freq$polymorphic_invicta_lb_only <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb > 0 & snp_freq$daf_invicta_lb < 1 & + snp_freq$daf_richteri_lb == 0 + +snp_freq$polymorphic_richteri_bb_only <- + snp_freq$daf_richteri_bb > 0 & snp_freq$daf_richteri_bb < 1 & + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_lb == 0 + +snp_freq$polymorphic_richteri_lb_only <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_lb > 0 & snp_freq$daf_richteri_lb < 1 + +snp_freq$polymorphic_richteri_only <- + (snp_freq$daf_richteri_bb > 0 | snp_freq$daf_richteri_lb > 0) & + !(snp_freq$daf_richteri_bb == 1 & snp_freq$daf_richteri_lb == 1) & + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb == 0 + +snp_freq$polymorphic_invicta_only <- + (snp_freq$daf_invicta_bb > 0 | snp_freq$daf_invicta_lb > 0) & + !(snp_freq$daf_invicta_bb == 1 & snp_freq$daf_invicta_lb == 1) & + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_richteri_lb == 0 + +snp_freq$polymorphic_bb_only <- + snp_freq$daf_richteri_bb > 0 & snp_freq$daf_richteri_bb < 1 & + snp_freq$daf_invicta_bb > 0 & snp_freq$daf_invicta_bb < 1 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_lb == 0 + +snp_freq$polymorphic_lb_only <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb > 0 & snp_freq$daf_invicta_lb < 1 & + snp_freq$daf_richteri_lb > 0 & snp_freq$daf_richteri_lb < 1 + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous", "not protein-coding")) %>% + pivot_longer(c(polymorphic_invicta_bb_only, + polymorphic_invicta_lb_only, + polymorphic_richteri_bb_only, + polymorphic_richteri_lb_only, + polymorphic_richteri_only, + polymorphic_invicta_only, + polymorphic_bb_only, + polymorphic_lb_only)) %>% + group_by(region, name, CONSEQUENCE) %>% + summarize(number = sum(value)) %>% + pivot_wider(names_from = CONSEQUENCE, values_from = number) + +``` + +## Private and fixed per lineage + +```{r} + +snp_freq$fixed_invicta_bb_only <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb == 1 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_lb == 0 + +snp_freq$fixed_invicta_lb_only <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb == 1 & + snp_freq$daf_richteri_lb == 0 + +snp_freq$fixed_richteri_bb_only <- + snp_freq$daf_richteri_bb == 1 & + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_lb == 0 + +snp_freq$fixed_richteri_lb_only <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_lb == 1 + +snp_freq$fixed_richteri_only <- + snp_freq$daf_richteri_bb == 1 & + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_lb == 1 + +snp_freq$fixed_invicta_only <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb == 1 & + snp_freq$daf_invicta_lb == 1 & + snp_freq$daf_richteri_lb == 0 + +snp_freq$fixed_bb_only <- + snp_freq$daf_richteri_bb == 1 & + snp_freq$daf_invicta_bb == 1 & + snp_freq$daf_invicta_lb == 0 & + snp_freq$daf_richteri_lb == 0 + +snp_freq$fixed_lb_only <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_invicta_lb == 1 & + snp_freq$daf_richteri_lb == 1 + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous", "not protein-coding")) %>% + pivot_longer(c(fixed_invicta_bb_only, + fixed_invicta_lb_only, + fixed_richteri_bb_only, + fixed_richteri_lb_only, + fixed_richteri_only, + fixed_invicta_only, + fixed_bb_only, + fixed_lb_only)) %>% + group_by(region, name, CONSEQUENCE) %>% + summarize(number = sum(value)) %>% + pivot_wider(names_from = CONSEQUENCE, values_from = number) + + +``` + +## Shared between Sb and each of the SB species + +```{r} + +# Present in Sb and in S. invicta SB +snp_freq$pesent_sb_invicta_bb <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb > 0 & + (snp_freq$daf_invicta_lb > 0 | snp_freq$daf_richteri_lb > 0) + +# Present in Sb and in S. richteri SB +snp_freq$pesent_sb_richteri_bb <- + snp_freq$daf_richteri_bb > 0 & + snp_freq$daf_invicta_bb == 0 & + (snp_freq$daf_invicta_lb > 0 | snp_freq$daf_richteri_lb > 0) + +# Present in both species +snp_freq$pesent_both_species <- + (snp_freq$daf_richteri_bb > 0 | snp_freq$daf_richteri_lb > 0) & + (snp_freq$daf_invicta_bb > 0 | snp_freq$daf_invicta_lb > 0) + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous", "not protein-coding")) %>% + pivot_longer(c(pesent_sb_invicta_bb, + pesent_sb_richteri_bb, + pesent_both_species)) %>% + group_by(region, name, CONSEQUENCE) %>% + summarize(number = sum(value)) %>% + pivot_wider(names_from = CONSEQUENCE, values_from = number) %>% + mutate(total = synonymous + nonsynonymous + `not protein-coding`) + +``` + +## Shared between Sb and each of the SB species + +```{r} + +# Fixed in Sb and present in S. invicta SB +snp_freq$fixed_sb_present_invicta_bb <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb > 0 & + (snp_freq$daf_invicta_lb == 1 | snp_freq$daf_richteri_lb == 1) + +# Present in Sb and in S. richteri SB +snp_freq$fixed_sb_present_richteri_bb <- + snp_freq$daf_richteri_bb > 0 & + snp_freq$daf_invicta_bb == 0 & + (snp_freq$daf_invicta_lb == 1 | snp_freq$daf_richteri_lb == 1) + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous", "not protein-coding")) %>% + pivot_longer(c(fixed_sb_present_invicta_bb, + fixed_sb_present_richteri_bb)) %>% + group_by(region, name, CONSEQUENCE) %>% + summarize(number = sum(value)) %>% + pivot_wider(names_from = CONSEQUENCE, values_from = number) + +``` +## Shared between Sb and each of the SB species + +```{r} + +# Fixed in Sb and Fixed in S. invicta SB +snp_freq$fixed_sb_fixed_invicta_bb <- + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_bb == 1 & + (snp_freq$daf_invicta_lb == 1 | snp_freq$daf_richteri_lb == 1) + +# Fixed in Sb and Fixed in S. richteri SB +snp_freq$fixed_sb_fixed_richteri_bb <- + snp_freq$daf_richteri_bb == 1 & + snp_freq$daf_invicta_bb == 0 & + (snp_freq$daf_invicta_lb == 1 | snp_freq$daf_richteri_lb == 1) + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous", "not protein-coding")) %>% + pivot_longer(c(fixed_sb_fixed_invicta_bb, + fixed_sb_fixed_richteri_bb)) %>% + group_by(region, name, CONSEQUENCE) %>% + summarize(number = sum(value)) %>% + pivot_wider(names_from = CONSEQUENCE, values_from = number) + +``` + + +## Present in all populations + + +```{r} + +# Present in all populations +snp_freq$present_in_all <- + snp_freq$daf_richteri_bb > 0 & + snp_freq$daf_invicta_bb > 0 & + snp_freq$daf_invicta_lb > 0 & + snp_freq$daf_richteri_lb > 0 + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous", "not protein-coding")) %>% + pivot_longer(present_in_all) %>% + group_by(region, name, CONSEQUENCE) %>% + summarize(number = sum(value)) %>% + pivot_wider(names_from = CONSEQUENCE, values_from = number) + +``` + +## Deleterious load per species + +### Alleles private to each + +```{r} + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous")) %>% + pivot_longer(c(private_to_richteri_bb, private_to_richteri_lb, + private_to_invicta_bb, private_to_invicta_lb)) %>% + group_by(region, CONSEQUENCE, name) %>% + summarize(number = sum(value), + percentage = round(100* sum(value)/n()), + n = n()) + +``` + +### Alleles private to Sb and fixed in each species + +```{r} + +snp_freq$private_to_Sb_and_fixed_invicta <- + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_invicta_lb == 1 + +snp_freq$private_to_Sb_and_fixed_richteri <- + snp_freq$daf_invicta_bb == 0 & + snp_freq$daf_richteri_bb == 0 & + snp_freq$daf_richteri_lb == 1 + +snp_freq %>% + filter(CONSEQUENCE %in% c("synonymous", "nonsynonymous")) %>% + pivot_longer(c(private_to_Sb_and_fixed_invicta, private_to_Sb_and_fixed_richteri)) %>% + group_by(region, name, CONSEQUENCE) %>% + summarize(number = sum(value), + percentage = round(100* sum(value)/n()), + n = n()) + +```