diff --git a/MAPpoly.Rproj b/MAPpoly.Rproj index 9b14df21..67a3ef33 100755 --- a/MAPpoly.Rproj +++ b/MAPpoly.Rproj @@ -1,20 +1,20 @@ -Version: 1.0 - -RestoreWorkspace: Default -SaveWorkspace: Default -AlwaysSaveHistory: Default - -EnableCodeIndexing: Yes -UseSpacesForTab: Yes -NumSpacesForTab: 2 -Encoding: UTF-8 - -RnwWeave: knitr -LaTeX: pdfLaTeX - -BuildType: Package -PackageUseDevtools: Yes -PackageInstallArgs: --no-multiarch --with-keep.source --resave-data -PackageBuildArgs: --resave-data -PackageCheckArgs: --no-vignettes --as-cran --install-args=--build -PackageRoxygenize: rd,collate,namespace,vignette +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: knitr +LaTeX: pdfLaTeX + +BuildType: Package +PackageUseDevtools: Yes +PackageInstallArgs: --no-multiarch --with-keep.source --resave-data +PackageBuildArgs: --resave-data +PackageCheckArgs: --no-vignettes --as-cran --install-args=--build +PackageRoxygenize: rd,collate,namespace,vignette diff --git a/R/RcppExports.R b/R/RcppExports.R index 78f35b01..a0d59d91 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,23 +1,23 @@ -# Generated by using Rcpp::compileAttributes() -> do not edit by hand -# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 - -#' @export -.vcf_get_probabilities <- function(mat, pl_pos) { - .Call('_mappoly_vcf_get_probabilities', PACKAGE = 'mappoly', mat, pl_pos) -} - -#' @export -.vcf_transform_dosage <- function(mat, gt_pos) { - .Call('_mappoly_vcf_transform_dosage', PACKAGE = 'mappoly', mat, gt_pos) -} - -#' @export -.vcf_get_ploidy <- function(mat, gt_pos) { - .Call('_mappoly_vcf_get_ploidy', PACKAGE = 'mappoly', mat, gt_pos) -} - -#' @export -.vcf_get_depth <- function(mat, dp_pos) { - .Call('_mappoly_vcf_get_depth', PACKAGE = 'mappoly', mat, dp_pos) -} - +# Generated by using Rcpp::compileAttributes() -> do not edit by hand +# Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#' @export +.vcf_get_probabilities <- function(mat, pl_pos) { + .Call('_mappoly_vcf_get_probabilities', PACKAGE = 'mappoly', mat, pl_pos) +} + +#' @export +.vcf_transform_dosage <- function(mat, gt_pos) { + .Call('_mappoly_vcf_transform_dosage', PACKAGE = 'mappoly', mat, gt_pos) +} + +#' @export +.vcf_get_ploidy <- function(mat, gt_pos) { + .Call('_mappoly_vcf_get_ploidy', PACKAGE = 'mappoly', mat, gt_pos) +} + +#' @export +.vcf_get_depth <- function(mat, dp_pos) { + .Call('_mappoly_vcf_get_depth', PACKAGE = 'mappoly', mat, dp_pos) +} + diff --git a/R/build_map_by_steps.R b/R/build_map_by_steps.R index 567d9103..8cf6596d 100644 --- a/R/build_map_by_steps.R +++ b/R/build_map_by_steps.R @@ -1,508 +1,535 @@ -#' Design linkage map framework in two steps: i) estimating the recombination fraction with -#' HMM approach for each parent separately using only markers segregating individually -#' (e.g. map 1 - P1:3 x P2:0, P1: 2x4; map 2 - P1:0 x P2:3, P1:4 x P2:2); ii) merging both -#' maps and re-estimate recombination fractions. -#' -#' @param input.seq object of class \code{mappoly.sequence} -#' @param twopt object of class \code{mappoly.twopt} -#' @param start.set number of markers to start the phasing procedure (default = 4) -#' @param thres.twopt the LOD threshold used to determine if the linkage phases compared via two-point -#' analysis should be considered for the search space reduction (A.K.A. η in Mollinari and Garcia (2019), -#' default = 5) -#' @param thres.hmm the LOD threshold used to determine if the linkage phases compared via hmm analysis -#' should be evaluated in the next round of marker inclusion (default = 50) -#' @param extend.tail the length of the chain's tail that should be used to calculate the likelihood of -#' the map. If NULL (default), the function uses all markers positioned. Even if info.tail = TRUE, -#' it uses at least extend.tail as the tail length -#' @param inflation.lim.p1 the maximum accepted length difference between the current and the previous -#' parent 1 sub-map defined by arguments info.tail and extend.tail. If the size exceeds this limit, the marker will -#' not be inserted. If NULL(default), then it will insert all markers. -#' @param inflation.lim.p2 same as `inflation.lim.p1` but for parent 2 sub-map. -#' @param phase.number.limit the maximum number of linkage phases of the sub-maps defined by arguments info.tail -#' and extend.tail. Default is 20. If the size exceeds this limit, the marker will not be inserted. If Inf, -#' then it will insert all markers. -#' @param tol the desired accuracy during the sequential phase of each parental map (default = 10e-02) -#' @param tol.final the desired accuracy for the final parental map (default = 10e-04) -#' @param verbose If TRUE (default), current progress is shown; if FALSE, no output is produced -#' @param method indicates whether to use 'hmm' (Hidden Markov Models), 'ols' (Ordinary Least Squares) -#' to re-estimate the recombination fractions while merging the parental maps (default:hmm) -#' -#' @return list containing three \code{mappoly.map} objects:1) map built with markers with segregation information from parent 1; -#' 2) map built with markers with segregation information from parent 2; 3) maps in 1 and 2 merged -#' -#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} with documentation and minor modifications by Cristiane Taniguti \email{chtaniguti@tamu.edu} -#' -#' -#' @export -framework_map <- function(input.seq, - twopt, - start.set = 10, - thres.twopt = 10, - thres.hmm = 30, - extend.tail = 30, - inflation.lim.p1 = 5, - inflation.lim.p2 = 5, - phase.number.limit = 10, - tol = 10e-3, - tol.final = 10e-4, - verbose = TRUE, - method = "hmm"){ - - if (!inherits(input.seq, "mappoly.sequence")) { - stop(deparse(substitute(input.seq)), " is not an object of class 'mappoly.sequence'") - } - - if (!inherits(twopt, "mappoly.twopt")) { - stop(deparse(substitute(twopt)), " is not an object of class 'mappoly.twopt'") - } - ##### Map for P1 #### - s.p1 <- make_seq_mappoly(input.seq, info.parent = "p1") - tpt.p1 <- make_pairs_mappoly(twopt, s.p1) - map.p1 <- est_rf_hmm_sequential(input.seq = s.p1, - twopt = tpt.p1, - start.set = start.set, - thres.twopt = thres.twopt, - thres.hmm = thres.hmm, - extend.tail = extend.tail, - sub.map.size.diff.limit = inflation.lim.p1, - phase.number.limit = phase.number.limit, - reestimate.single.ph.configuration = TRUE, - tol = tol, - tol.final = tol.final, - verbose = verbose) - - ##### Map for P2 #### - s.p2 <- make_seq_mappoly(input.seq, info.parent = "p2") - tpt.p2 <- make_pairs_mappoly(twopt, s.p2) - map.p2 <- est_rf_hmm_sequential(input.seq = s.p2, - twopt = tpt.p2, - start.set = start.set, - thres.twopt = thres.twopt, - thres.hmm = thres.hmm, - extend.tail = extend.tail, - sub.map.size.diff.limit = inflation.lim.p2, - phase.number.limit = phase.number.limit, - reestimate.single.ph.configuration = TRUE, - tol = tol, - tol.final = tol.final, - verbose = verbose) - #### Merging P1 and P2 #### - rf.mat <- rf_list_to_matrix(twopt, - thresh.LOD.ph = thres.twopt, - shared.alleles = TRUE) - - map.p1.p2 <- merge_parental_maps(map.p1, - map.p2, - input.seq, - rf.mat, - method = method) - list(map.p1 = map.p1, - map.p2 = map.p2, - map.p1.p2 = map.p1.p2) -} - -#' Add markers that are informative in both parents using HMM approach and evaluating difference -#' in LOD and gap size -#' -#' @param input.map.list list containing three \code{mappoly.map} objects:1) map built with markers with segregation information from parent 1; -#' 2) map built with markers with segregation information from parent 2; 3) maps in 1 and 2 merged -#' @param input.seq object of class \code{mappoly.sequence} containing all markers for specific group -#' @param twopt object of class \code{mappoly.twopt} -#' @param thres.twopt the LOD threshold used to determine if the linkage phases compared via two-point -#' analysis should be considered for the search space reduction (A.K.A. η in Mollinari and Garcia (2019), -#' default = 5) -#' @param init.LOD the LOD threshold used to determine if the marker will be included or not after hmm analysis (default = 30) -#' @param verbose If TRUE (default), current progress is shown; if FALSE, no output is produced -#' @param method indicates whether to use 'hmm' (Hidden Markov Models), 'ols' (Ordinary Least Squares) or 'wMDS_to_1D_pc' -#' (weighted MDS followed by fitting a one dimensional principal curve) to re-estimate the recombination fractions after adding markers -#' @param input.mds An object of class \code{mappoly.map} -#' @param max.rounds integer defining number of times to try to fit the remaining markers in the sequence -#' @param gap.threshold threshold for gap size -#' @param size.rem.cluster threshold for number of markers that must contain in a segment after a gap is removed to keep this segment in the sequence -#' -#' @return object of class \code{mappoly.map2} -#' -#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} with documentation and minor modifications by Cristiane Taniguti \email{chtaniguti@tamu.edu} -#' -#' @export -update_framework_map <- function(input.map.list, - input.seq, - twopt, - thres.twopt = 10, - init.LOD = 30, - verbose = TRUE, - method = "hmm", - input.mds = NULL, - max.rounds = 50, - size.rem.cluster = 2, - gap.threshold = 4) -{ - - if (!all(sapply(input.map.list, function(x) inherits(x, "mappoly.map")))) { - stop(deparse(substitute(twopt)), " is not an object of class 'mappoly.map'") - } - if (!inherits(input.seq, "mappoly.sequence")) { - stop(deparse(substitute(input.seq)), " is not an object of class 'mappoly.sequence'") - } - - #### Inserting remaining markers #### - cur.map <- input.map.list$map.p1.p2 - cur.seq <- input.seq - LOD <- init.LOD - ll <- map.list <- vector("list", max.rounds) - la <- numeric(max.rounds) - i <- 1 - dat <- get(cur.map$info$data.name, pos = 1) - mrk.to.include <- setdiff(cur.seq$seq.mrk.names, cur.map$info$mrk.names) - - rf.mat <- rf_list_to_matrix(twopt, - thresh.LOD.ph = thres.twopt, - shared.alleles = TRUE) - - rem.mrk <- NULL - while(length(mrk.to.include) > 0 & i <= length(la)){ - cur.gen <- calc_genoprob(cur.map) - cur.res <- add_md_markers(input.map = cur.map, - mrk.to.include = mrk.to.include, - input.seq = cur.seq, - input.matrix = rf.mat, - input.genoprob = cur.gen, - input.data = dat, - input.mds = input.mds, - thresh = LOD, - method = "hmm", - verbose = verbose) - - a <- rem_mrk_clusters(cur.res$map, - size.rem.cluster = size.rem.cluster, - gap.threshold = gap.threshold) - rem.mrk <- c(rem.mrk, setdiff(cur.res$map$info$mrk.names, a$info$mrk.names)) - if(a$info$n.mrk < cur.res$map$info$n.mrk){ - if(method == "hmm"){ - cur.res$map <- reest_rf(a, method = "hmm", verbose = verbose) - } else if(method == "wMDS_to_1D_pc"){ - cur.res$map <- reest_rf(a, method = "wMDS_to_1D_pc", input.mds = input.mds) - } - } - #### resulting map - cur.map <- map.list[[i]] <- cur.res$map - ### resulting log-likelihood vector - ll[[i]] <- cur.res$ll[cur.res$map$info$mrk.names] - if(i != 1){ - if(map.list[[i-1]]$info$n.mrk >= map.list[[i]]$info$n.mrk) - { - LOD <- max(cur.res$ll, na.rm = TRUE) - 10 - if(LOD <= 0) LOD <- 10e-5 - } - } - la[i] <- LOD - mrk.to.include <- setdiff(cur.seq$seq.mrk.names, cur.map$info$mrk.names) - mrk.to.include <- setdiff(mrk.to.include, rem.mrk) - plot_map_list(map.list[1:i]) - text(rep(0, i), 1:i+.5, labels = unlist(sapply(map.list, function(x) x$info$n.mrk)), adj=-1) - i <- i + 1 - cat("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n") - print(summary_maps(map.list[!sapply(map.list, is.null)], verbose = verbose)) - cat("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n") - } - #### Post-mapping #### - id <- which(la!=0) - structure(list(single = list(map.p1 = input.map.list$map.p1, - map.p2 = input.map.list$map.p2, - map.p1.p2 = input.map.list$map.p1.p2), - both = list(map.list = map.list[id], - lod.thresh = la[id], - calc.lod = ll[id])), class = "mappoly.map2") -} - - -#' Build merged parental maps -#' -#' @param map.p1 object of class \code{mappoly.map} with parent 1 phased -#' @param map.p2 object of class \code{mappoly.map} with parent 2 phased -#' @param full.seq object of class \code{mappoly.sequence} containing parent 1 and parent 2 markers -#' @param full.mat object of class \code{mappoly.rf.matrix} containing two-points recombination -#' fraction estimations for parent 1 and parent 2 markers -#' -#' @param method indicates whether to use 'hmm' (Hidden Markov Models), 'ols' (Ordinary Least Squares) -#' to re-estimate the recombination fractions -#' -#' @importFrom dplyr left_join `%>%` -#' -#' @return object of class \code{mappoly.map} with both parents information -#' -#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} with documentation and minor modifications by Cristiane Taniguti \email{chtaniguti@tamu.edu} -#' -#' @keywords internal -#' -merge_parental_maps <- function(map.p1, - map.p2, - full.seq, - full.mat, - method = c("ols", "hmm")){ - - method <- match.arg(method) - df.map <- data.frame(mrk = full.seq$seq.mrk.names, - pos = seq(0, 100, length.out = length(full.seq$seq.mrk.names)), - row.names = full.seq$seq.mrk.names) - mrk.p1 <- data.frame(mrk = map.p1$info$mrk.names, p1 = TRUE) - mrk.p2 <- data.frame(mrk = map.p2$info$mrk.names, p2 = TRUE) - mrk.md <- data.frame(mrk = setdiff(full.seq$seq.mrk.names, - c(mrk.p1$mrk, mrk.p2$mrk)), - p1p2 = TRUE) - df.map <- df.map %>% left_join(mrk.p1, by = "mrk") %>% - left_join(mrk.md, by = "mrk") %>% left_join(mrk.p2, by = "mrk") - mrk.p1.p2 <- df.map$mrk[which(df.map$p1 | df.map$p2)] - seq.num <- full.seq$seq.num[match(mrk.p1.p2, full.seq$seq.mrk.names)] - seq.ph = list(P = c(map.p1$maps[[1]]$seq.ph$P, - map.p2$maps[[1]]$seq.ph$P)[as.character(seq.num)], - Q = c(map.p1$maps[[1]]$seq.ph$Q, - map.p2$maps[[1]]$seq.ph$Q)[as.character(seq.num)]) - map.p1.p2 <- .mappoly_map_skeleton(ploidy = full.seq$ploidy, - n.mrk = length(mrk.p1.p2), - seq.num = seq.num, - mrk.names = mrk.p1.p2, - seq.dose.p1 = sapply(seq.ph$P, function(x) sum(as.logical(x))), - seq.dose.p2 = sapply(seq.ph$Q, function(x) sum(as.logical(x))), - data.name = full.seq$data.name, - seq.rf = mf_h(diff(df.map$pos[match(mrk.p1.p2, df.map$mrk)])), - seq.ph = seq.ph) - if(method == "ols"){ - map.p1.p2 <- reest_rf(map.p1.p2, method = "ols", input.mat = full.mat) - } else if(method == "hmm"){ - map.p1.p2 <- reest_rf(map.p1.p2, method = "hmm") - } else { - stop("Invalid method.", call. = FALSE) - } - map.p1.p2 -} - -#' Add markers to a pre-existing sequence using HMM analysis and evaluating difference in LOD -#' -#' @param input.map An object of class \code{mappoly.map} -#' @param mrk.to.include vector for marker names to be included -#' @param input.seq an object of class \code{mappoly.sequence} containing all markers (the ones in the mappoly.map and also the ones to be included) -#' @param input.matrix object of class \code{mappoly.rf.matrix} -#' @param input.genoprob an object of class \code{mappoly.genoprob} obtained with calc_genoprob of the input.map object -#' @param input.data an object of class \code{mappoly.data} -#' @param input.mds An object of class \code{mappoly.map} -#' @param thresh the LOD threshold used to determine if the marker will be included or not after hmm analysis (default = 30) -#' @param extend.tail the length of the chain's tail that should be used to calculate the likelihood of -#' the map. If NULL (default), the function uses all markers positioned. Even if info.tail = TRUE, -#' it uses at least extend.tail as the tail length -#' @param method indicates whether to use 'hmm' (Hidden Markov Models), 'ols' (Ordinary Least Squares) or 'wMDS_to_1D_pc' -#' (weighted MDS followed by fitting a one dimensional principal curve) to re-estimate the recombination fractions after adding markers -#' @param verbose If TRUE (default), current progress is shown; if FALSE, no output is produced -#' -#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} with documentation and minor modifications by Cristiane Taniguti \email{chtaniguti@tamu.edu} -#' -#' @keywords internal -#' -add_md_markers <- function(input.map, - mrk.to.include, - input.seq, - input.matrix, - input.genoprob, - input.data, - input.mds = NULL, - thresh = 500, - extend.tail = 50, - method = c("hmm", "wMDS_to_1D_pc"), - verbose = T){ - method <- match.arg(method) - if(method == "wMDS_to_1D_pc" & is.null(input.mds)) - stop("you must provide 'input.mds' when selecting method = 'wMDS_to_1D_pc'", call. = FALSE) - p <- match(input.map$info$seq.num, input.seq$seq.num) - if(any(diff(p) <= 0)) - stop("map and sequence have different orders", call. = FALSE) - - # Positioned markers - id.names <- intersect(input.map$info$mrk.names, input.seq$seq.mrk.names) - # Markers to be positioned - id2.names <- setdiff(input.seq$seq.mrk.names, input.map$info$mrk.names) - id2.names <- intersect(id2.names, mrk.to.include) - - id <- match(id.names, input.seq$seq.mrk.names) - id2 <- match(id2.names, input.seq$seq.mrk.names) - - Pl <- Ql <- vector("list", length(id2)) - nm <- character(length(id2)) - ll <- numeric(length(id2)) - names(ll) <- id2.names - for(i in 1:length(id2)){ - u <- which(id - id2[i] < 0) - if(verbose){ - cat(crayon::bgMagenta(stringr::str_pad(paste0(round(100*i/length(id2),1), "%"), width = 6)), "----> ") - if(length(u) == 0 ){ - pos.test <- 0 - cat(crayon::bgRed(id2[i]), "--", id[pos.test + 1], "...|", sep = "") - } - else if(length(u) == length(id)){ - pos.test <- length(id) - cat("|...",id[pos.test - 1], "--", crayon::bgRed(id2[i]), sep = "") - } - else{ - pos.test <- max(u) - cat("|...", id[pos.test], "--", crayon::bgRed(id2[i]), "--", id[pos.test + 1], "...|", sep = "") - } - } - temp.map <- add_marker(input.map = input.map, - mrk = id2.names[i], - pos = pos.test, - genoprob = input.genoprob, - rf.matrix = input.matrix, - extend.tail = extend.tail, - verbose = FALSE) - l <- get_LOD(temp.map) - if(length(l) > 1){ - ll[i] <- l[2] - cat("~~~~~",round(l[2], 4),"~~~~~") - if(l[2] < thresh){ - cat(" ---> skip it!\n") - next() - } - } else ll[i] <- NA - a1 <- temp.map$maps[[1]]$seq.ph$P[pos.test + 1] - a2 <- temp.map$maps[[1]]$seq.ph$Q[pos.test + 1] - nm[i] <- input.seq$seq.num[id2][i] - Pl[[i]] <- a1[[1]] - Ql[[i]] <- a2[[1]] - cat("\n") - } - names(Pl) <- names(Ql) <- nm - ix <- which(nm == "") - if(length(ix) > 0){ - Pl <- Pl[-ix] - Ql <- Ql[-ix] - } - i1<-input.seq$seq.num - i2<-c(as.numeric(names(Pl)), input.map$info$seq.num) - seq.num <- intersect(i1, i2) - seq.ph = list(P = c(input.map$maps[[1]]$seq.ph$P, Pl)[as.character(seq.num)], - Q = c(input.map$maps[[1]]$seq.ph$Q, Ql)[as.character(seq.num)]) - id<-match(seq.num, input.seq$seq.num) - map.p1.p2.final <- .mappoly_map_skeleton(ploidy = input.seq$ploidy, - n.mrk = length(seq.num), - seq.num = seq.num, - mrk.names = input.seq$seq.mrk.names[id], - seq.dose.p1 = sapply(seq.ph$P, function(x) sum(as.logical(x))), - seq.dose.p2 = sapply(seq.ph$Q, function(x) sum(as.logical(x))), - data.name = input.seq$data.name, - seq.rf = rep(0.01, length(seq.num) - 1), - seq.ph = seq.ph, - chrom = input.seq$chrom[id], - genome.pos = input.seq$genome.pos[id]) - if(method == "hmm"){ - map.p1.p2.final <- reest_rf(map.p1.p2.final, method = "hmm", verbose = FALSE) - } else if(method == "wMDS_to_1D_pc"){ - map.p1.p2.final <- reest_rf(map.p1.p2.final, method = "wMDS_to_1D_pc", input.mds = input.mds) - } - list(map = map.p1.p2.final, ll = ll) -} - -# remove markers if causing gap > threshold -rem_mrk_clusters <- function(input.map, - size.rem.cluster = 1, - gap.threshold = 3){ - id <- which(imf_h(input.map$maps[[1]]$seq.rf) > gap.threshold) - id <- cbind(c(1, id+1), c(id, input.map$info$n.mrk)) - ## Selecting map segments larger then the specified threshold - segments <- id[apply(id, 1, diff) > size.rem.cluster - 1, , drop = FALSE] - id<-NULL - for(i in 1:nrow(segments)){ - id <- c(id, segments[i,1]:segments[i,2]) - } - output.map <- get_submap(input.map = input.map, mrk.pos = id, - reestimate.rf = FALSE, verbose = FALSE) - return(output.map) -} - - -#' Plot object mappoly.map2 -#' -#' @param x object of class \code{mappoly.map2} -#' -#' @import ggplot2 -#' @importFrom ggpubr font -#' -#' @export -plot_mappoly.map2 <- function(x){ - { - Var1 <- value <- Var2 <- position <- iteration <- hmm <- parent <- LODScore <- NULL - df1 <- t(sapply(x$both$map.list, function(x) summary(diff(extract_map(x))))) - nit <- nrow(df1) - df11 <- reshape2::melt(df1) - df11$hmm <- "partial" - if(!is.na(names(x$both)[4])){ - df2 <- t(sapply(x$both$map.list.err, function(x) summary(diff(extract_map(x))))) - df12 <- reshape2::melt(df2) - df12$hmm <- "full" - df <- rbind(df11, df12) - p1<-ggplot(df, aes(x=as.factor(Var1), y=value, group=Var2)) + - geom_line(aes(color=Var2)) + - geom_point(aes(color=Var2)) + - ylab("nearest neighbor marker distance") + - xlab("Iteration") + - #annotate(geom="label", x = 1:nit, y = df1[,6] + .5, - # label=sapply(x$both$map.list, function(x) x$info$n.mrk), - # color="red") + - theme(legend.position = "none") + facet_wrap(.~hmm) - } else{ - df <- df11 - p1<-ggplot(df, aes(x=as.factor(Var1), y=value, group=Var2)) + - geom_line(aes(color=Var2)) + - geom_point(aes(color=Var2)) + - ylab("nearest neighbor marker distance") + - xlab("Iteration") + - annotate(geom="label", x = 1:nit, y = df1[,6] + .5, - label=sapply(x$both$map.list, function(x) x$info$n.mrk), - color="red") + - theme(legend.position = "none") - } - } - { - df31 <- lapply(x$both$map.list, function(x) extract_map(x)) - df41 <- reshape2::melt(df31) - df41$hmm <- "partial" - colnames(df41)[1:2] <- c("position", "iteration") - len.map1 <- sapply(df31, max) - if(!is.na(names(x$both)[4])){ - df32 <- lapply(x$both$map.list.err, function(x) extract_map(x)) - df42 <- reshape2::melt(df32) - df42$hmm <- "full" - colnames(df42)[1:2] <- c("position", "iteration") - len.map2 <- sapply(df32, max) - df4 <- rbind(df41, df42) - #len.map <- c(len.map1, len.map2) - } else{ - df4 <- df41 - } - p2<-ggplot(df4, aes(y=-position, x = as.factor(iteration), color = hmm)) + - geom_point(shape = 95, size = 5) + - # annotate(geom="label", y = -len.map + 1, x = 1:length(len.map) + 0.2, - # label=round(len.map, 1), - # color="red", size = 3) + - xlab("iteration") + facet_wrap(.~hmm) + theme(legend.position = "none")+ - scale_color_manual(values=c('#E69F00', '#56B4E9')) - } - - df5 <- lapply(x$single, function(x) extract_map(x)) - df6 <- reshape2::melt(df5) - head(df6) - colnames(df6) <- c("position", "parent") - p3<-ggplot(df6, aes(x=position, y=parent, group = as.factor(parent))) + - geom_point(ggplot2::aes(color = as.factor(parent)), shape = 108, size = 5, show.legend = FALSE) + - scale_color_brewer(palette="Dark2") - - - df7 <- reshape2::melt(lapply(x$both$calc.lod, na.omit)) - colnames(df7) <- c("LODScore", "iteration") - p4<-ggplot(df7, aes(x=as.factor(iteration), y=LODScore, group = as.factor(iteration))) + - geom_boxplot(fill='#A4A4A4', color="black") + xlab("iteration") - - f1 <- ggarrange(p1, p3 + font("x.text", size = 9), - ncol = 2, nrow = 1, widths = c(2,1)) - f2 <- ggarrange(p2, p4 + font("x.text", size = 9), - ncol = 2, nrow = 1, widths = c(2,1)) - ggarrange(f1, f2, ncol = 1, nrow = 2) -} +#' Design linkage map framework in two steps: i) estimating the recombination fraction with +#' HMM approach for each parent separately using only markers segregating individually +#' (e.g. map 1 - P1:3 x P2:0, P1: 2x4; map 2 - P1:0 x P2:3, P1:4 x P2:2); ii) merging both +#' maps and re-estimate recombination fractions. +#' +#' @param input.seq object of class \code{mappoly.sequence} +#' @param twopt object of class \code{mappoly.twopt} +#' @param start.set number of markers to start the phasing procedure (default = 4) +#' @param thres.twopt the LOD threshold used to determine if the linkage phases compared via two-point +#' analysis should be considered for the search space reduction (A.K.A. η in Mollinari and Garcia (2019), +#' default = 5) +#' @param thres.hmm the LOD threshold used to determine if the linkage phases compared via hmm analysis +#' should be evaluated in the next round of marker inclusion (default = 50) +#' @param extend.tail the length of the chain's tail that should be used to calculate the likelihood of +#' the map. If NULL (default), the function uses all markers positioned. Even if info.tail = TRUE, +#' it uses at least extend.tail as the tail length +#' @param inflation.lim.p1 the maximum accepted length difference between the current and the previous +#' parent 1 sub-map defined by arguments info.tail and extend.tail. If the size exceeds this limit, the marker will +#' not be inserted. If NULL(default), then it will insert all markers. +#' @param inflation.lim.p2 same as `inflation.lim.p1` but for parent 2 sub-map. +#' @param phase.number.limit the maximum number of linkage phases of the sub-maps defined by arguments info.tail +#' and extend.tail. Default is 20. If the size exceeds this limit, the marker will not be inserted. If Inf, +#' then it will insert all markers. +#' @param tol the desired accuracy during the sequential phase of each parental map (default = 10e-02) +#' @param tol.final the desired accuracy for the final parental map (default = 10e-04) +#' @param verbose If TRUE (default), current progress is shown; if FALSE, no output is produced +#' @param method indicates whether to use 'hmm' (Hidden Markov Models), 'ols' (Ordinary Least Squares) +#' to re-estimate the recombination fractions while merging the parental maps (default:hmm) +#' +#' @return list containing three \code{mappoly.map} objects:1) map built with markers with segregation information from parent 1; +#' 2) map built with markers with segregation information from parent 2; 3) maps in 1 and 2 merged +#' +#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} with documentation and minor modifications by Cristiane Taniguti \email{chtaniguti@tamu.edu} +#' +#' +#' @export +framework_map <- function(input.seq, + twopt, + start.set = 10, + thres.twopt = 10, + thres.hmm = 30, + extend.tail = 30, + inflation.lim.p1 = 5, + inflation.lim.p2 = 5, + phase.number.limit = 10, + tol = 10e-3, + tol.final = 10e-4, + verbose = TRUE, + method = "hmm"){ + + if (!inherits(input.seq, "mappoly.sequence")) { + stop(deparse(substitute(input.seq)), " is not an object of class 'mappoly.sequence'") + } + + if (!inherits(twopt, "mappoly.twopt")) { + stop(deparse(substitute(twopt)), " is not an object of class 'mappoly.twopt'") + } + ##### Map for P1 #### + s.p1 <- make_seq_mappoly(input.seq, info.parent = "p1") + if(length(s.p1$seq.num) > 0){ + tpt.p1 <- make_pairs_mappoly(twopt, s.p1) + map.p1 <- est_rf_hmm_sequential(input.seq = s.p1, + twopt = tpt.p1, + start.set = start.set, + thres.twopt = thres.twopt, + thres.hmm = thres.hmm, + extend.tail = extend.tail, + sub.map.size.diff.limit = inflation.lim.p1, + phase.number.limit = phase.number.limit, + reestimate.single.ph.configuration = TRUE, + tol = tol, + tol.final = tol.final, + verbose = verbose) + } else { + warning("No linkage information for parent 1") + map.p1 <- NULL + map.p1.p2 <- NULL + } + ##### Map for P2 #### + s.p2 <- make_seq_mappoly(input.seq, info.parent = "p2") + if(length(s.p2$seq.num) > 0){ + tpt.p2 <- make_pairs_mappoly(twopt, s.p2) + map.p2 <- est_rf_hmm_sequential(input.seq = s.p2, + twopt = tpt.p2, + start.set = start.set, + thres.twopt = thres.twopt, + thres.hmm = thres.hmm, + extend.tail = extend.tail, + sub.map.size.diff.limit = inflation.lim.p2, + phase.number.limit = phase.number.limit, + reestimate.single.ph.configuration = TRUE, + tol = tol, + tol.final = tol.final, + verbose = verbose) + } else { + warning("No linkage information for parent 2") + map.p2 <- NULL + map.p1.p2 <- NULL + } + #### Merging P1 and P2 #### + + if(length(s.p1$seq.num) > 0 & length(s.p2$seq.num) > 0){ + rf.mat <- rf_list_to_matrix(twopt, + thresh.LOD.ph = thres.twopt, + shared.alleles = TRUE, + verbose = verbose) + + map.p1.p2 <- merge_parental_maps(map.p1 = map.p1, + map.p2 = map.p2, + full.seq = input.seq, + full.mat = rf.mat, + method = method, + verbose = verbose) + } + + init.map.list <- list(map.p1 = map.p1, + map.p2 = map.p2, + map.p1.p2 = map.p1.p2) + + return(init.map.list) +} + +#' Add markers that are informative in both parents using HMM approach and evaluating difference +#' in LOD and gap size +#' +#' @param input.map.list list containing three \code{mappoly.map} objects:1) map built with markers with segregation information from parent 1; +#' 2) map built with markers with segregation information from parent 2; 3) maps in 1 and 2 merged +#' @param input.seq object of class \code{mappoly.sequence} containing all markers for specific group +#' @param twopt object of class \code{mappoly.twopt} +#' @param thres.twopt the LOD threshold used to determine if the linkage phases compared via two-point +#' analysis should be considered for the search space reduction (A.K.A. η in Mollinari and Garcia (2019), +#' default = 5) +#' @param init.LOD the LOD threshold used to determine if the marker will be included or not after hmm analysis (default = 30) +#' @param verbose If TRUE (default), current progress is shown; if FALSE, no output is produced +#' @param method indicates whether to use 'hmm' (Hidden Markov Models), 'ols' (Ordinary Least Squares) or 'wMDS_to_1D_pc' +#' (weighted MDS followed by fitting a one dimensional principal curve) to re-estimate the recombination fractions after adding markers +#' @param input.mds An object of class \code{mappoly.map} +#' @param max.rounds integer defining number of times to try to fit the remaining markers in the sequence +#' @param gap.threshold threshold for gap size +#' @param size.rem.cluster threshold for number of markers that must contain in a segment after a gap is removed to keep this segment in the sequence +#' +#' @return object of class \code{mappoly.map2} +#' +#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} with documentation and minor modifications by Cristiane Taniguti \email{chtaniguti@tamu.edu} +#' +#' @export +update_framework_map <- function(input.map.list, + input.seq, + twopt, + thres.twopt = 10, + init.LOD = 30, + verbose = TRUE, + method = "hmm", + input.mds = NULL, + max.rounds = 50, + size.rem.cluster = 2, + gap.threshold = 4) +{ + + if (!all(sapply(input.map.list, function(x) inherits(x, "mappoly.map")))) { + stop(deparse(substitute(twopt)), " is not an object of class 'mappoly.map'") + } + if (!inherits(input.seq, "mappoly.sequence")) { + stop(deparse(substitute(input.seq)), " is not an object of class 'mappoly.sequence'") + } + + #### Inserting remaining markers #### + if(is.null(input.map.list$map.p1.p2)) { + warning("Map only have linkage information for one of the parents.") + cur.map <- input.map.list[[which(sapply(input.map.list, function(x) !is.null(x)))]] + } else { + cur.map <- input.map.list$map.p1.p2 + } + cur.seq <- input.seq + LOD <- init.LOD + ll <- map.list <- vector("list", max.rounds) + la <- numeric(max.rounds) + i <- 1 + dat <- get(cur.map$info$data.name, pos = 1) + mrk.to.include <- setdiff(cur.seq$seq.mrk.names, cur.map$info$mrk.names) + + rf.mat <- rf_list_to_matrix(twopt, + thresh.LOD.ph = thres.twopt, + shared.alleles = TRUE, verbose = verbose) + + rem.mrk <- NULL + while(length(mrk.to.include) > 0 & i <= length(la)){ + cur.gen <- calc_genoprob(cur.map, verbose = verbose) + cur.res <- add_md_markers(input.map = cur.map, + mrk.to.include = mrk.to.include, + input.seq = cur.seq, + input.matrix = rf.mat, + input.genoprob = cur.gen, + input.data = dat, + input.mds = input.mds, + thresh = LOD, + method = "hmm", + verbose = verbose) + + a <- rem_mrk_clusters(cur.res$map, + size.rem.cluster = size.rem.cluster, + gap.threshold = gap.threshold) + + rem.mrk <- c(rem.mrk, setdiff(cur.res$map$info$mrk.names, a$info$mrk.names)) + if(a$info$n.mrk < cur.res$map$info$n.mrk){ + if(method == "hmm"){ + cur.res$map <- reest_rf(a, method = "hmm", verbose = verbose) + } else if(method == "wMDS_to_1D_pc"){ + cur.res$map <- reest_rf(a, method = "wMDS_to_1D_pc", input.mds = input.mds, verbose=verbose) + } + } + #### resulting map + cur.map <- map.list[[i]] <- cur.res$map + ### resulting log-likelihood vector + ll[[i]] <- cur.res$ll[cur.res$map$info$mrk.names] + if(i != 1){ + if(map.list[[i-1]]$info$n.mrk >= map.list[[i]]$info$n.mrk) + { + LOD <- max(cur.res$ll, na.rm = TRUE) - 10 + if(LOD <= 0) LOD <- 10e-5 + } + } + la[i] <- LOD + mrk.to.include <- setdiff(cur.seq$seq.mrk.names, cur.map$info$mrk.names) + mrk.to.include <- setdiff(mrk.to.include, rem.mrk) + plot_map_list(map.list[1:i]) + text(rep(0, i), 1:i+.5, labels = unlist(sapply(map.list, function(x) x$info$n.mrk)), adj=-1) + i <- i + 1 + if(verbose){ + cat("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n") + print(summary_maps(map.list[!sapply(map.list, is.null)], verbose = verbose)) + cat("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n") + if(length(mrk.to.include) == 0) cat(paste("No more markers to add. Stopping in iteration:", i)) + } + } + #### Post-mapping #### + id <- which(la!=0) + structure(list(single = list(map.p1 = input.map.list$map.p1, + map.p2 = input.map.list$map.p2, + map.p1.p2 = input.map.list$map.p1.p2), + both = list(map.list = map.list[id], + lod.thresh = la[id], + calc.lod = ll[id])), class = "mappoly.map2") +} + + +#' Build merged parental maps +#' +#' @param map.p1 object of class \code{mappoly.map} with parent 1 phased +#' @param map.p2 object of class \code{mappoly.map} with parent 2 phased +#' @param full.seq object of class \code{mappoly.sequence} containing parent 1 and parent 2 markers +#' @param full.mat object of class \code{mappoly.rf.matrix} containing two-points recombination +#' fraction estimations for parent 1 and parent 2 markers +#' +#' @param method indicates whether to use 'hmm' (Hidden Markov Models), 'ols' (Ordinary Least Squares) +#' to re-estimate the recombination fractions +#' +#' @importFrom dplyr left_join `%>%` +#' +#' @return object of class \code{mappoly.map} with both parents information +#' +#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} with documentation and minor modifications by Cristiane Taniguti \email{chtaniguti@tamu.edu} +#' +#' @keywords internal +#' +merge_parental_maps <- function(map.p1, + map.p2, + full.seq, + full.mat, + method = c("ols", "hmm"), + verbose = TRUE){ + + method <- match.arg(method) + df.map <- data.frame(mrk = full.seq$seq.mrk.names, + pos = seq(0, 100, length.out = length(full.seq$seq.mrk.names)), + row.names = full.seq$seq.mrk.names) + mrk.p1 <- data.frame(mrk = map.p1$info$mrk.names, p1 = TRUE) + mrk.p2 <- data.frame(mrk = map.p2$info$mrk.names, p2 = TRUE) + mrk.md <- data.frame(mrk = setdiff(full.seq$seq.mrk.names, + c(mrk.p1$mrk, mrk.p2$mrk)), + p1p2 = TRUE) + df.map <- df.map %>% left_join(mrk.p1, by = "mrk") %>% + left_join(mrk.md, by = "mrk") %>% left_join(mrk.p2, by = "mrk") + mrk.p1.p2 <- df.map$mrk[which(df.map$p1 | df.map$p2)] + seq.num <- full.seq$seq.num[match(mrk.p1.p2, full.seq$seq.mrk.names)] + seq.ph = list(P = c(map.p1$maps[[1]]$seq.ph$P, + map.p2$maps[[1]]$seq.ph$P)[as.character(seq.num)], + Q = c(map.p1$maps[[1]]$seq.ph$Q, + map.p2$maps[[1]]$seq.ph$Q)[as.character(seq.num)]) + map.p1.p2 <- .mappoly_map_skeleton(ploidy = full.seq$ploidy, + n.mrk = length(mrk.p1.p2), + seq.num = seq.num, + mrk.names = mrk.p1.p2, + seq.dose.p1 = sapply(seq.ph$P, function(x) sum(as.logical(x))), + seq.dose.p2 = sapply(seq.ph$Q, function(x) sum(as.logical(x))), + data.name = full.seq$data.name, + seq.rf = mf_h(diff(df.map$pos[match(mrk.p1.p2, df.map$mrk)])), + seq.ph = seq.ph) + if(method == "ols"){ + map.p1.p2 <- reest_rf(map.p1.p2, method = "ols", input.mat = full.mat, verbose = verbose) + } else if(method == "hmm"){ + map.p1.p2 <- reest_rf(map.p1.p2, method = "hmm", verbose = verbose) + } else { + stop("Invalid method.", call. = FALSE) + } + map.p1.p2 +} + +#' Add markers to a pre-existing sequence using HMM analysis and evaluating difference in LOD +#' +#' @param input.map An object of class \code{mappoly.map} +#' @param mrk.to.include vector for marker names to be included +#' @param input.seq an object of class \code{mappoly.sequence} containing all markers (the ones in the mappoly.map and also the ones to be included) +#' @param input.matrix object of class \code{mappoly.rf.matrix} +#' @param input.genoprob an object of class \code{mappoly.genoprob} obtained with calc_genoprob of the input.map object +#' @param input.data an object of class \code{mappoly.data} +#' @param input.mds An object of class \code{mappoly.map} +#' @param thresh the LOD threshold used to determine if the marker will be included or not after hmm analysis (default = 30) +#' @param extend.tail the length of the chain's tail that should be used to calculate the likelihood of +#' the map. If NULL (default), the function uses all markers positioned. Even if info.tail = TRUE, +#' it uses at least extend.tail as the tail length +#' @param method indicates whether to use 'hmm' (Hidden Markov Models), 'ols' (Ordinary Least Squares) or 'wMDS_to_1D_pc' +#' (weighted MDS followed by fitting a one dimensional principal curve) to re-estimate the recombination fractions after adding markers +#' @param verbose If TRUE (default), current progress is shown; if FALSE, no output is produced +#' +#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} with documentation and minor modifications by Cristiane Taniguti \email{chtaniguti@tamu.edu} +#' +#' @keywords internal +#' +add_md_markers <- function(input.map, + mrk.to.include, + input.seq, + input.matrix, + input.genoprob, + input.data, + input.mds = NULL, + thresh = 500, + extend.tail = 50, + method = c("hmm", "wMDS_to_1D_pc"), + verbose = T){ + method <- match.arg(method) + if(method == "wMDS_to_1D_pc" & is.null(input.mds)) + stop("you must provide 'input.mds' when selecting method = 'wMDS_to_1D_pc'", call. = FALSE) + p <- match(input.map$info$seq.num, input.seq$seq.num) + if(any(diff(p) <= 0)) + stop("map and sequence have different orders", call. = FALSE) + + # Positioned markers + id.names <- intersect(input.map$info$mrk.names, input.seq$seq.mrk.names) + # Markers to be positioned + id2.names <- setdiff(input.seq$seq.mrk.names, input.map$info$mrk.names) + id2.names <- intersect(id2.names, mrk.to.include) + + id <- match(id.names, input.seq$seq.mrk.names) + id2 <- match(id2.names, input.seq$seq.mrk.names) + + Pl <- Ql <- vector("list", length(id2)) + nm <- character(length(id2)) + ll <- numeric(length(id2)) + names(ll) <- id2.names + for(i in 1:length(id2)){ + u <- which(id - id2[i] < 0) + + if(verbose) cat(crayon::bgMagenta(stringr::str_pad(paste0(round(100*i/length(id2),1), "%"), width = 6)), "----> ") + if(length(u) == 0 ){ + pos.test <- 0 + if(verbose) cat(crayon::bgRed(id2[i]), "--", id[pos.test + 1], "...|", sep = "") + } else if(length(u) == length(id)){ + pos.test <- length(id) + if(verbose) cat("|...",id[pos.test - 1], "--", crayon::bgRed(id2[i]), sep = "") + } else{ + pos.test <- max(u) + if(verbose) cat("|...", id[pos.test], "--", crayon::bgRed(id2[i]), "--", id[pos.test + 1], "...|", sep = "") + } + + temp.map <- add_marker(input.map = input.map, + mrk = id2.names[i], + pos = pos.test, + genoprob = input.genoprob, + rf.matrix = input.matrix, + extend.tail = extend.tail, + verbose = FALSE) + l <- get_LOD(temp.map) + if(length(l) > 1){ + ll[i] <- l[2] + if(verbose) cat("~~~~~",round(l[2], 4),"~~~~~") + if(l[2] < thresh){ + if(verbose) cat(" ---> skip it!\n") + next() + } + } else ll[i] <- NA + a1 <- temp.map$maps[[1]]$seq.ph$P[pos.test + 1] + a2 <- temp.map$maps[[1]]$seq.ph$Q[pos.test + 1] + nm[i] <- input.seq$seq.num[id2][i] + Pl[[i]] <- a1[[1]] + Ql[[i]] <- a2[[1]] + if(verbose) cat("\n") + } + names(Pl) <- names(Ql) <- nm + ix <- which(nm == "") + if(length(ix) > 0){ + Pl <- Pl[-ix] + Ql <- Ql[-ix] + } + i1<-input.seq$seq.num + i2<-c(as.numeric(names(Pl)), input.map$info$seq.num) + seq.num <- intersect(i1, i2) + seq.ph = list(P = c(input.map$maps[[1]]$seq.ph$P, Pl)[as.character(seq.num)], + Q = c(input.map$maps[[1]]$seq.ph$Q, Ql)[as.character(seq.num)]) + id<-match(seq.num, input.seq$seq.num) + map.p1.p2.final <- .mappoly_map_skeleton(ploidy = input.seq$ploidy, + n.mrk = length(seq.num), + seq.num = seq.num, + mrk.names = input.seq$seq.mrk.names[id], + seq.dose.p1 = sapply(seq.ph$P, function(x) sum(as.logical(x))), + seq.dose.p2 = sapply(seq.ph$Q, function(x) sum(as.logical(x))), + data.name = input.seq$data.name, + seq.rf = rep(0.01, length(seq.num) - 1), + seq.ph = seq.ph, + chrom = input.seq$chrom[id], + genome.pos = input.seq$genome.pos[id]) + if(method == "hmm"){ + map.p1.p2.final <- reest_rf(map.p1.p2.final, method = "hmm", verbose = FALSE) + } else if(method == "wMDS_to_1D_pc"){ + map.p1.p2.final <- reest_rf(map.p1.p2.final, method = "wMDS_to_1D_pc", input.mds = input.mds, verbose = FALSE) + } + list(map = map.p1.p2.final, ll = ll) +} + +# remove markers if causing gap > threshold +rem_mrk_clusters <- function(input.map, + size.rem.cluster = 1, + gap.threshold = 3){ + id <- which(imf_h(input.map$maps[[1]]$seq.rf) > gap.threshold) + id <- cbind(c(1, id+1), c(id, input.map$info$n.mrk)) + ## Selecting map segments larger then the specified threshold + segments <- id[apply(id, 1, diff) > size.rem.cluster - 1, , drop = FALSE] + id<-NULL + for(i in 1:nrow(segments)){ + id <- c(id, segments[i,1]:segments[i,2]) + } + output.map <- get_submap(input.map = input.map, mrk.pos = id, + reestimate.rf = FALSE, verbose = FALSE) + return(output.map) +} + + +#' Plot object mappoly.map2 +#' +#' @param x object of class \code{mappoly.map2} +#' +#' @import ggplot2 +#' @importFrom ggpubr font +#' +#' @export +plot_mappoly.map2 <- function(x){ + { + Var1 <- value <- Var2 <- position <- iteration <- hmm <- parent <- LODScore <- NULL + df1 <- t(sapply(x$both$map.list, function(x) summary(diff(extract_map(x))))) + nit <- nrow(df1) + df11 <- reshape2::melt(df1) + df11$hmm <- "partial" + if(!is.na(names(x$both)[4])){ + df2 <- t(sapply(x$both$map.list.err, function(x) summary(diff(extract_map(x))))) + df12 <- reshape2::melt(df2) + df12$hmm <- "full" + df <- rbind(df11, df12) + p1<-ggplot(df, aes(x=as.factor(Var1), y=value, group=Var2)) + + geom_line(aes(color=Var2)) + + geom_point(aes(color=Var2)) + + ylab("nearest neighbor marker distance") + + xlab("Iteration") + + #annotate(geom="label", x = 1:nit, y = df1[,6] + .5, + # label=sapply(x$both$map.list, function(x) x$info$n.mrk), + # color="red") + + theme(legend.position = "none") + facet_wrap(.~hmm) + } else{ + df <- df11 + p1<-ggplot(df, aes(x=as.factor(Var1), y=value, group=Var2)) + + geom_line(aes(color=Var2)) + + geom_point(aes(color=Var2)) + + ylab("nearest neighbor marker distance") + + xlab("Iteration") + + annotate(geom="label", x = 1:nit, y = df1[,6] + .5, + label=sapply(x$both$map.list, function(x) x$info$n.mrk), + color="red") + + theme(legend.position = "none") + } + } + { + df31 <- lapply(x$both$map.list, function(x) extract_map(x)) + df41 <- reshape2::melt(df31) + df41$hmm <- "partial" + colnames(df41)[1:2] <- c("position", "iteration") + len.map1 <- sapply(df31, max) + if(!is.na(names(x$both)[4])){ + df32 <- lapply(x$both$map.list.err, function(x) extract_map(x)) + df42 <- reshape2::melt(df32) + df42$hmm <- "full" + colnames(df42)[1:2] <- c("position", "iteration") + len.map2 <- sapply(df32, max) + df4 <- rbind(df41, df42) + #len.map <- c(len.map1, len.map2) + } else{ + df4 <- df41 + } + p2<-ggplot(df4, aes(y=-position, x = as.factor(iteration), color = hmm)) + + geom_point(shape = 95, size = 5) + + # annotate(geom="label", y = -len.map + 1, x = 1:length(len.map) + 0.2, + # label=round(len.map, 1), + # color="red", size = 3) + + xlab("iteration") + facet_wrap(.~hmm) + theme(legend.position = "none")+ + scale_color_manual(values=c('#E69F00', '#56B4E9')) + } + + single_maps <- x$single[which(!sapply(x$single, is.null))] + df5 <- lapply(single_maps, function(x) extract_map(x)) + df6 <- reshape2::melt(df5) + colnames(df6) <- c("position", "parent") + p3<-ggplot(df6, aes(x=position, y=parent, group = as.factor(parent))) + + geom_point(ggplot2::aes(color = as.factor(parent)), shape = 108, size = 5, show.legend = FALSE) + + scale_color_brewer(palette="Dark2") + + + df7 <- reshape2::melt(lapply(x$both$calc.lod, na.omit)) + colnames(df7) <- c("LODScore", "iteration") + p4<-ggplot(df7, aes(x=as.factor(iteration), y=LODScore, group = as.factor(iteration))) + + geom_boxplot(fill='#A4A4A4', color="black") + xlab("iteration") + + f1 <- ggarrange(p1, p3 + font("x.text", size = 9), + ncol = 2, nrow = 1, widths = c(2,1)) + f2 <- ggarrange(p2, p4 + font("x.text", size = 9), + ncol = 2, nrow = 1, widths = c(2,1)) + ggarrange(f1, f2, ncol = 1, nrow = 2) +} diff --git a/R/filters.R b/R/filters.R index 2dcf10ce..fc2881a6 100644 --- a/R/filters.R +++ b/R/filters.R @@ -1,621 +1,653 @@ -#' Filter non-conforming classes in F1, non double reduced population. -#' -#' @param void internal function to be documented -#' @keywords internal -#' @export -filter_non_conforming_classes <- function(input.data, prob.thres = NULL) -{ - if (!inherits(input.data, "mappoly.data")) { - stop(deparse(substitute(input.data)), " is not an object of class 'mappoly.data'") - } - ploidy <- input.data$ploidy - dp <- input.data$dosage.p1 - dq <- input.data$dosage.p2 - Ds <- array(NA, dim = c(ploidy+1, ploidy+1, ploidy+1)) - for(i in 0:ploidy) - for(j in 0:ploidy) - Ds[i+1,j+1,] <- segreg_poly(ploidy = ploidy, dP = i, dQ = j) - Dpop <- cbind(dp,dq) - #Gathering segregation probabilities given parental dosages - M <- t(apply(Dpop, 1, function(x) Ds[x[1]+1, x[2]+1,])) - M[M != 0] <- 1 - dimnames(M) <- list(input.data$mrk.names, 0:ploidy) - ##if no prior probabilities - if(!is.prob.data(input.data)){ - for(i in 1:nrow(M)){ - id0 <- !as.numeric(input.data$geno.dose[i,])%in%(which(M[i,] == 1)-1) - if(any(id0)) - input.data$geno.dose[i,id0] <- (ploidy+1) - } - return(input.data) - } - ## 1 represents conforming classes/ 0 represents non-conforming classes - dp <- rep(dp, input.data$n.ind) - dq <- rep(dq, input.data$n.ind) - M <- M[rep(seq_len(nrow(M)), input.data$n.ind),] - R <- input.data$geno[,-c(1:2)] - input.data$geno[,-c(1:2)]*M - id1 <- apply(R, 1, function(x) abs(sum(x))) > 0.3 # if the sum of the excluded classes is greater than 0.3, use segreg_poly - N <- matrix(NA, sum(id1), input.data$ploidy+1) - ct <- 1 - for(i in which(id1)){ - N[ct,] <- Ds[dp[i]+1, dq[i]+1, ] - ct <- ct+1 - } - input.data$geno[id1,-c(1:2)] <- N - # if the sum of the excluded classes is greater than zero - # and smaller than 0.3, assign zero to those classes and normalize the vector - input.data$geno[,-c(1:2)][R > 0] <- 0 - input.data$geno[,-c(1:2)] <- sweep(input.data$geno[,-c(1:2)], 1, rowSums(input.data$geno[,-c(1:2)]), FUN = "/") - if(is.null(prob.thres)) - prob.thres <- input.data$prob.thres - geno.dose <- dist_prob_to_class(geno = input.data$geno, prob.thres = prob.thres) - if(geno.dose$flag) - { - input.data$geno <- geno.dose$geno - input.data$geno.dose <- geno.dose$geno.dose - } else { - input.data$geno.dose <- geno.dose$geno.dose - } - input.data$geno.dose[is.na(input.data$geno.dose)] <- ploidy + 1 - input.data$n.ind <- ncol(input.data$geno.dose) - input.data$ind.names <- colnames(input.data$geno.dose) - return(input.data) -} - -#' Filter missing genotypes -#' -#' Excludes markers or individuals based on their proportion of missing data -#' -#' @param input.data an object of class \code{mappoly.data} -#' -#' @param type one of the following options: -#' \code{'marker'}{filter out markers based on their percentage of missing data (default)} -#' \code{'individual'}{filter out individuals based on their percentage of missing data} -#' Please notice that removing individuals with certain amount of data can change some marker parameters -#' (such as depth), and can also change the estimated genotypes for other individuals. -#' So be careful when removing individuals. -#' -#' @param filter.thres maximum percentage of missing data (default = 0.2) -#' -#' @param inter if \code{TRUE}, expects user-input to proceed with filtering -#' -#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} -#' @examples -#' plot(tetra.solcap) -#' dat.filt.mrk <- filter_missing(input.data = tetra.solcap, -#' type = "marker", -#' filter.thres = 0.1, -#' inter = TRUE) -#' plot(dat.filt.mrk) -#' @export -#' @importFrom magrittr "%>%" -#' @importFrom dplyr filter -#' @importFrom graphics axis -filter_missing <- function(input.data, - type = c("marker", "individual"), - filter.thres = 0.2, - inter = TRUE) -{ - if (!inherits(input.data, "mappoly.data")) { - stop(deparse(substitute(input.data)), " is not an object of class 'mappoly.data'") - } - type <- match.arg(type) - switch(type, - marker = filter_missing_mrk(input.data, - filter.thres = filter.thres, - inter = inter), - individual = filter_missing_ind(input.data, - filter.thres = filter.thres, - inter = inter) - ) -} - -#' Filter markers based on missing genotypes -#' -#' @param input.data an object of class \code{"mappoly.data"} -#' @param filter.thres maximum percentage of missing data -#' @param inter if \code{TRUE}, expects user-input to proceed with filtering -#' @keywords internal -filter_missing_mrk <- function(input.data, filter.thres = 0.2, inter = TRUE) -{ - op <- par(pty="s") - on.exit(par(op)) - ANSWER <- "flag" - if(interactive() && inter) - { - while(substr(ANSWER, 1, 1) != "y" && substr(ANSWER, 1, 1) != "yes" && substr(ANSWER, 1, 1) != "Y" && ANSWER != "") - { - na.num <- apply(input.data$geno.dose, 1, function(x,ploidy) sum(x == ploidy+1), ploidy = input.data$ploidy) - perc.na <- na.num/input.data$n.ind - x <- sort(perc.na) - plot(x, - xlab = "markers", - ylab = "frequency of missing data", - col = ifelse(x <= filter.thres, 4, 2), - pch =ifelse(x <= filter.thres, 1, 4)) - abline(h = filter.thres, lty = 2) - f<-paste0("Filtered out: ", sum(perc.na > filter.thres)) - i<-paste0("Included: ", sum(perc.na <= filter.thres)) - legend("topleft", c(f, i) , col = c(2,4), pch = c(4,1)) - ANSWER <- readline("Enter 'Y/n' to proceed or update the filter threshold: ") - if(substr(ANSWER, 1, 1) == "n" | substr(ANSWER, 1, 1) == "no" | substr(ANSWER, 1, 1) == "N") - stop("Stop function.") - if(substr(ANSWER, 1, 1) != "y" && substr(ANSWER, 1, 1) != "yes" && substr(ANSWER, 1, 1) != "Y" && ANSWER != "") - filter.thres <- as.numeric(ANSWER) - } - mrks.id <- which(perc.na <= filter.thres) - if(length(mrks.id) == input.data$n.mrk){ - return(input.data) - } - out.dat <- sample_data(input.data, type = "markers", selected.mrk = names(mrks.id)) - return(out.dat) - } - else{ - na.num <- apply(input.data$geno.dose, 1, function(x,ploidy) sum(x == ploidy+1), ploidy = input.data$ploidy) - perc.na <- na.num/input.data$n.ind - mrks.id <- which(perc.na <= filter.thres) - if(length(mrks.id) == input.data$n.mrk){ - return(input.data) - } - out.dat <- sample_data(input.data, type = "markers", selected.mrk = names(mrks.id)) - return(out.dat) - } -} - -#' Filter individuals based on missing genotypes -#' -#' @param input.data an object of class \code{"mappoly.data"} -#' @param filter.thres maximum percentage of missing data -#' @param inter if \code{TRUE}, expects user-input to proceed with filtering -#' @keywords internal -#' @importFrom magrittr "%>%" -#' @importFrom dplyr filter -#' @importFrom graphics axis -filter_missing_ind <- function(input.data, filter.thres = 0.2, inter = TRUE) -{ - op <- par(pty="s") - on.exit(par(op)) - ANSWER <- "flag" - ind <- NULL - if(interactive() && inter) - { - while(substr(ANSWER, 1, 1) != "y" && substr(ANSWER, 1, 1) != "yes" && substr(ANSWER, 1, 1) != "Y" && ANSWER != "") - { - na.num <- apply(input.data$geno.dose, 2, function(x,ploidy) sum(x == ploidy+1), ploidy = input.data$ploidy) - perc.na <- na.num/input.data$n.mrk - x <- sort(perc.na) - plot(x, - xlab = "offspring", - ylab = "frequency of missing data", - col = ifelse(x <= filter.thres, 4, 2), - pch =ifelse(x <= filter.thres, 1, 4)); - abline(h = filter.thres, lty = 2) - f<-paste0("Filtered out: ", sum(perc.na > filter.thres)) - i<-paste0("Included: ", sum(perc.na <= filter.thres)) - legend("topleft", c(f, i) , col = c(2,4), pch = c(4,1)) - ANSWER <- readline("Enter 'Y/n' to proceed or update the filter threshold: ") - if(substr(ANSWER, 1, 1) == "n" | substr(ANSWER, 1, 1) == "no" | substr(ANSWER, 1, 1) == "N") - stop("You decided to stop the function.") - if(substr(ANSWER, 1, 1) != "y" && substr(ANSWER, 1, 1) != "yes" && substr(ANSWER, 1, 1) != "Y" && ANSWER != "") - filter.thres <- as.numeric(ANSWER) - } - ind.id <- which(perc.na <= filter.thres) - if(length(ind.id) == input.data$n.ind){ - return(input.data) - } - ind <- names(ind.id) - out.dat <- sample_data(input.data, type = "individual", selected.ind = names(ind.id)) - return(out.dat) - } - else{ - na.num <- apply(input.data$geno.dose, 2, function(x,ploidy) sum(x == ploidy+1), ploidy = input.data$ploidy) - perc.na <- na.num/input.data$n.mrk - ind.id <- which(perc.na <= filter.thres) - if(length(ind.id) == input.data$n.ind){ - return(input.data) - } - ind <- names(ind.id) - out.dat <- sample_data(input.data, type = "individual", selected.ind = names(ind.id)) - return(out.dat) - } -} - -#' Filter markers based on chi-square test -#' -#' This function filter markers based on p-values of a chi-square test. -#' The chi-square test assumes that markers follow the expected segregation -#' patterns under Mendelian inheritance, random chromosome bivalent -#' pairing and no double reduction. -#' -#' @param input.obj name of input object (class \code{mappoly.data}) -#' -#' @param chisq.pval.thres p-value threshold used for chi-square tests -#' (default = Bonferroni aproximation with global alpha of 0.05, i.e., -#' 0.05/n.mrk) -#' -#' @param inter if TRUE (default), plots distorted vs. non-distorted markers -#' -#' @return An object of class \code{mappoly.chitest.seq} which contains a list with the following components: -#' \item{keep}{markers that follow Mendelian segregation pattern} -#' \item{exclude}{markers with distorted segregation} -#' \item{chisq.pval.thres}{threshold p-value used for chi-square tests} -#' \item{data.name}{input dataset used to perform the chi-square tests} -#' -#'@examples -#' mrks.chi.filt <- filter_segregation(input.obj = tetra.solcap, -#' chisq.pval.thres = 0.05/tetra.solcap$n.mrk, -#' inter = TRUE) -#' seq.init <- make_seq_mappoly(mrks.chi.filt) -#' -#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} -#' -#' @importFrom graphics axis -#' @export -filter_segregation <- function(input.obj, chisq.pval.thres = NULL, inter = TRUE){ - op <- par(pty="s") - on.exit(par(op)) - if(inherits(input.obj, c("mappoly.data"))){ - chisq.val <- input.obj$chisq.pval - n.mrk <- input.obj$n.mrk - data.name <- as.character(sys.call())[2] - } else if (inherits(input.obj, c("mappoly.sequence"))){ - chisq.val <- input.obj$chisq.pval[input.obj$seq.mrk.names] - n.mrk <- length(input.obj$seq.num) - data.name <- input.obj$data.name - } else { - stop(deparse(substitute(input.obj)), - " is not an object of class 'mappoly.data' or 'mappoly.sequence'") - } - ##Bonferroni approx - if(is.null(chisq.pval.thres)) - chisq.pval.thres <- 0.05/n.mrk - ANSWER <- "flag" - - if(interactive() && inter) - { - while(substr(ANSWER, 1, 1) != "y" && substr(ANSWER, 1, 1) != "yes" && substr(ANSWER, 1, 1) != "Y" && ANSWER != "") - { - x <- log10(sort(chisq.val, decreasing = TRUE)) - th <- log10(chisq.pval.thres) - plot(x, - xlab = "markers", - ylab = bquote(log[10](P)), - col = ifelse(x <= th, 2, 4), - pch =ifelse(x <= th, 4, 1)) - abline(h = th, lty = 2) - f<-paste0("Filtered out: ", sum(x < th)) - i<-paste0("Included: ", sum(x >= th)) - legend("bottomleft", c(f, i) , col = c(2,4), pch = c(4,1)) - ANSWER <- readline("Enter 'Y/n' to proceed or update the p value threshold: ") - if(substr(ANSWER, 1, 1) == "n" | substr(ANSWER, 1, 1) == "no" | substr(ANSWER, 1, 1) == "N") - stop("You decided to stop the function.") - if(substr(ANSWER, 1, 1) != "y" && substr(ANSWER, 1, 1) != "yes" && substr(ANSWER, 1, 1) != "Y" && ANSWER != "") - chisq.pval.thres <- as.numeric(ANSWER) - } - } - keep <- names(which(chisq.val >= chisq.pval.thres)) - exclude <- names(which(chisq.val < chisq.pval.thres)) - structure(list(keep = keep, exclude = exclude, chisq.pval.thres = chisq.pval.thres, - data.name = data.name), - class = "mappoly.chitest.seq") -} - -#' Filter out individuals -#' -#' This function removes individuals from the data set. Individuals can be -#' user-defined or can be accessed via interactive kinship analysis. -#' -#' @param input.data name of input object (class \code{mappoly.data}) -#' -#' @param ind.to.remove individuals to be removed. If \code{NULL} it opens -#' an interactive graphic to proceed with the individual -#' selection -#' @param inter if \code{TRUE}, expects user-input to proceed with filtering -#' -#' @param verbose if \code{TRUE} (default), shows the filtered out individuals -#' -#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} -#' -#' @export -#' -filter_individuals <- function(input.data, ind.to.remove = NULL, inter = TRUE, verbose = TRUE){ - if (!inherits(input.data, "mappoly.data")) { - stop(deparse(substitute(input.data)), " is not an object of class 'mappoly.data'") - } - op <- par(pty="s") - on.exit(par(op)) - # if (!require(AGHmatrix)) -# stop("Please install package 'AGHmatrix' to proceed") - D <- t(input.data$geno.dose) - D[D == input.data$ploidy+1] <- NA - D <- rbind(input.data$dosage.p1, input.data$dosage.p2, D) - rownames(D)[1:2] <- c("P1", "P2") - G <- AGHmatrix::Gmatrix(D, method = "VanRaden",ploidy = input.data$ploidy) - x <- G[1,] - y <- G[2,] - #a <- 2*atan(y/x)/pi - #b <- sqrt(x^2 + y^2) - df <- data.frame(x = x, y = y, type = c(2, 2, rep(4, length(x)-2))) - plot(df[,1:2], col = df$type, pch = 19, - xlab = "relationships between the offspring and P1", - ylab = "relationships between the offspring and P2") - abline(c(0,1), lty = 2) - abline(c(-0.4,1), lty = 2, col = "gray") - abline(c(0.4,1), lty = 2, col = "gray") - legend("topright", c("Parents", "Offspring") , col = c(2,4), pch = 19) - if(!is.null(ind.to.remove)){ - out.data <- sample_data(input.data, selected.ind = setdiff(input.data$ind.names, ind.to.remove)) - return(out.data) - } - if(interactive() && inter) - { - # if (!require(gatepoints)) - # stop("Please install package 'gatepoints' to proceed") - ANSWER <- readline("Enter 'Y/n' to proceed with interactive filtering or quit: ") - if(substr(ANSWER, 1, 1) == "y" | substr(ANSWER, 1, 1) == "yes" | substr(ANSWER, 1, 1) == "Y" | ANSWER == "") - { - ind.to.remove <- gatepoints::fhs(df, mark = TRUE) - ind.to.remove <- setdiff(ind.to.remove, c("P1", "P2")) - ind.to.include <- setdiff(rownames(df)[-c(1:2)], ind.to.remove) - if(verbose){ - cat("Removing individual(s): \n") - print(ind.to.remove) - cat("...\n") - } - if(length(ind.to.remove) == 0){ - warning("No individuals removed. Returning original data set.") - return(input.data) - } - out.data <- sample_data(input.data, selected.ind = ind.to.include) - return(out.data) - } else{ - warning("No individuals removed. Returning original data set.") - return(input.data) - } - } -} -#' Remove markers that do not meet a LOD criteria -#' -#' Remove markers that do not meet a LOD and recombination fraction -#' criteria for at least a percentage of the pairwise marker -#' combinations. It also removes markers with strong evidence of -#' linkage across the whole linkage group (false positive). -#' -#' \code{thresh.LOD.ph} should be set in order to only select -#' recombination fractions that have LOD scores associated to the -#' linkage phase configuration higher than \code{thresh_LOD_ph} -#' when compared to the second most likely linkage phase configuration. -#' That action usually eliminates markers that are unlinked to the -#' set of analyzed markers. -#' -#' @param input.twopt an object of class \code{mappoly.twopt} -#' -#' @param thresh.LOD.ph LOD score threshold for linkage phase configuration -#' (default = 5) -#' -#' @param thresh.LOD.rf LOD score threshold for recombination fraction -#' (default = 5) -#' -#' @param thresh.rf threshold for recombination fractions (default = 0.15) -#' -#' @param probs indicates the probability corresponding to the filtering -#' quantiles. (default = c(0.05, 1)) -#' -#' @param diag.markers A window where marker pairs should be considered. -#' If NULL (default), all markers are considered. -#' -#' @param mrk.order marker order. Only has effect if 'diag.markers' is not NULL -#' -#' @param ncpus number of parallel processes (i.e. cores) to spawn -#' (default = 1) -#' -#' @param diagnostic.plot if \code{TRUE} produces a diagnostic plot -#' -#' @param breaks number of cells for the histogram -#' -#' @return A filtered object of class \code{mappoly.sequence}. -#' See \code{\link[mappoly]{make_seq_mappoly}} for details -#' -#' @examples -#' all.mrk <- make_seq_mappoly(hexafake, 1:20) -#' red.mrk <- elim_redundant(all.mrk) -#' unique.mrks <- make_seq_mappoly(red.mrk) -#' all.pairs <- est_pairwise_rf(input.seq = unique.mrks, -#' ncpus = 1, -#' verbose = TRUE) -#' -#' ## Full recombination fraction matrix -#' mat.full <- rf_list_to_matrix(input.twopt = all.pairs) -#' plot(mat.full) -#' -#' ## Removing disruptive SNPs -#' tpt.filt <- rf_snp_filter(all.pairs, 2, 2, 0.07, probs = c(0.15, 1)) -#' p1.filt <- make_pairs_mappoly(input.seq = tpt.filt, input.twopt = all.pairs) -#' m1.filt <- rf_list_to_matrix(input.twopt = p1.filt) -#' plot(mat.full, main.text = "LG1") -#' plot(m1.filt, main.text = "LG1.filt") -#' -#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} with updates by Gabriel Gesteira, \email{gdesiqu@ncsu.edu} -#' -#' @references -#' Mollinari, M., and Garcia, A. A. F. (2019) Linkage -#' analysis and haplotype phasing in experimental autopolyploid -#' populations with high ploidy level using hidden Markov -#' models, _G3: Genes, Genomes, Genetics_. -#' \doi{10.1534/g3.119.400378} -#' -#' @export rf_snp_filter -#' @importFrom ggplot2 ggplot geom_histogram aes scale_fill_manual xlab ggtitle -#' @importFrom graphics hist -rf_snp_filter <- function(input.twopt, - thresh.LOD.ph = 5, - thresh.LOD.rf = 5, - thresh.rf = 0.15, - probs = c(0.05, 1), - diag.markers = NULL, - mrk.order = NULL, - ncpus = 1L, - diagnostic.plot = TRUE, - breaks = 100) -{ - - input_classes <- c("mappoly.twopt", "mappoly.twopt2") - if (!inherits(input.twopt, input_classes)) { - stop(deparse(substitute(input.twopt)), paste0(" is not an object of class ", paste0(input_classes, collapse = " or "))) - } - probs <- range(probs) - ## Getting filtered rf matrix - rf_mat <- rf_list_to_matrix(input.twopt = input.twopt, thresh.LOD.ph = thresh.LOD.ph, - thresh.LOD.rf = thresh.LOD.rf, thresh.rf = thresh.rf, - ncpus = ncpus, verbose = FALSE) - M <- rf_mat$rec.mat - if(!is.null(mrk.order)) - M <- M[mrk.order, mrk.order] - if(!is.null(diag.markers)) - M[abs(col(M) - row(M)) > diag.markers] <- NA - x <- apply(M, 1, function(x) sum(!is.na(x))) - w <- hist(x, breaks = breaks, plot = FALSE) - th <- quantile(x, probs = probs) - rem <- c(which(x < th[1]), which(x > th[2])) - ids <- names(which(x >= th[1] & x <= th[2])) - value <- type <- NULL - if(diagnostic.plot){ - d <- rbind(data.frame(type = "original", value = x), - data.frame(type = "filtered", value = x[ids])) - p <- ggplot2::ggplot(d, ggplot2::aes(value)) + - ggplot2::geom_histogram(ggplot2::aes(fill = type), - alpha = 0.4, position = "identity", binwidth = diff(w$mids)[1]) + - ggplot2::scale_fill_manual(values = c("#00AFBB", "#E7B800")) + - ggplot2::ggtitle( paste0("Filtering probs: [", probs[1], " : ", probs[2], "] - Non NA values by row in rf matrix - b width: ", diff(w$mids)[1])) + - ggplot2::xlab(paste0("Non 'NA' values at LOD.ph = ", thresh.LOD.ph, ", LOD.rf = ", thresh.LOD.rf, ", and thresh.rf = ", thresh.rf)) - print(p) - } - ## Returning sequence object - ch_filt <- make_seq_mappoly(input.obj = get(input.twopt$data.name, pos = 1), arg = ids, data.name = input.twopt$data.name) - return(ch_filt) -} - -#' Edit sequence ordered by reference genome positions -#' comparing to another set order -#' -#' @param input.seq object of class mappoly.sequence with alternative order (not genomic order) -#' -#' @author Cristiane Taniguti, \email{chtaniguti@tamu.edu} -#' -#' @examples -#' \donttest{ -#' dat <- filter_segregation(tetra.solcap, inter = FALSE) -#' seq_dat <- make_seq_mappoly(dat) -#' seq_chr <- make_seq_mappoly(seq_dat, arg = seq_dat$seq.mrk.names[which(seq_dat$chrom=="1")]) -#' -#' tpt <- est_pairwise_rf(seq_chr) -#' seq.filt <- rf_snp_filter(tpt, probs = c(0.05, 0.95)) -#' mat <- rf_list_to_matrix(tpt) -#' mat2 <- make_mat_mappoly(mat, seq.filt) -#' -#' seq_test_mds <- mds_mappoly(mat2) -#' seq_mds <- make_seq_mappoly(seq_test_mds) -#' edit_seq <- edit_order(input.seq = seq_mds) -#' } -#' -#' @return object of class \code{mappoly.edit.order}: a list containing -#' vector of marker names ordered according to editions (`edited_order`); -#' vector of removed markers names (`removed`); -#' vector of inverted markers names (`inverted`). -#' -#' @export -edit_order <- function(input.seq){ - - if (!inherits(input.seq, "mappoly.sequence")) { - stop(deparse(substitute(input.seq)), " is not an object of class 'mappoly.sequence'") - } - - get_weird <- data.frame(x = 1:length(input.seq$genome.pos), - y = input.seq$genome.pos) - - rownames(get_weird) <- input.seq$seq.mrk.names - get_weird <- get_weird[order(get_weird$y),] - plot(get_weird$x, get_weird$y, xlab="input sequence order", ylab = "genomic position (bp)") - - cat("Mark at least three points on the plot and press `Esc` to continue.") - inverted <- removed <- vector() - if(interactive()){ - ANSWER <- "Y" - while(substr(ANSWER, 1, 1) == "y" | substr(ANSWER, 1, 1) == "yes" | substr(ANSWER, 1, 1) == "Y" | ANSWER == ""){ - plot(get_weird$x, get_weird$y, xlab="input sequence order", ylab = "genomic position (bp)") - mks.to.remove <- gatepoints::fhs(get_weird, mark = TRUE) - if(length(which(rownames(get_weird) %in% mks.to.remove)) > 0){ - ANSWER2 <- readline("Enter 'invert/remove' to proceed with the edition: ") - if(ANSWER2 == "invert"){ - inverted <- c(inverted, as.vector(mks.to.remove)) - repl <- get_weird[rev(which(rownames(get_weird) %in% as.vector(mks.to.remove))),] - get_weird[which(rownames(get_weird) %in% as.vector(mks.to.remove)),2] <- repl[,2] - } else { - removed <- c(removed, as.vector(mks.to.remove)) - get_weird <- get_weird[-which(rownames(get_weird) %in% mks.to.remove),] - } - } - ANSWER <- readline("Enter 'Y/n' to proceed with interactive edition or quit: ") - } - plot(get_weird$x, get_weird$y, xlab="input sequence order", ylab = "genomic position (bp)") - } - - return(structure(list(edited_order = rownames(get_weird), - removed = removed, - inverted = inverted, - data.name = input.seq$data.name), class = "mappoly.edit.order")) -} - -#' Filter aneuploid chromosomes from progeny individuals -#' -#' @param input.data name of input object (class \code{mappoly.data}) -#' -#' @param aneuploid.info data.frame with ploidy information by chromosome (columns) for each individual -#' in progeny (rows). The chromosome and individuals names must match the ones in the file used as input -#' in mappoly. -#' -#' @param ploidy main ploidy -#' -#' @examples -#' aneuploid.info <- matrix(4, nrow=tetra.solcap$n.ind, ncol = 12) -#' set.seed(8080) -#' aneuploid.info[sample(1:length(aneuploid.info), round((4*length(aneuploid.info))/100),0)] <- 3 -#' aneuploid.info[sample(1:length(aneuploid.info), round((4*length(aneuploid.info))/100),0)] <- 5 -#' -#' colnames(aneuploid.info) <- paste0(1:12) -#' aneuploid.info <- cbind(inds = tetra.solcap$ind.names, aneuploid.info) -#' -#' filt.dat <- filter_aneuploid(input.data = tetra.solcap, -#' aneuploid.info = aneuploid.info, ploidy = 4) -#' -#' @author Cristiane Taniguti, \email{chtaniguti@tamu.edu} -#' -#' @return object of class \code{mappoly.data} -#' -#' @export -filter_aneuploid <- function(input.data, aneuploid.info, ploidy){ - - if (!inherits(input.data, "mappoly.data")) { - stop(deparse(substitute(input.data)), " is not an object of class 'mappoly.data'") - } - - aneu.chroms <- colnames(aneuploid.info)[-1] - - keep.ind <- match(input.data$ind.names, aneuploid.info[,1]) - aneuploid.info <- aneuploid.info[keep.ind,] - - idx.list <- apply(aneuploid.info[,-1], 1, function(x) which(x != ploidy)) - names(idx.list) <- aneuploid.info[,1] - - idx.list <- idx.list[-which(!sapply(idx.list, function(x) length(x) >0))] - - cat(round((length(unlist(idx.list))/(dim(aneuploid.info)[1]*(dim(aneuploid.info)[2]-1)))*100,2), "% of the chromosomes x individuals are aneuploids\n") - - for(i in 1:length(idx.list)){ - idx <- which(input.data$chrom %in% names(idx.list[[i]])) - input.data$geno.dose[idx,names(idx.list[i])] <- ploidy + 1 - } - - return(input.data) -} - +#' Filter non-conforming classes in F1, non double reduced population. +#' +#' @param void internal function to be documented +#' @keywords internal +#' @export +filter_non_conforming_classes <- function(input.data, prob.thres = NULL) +{ + if (!inherits(input.data, "mappoly.data")) { + stop(deparse(substitute(input.data)), " is not an object of class 'mappoly.data'") + } + ploidy <- input.data$ploidy + dp <- input.data$dosage.p1 + dq <- input.data$dosage.p2 + Ds <- array(NA, dim = c(ploidy+1, ploidy+1, ploidy+1)) + for(i in 0:ploidy) + for(j in 0:ploidy) + Ds[i+1,j+1,] <- segreg_poly(ploidy = ploidy, dP = i, dQ = j) + Dpop <- cbind(dp,dq) + #Gathering segregation probabilities given parental dosages + M <- t(apply(Dpop, 1, function(x) Ds[x[1]+1, x[2]+1,])) + M[M != 0] <- 1 + dimnames(M) <- list(input.data$mrk.names, 0:ploidy) + ##if no prior probabilities + if(!is.prob.data(input.data)){ + for(i in 1:nrow(M)){ + id0 <- !as.numeric(input.data$geno.dose[i,])%in%(which(M[i,] == 1)-1) + if(any(id0)) + input.data$geno.dose[i,id0] <- (ploidy+1) + } + return(input.data) + } + ## 1 represents conforming classes/ 0 represents non-conforming classes + dp <- rep(dp, input.data$n.ind) + dq <- rep(dq, input.data$n.ind) + M <- M[rep(seq_len(nrow(M)), input.data$n.ind),] + R <- input.data$geno[,-c(1:2)] - input.data$geno[,-c(1:2)]*M + id1 <- apply(R, 1, function(x) abs(sum(x))) > 0.3 # if the sum of the excluded classes is greater than 0.3, use segreg_poly + N <- matrix(NA, sum(id1), input.data$ploidy+1) + ct <- 1 + for(i in which(id1)){ + N[ct,] <- Ds[dp[i]+1, dq[i]+1, ] + ct <- ct+1 + } + input.data$geno[id1,-c(1:2)] <- N + # if the sum of the excluded classes is greater than zero + # and smaller than 0.3, assign zero to those classes and normalize the vector + input.data$geno[,-c(1:2)][R > 0] <- 0 + input.data$geno[,-c(1:2)] <- sweep(input.data$geno[,-c(1:2)], 1, rowSums(input.data$geno[,-c(1:2)]), FUN = "/") + if(is.null(prob.thres)) + prob.thres <- input.data$prob.thres + geno.dose <- dist_prob_to_class(geno = input.data$geno, prob.thres = prob.thres) + if(geno.dose$flag) + { + input.data$geno <- geno.dose$geno + input.data$geno.dose <- geno.dose$geno.dose + } else { + input.data$geno.dose <- geno.dose$geno.dose + } + input.data$geno.dose[is.na(input.data$geno.dose)] <- ploidy + 1 + input.data$n.ind <- ncol(input.data$geno.dose) + input.data$ind.names <- colnames(input.data$geno.dose) + return(input.data) +} + +#' Filter missing genotypes +#' +#' Excludes markers or individuals based on their proportion of missing data +#' +#' @param input.data an object of class \code{mappoly.data} +#' +#' @param type one of the following options: +#' \code{'marker'}{filter out markers based on their percentage of missing data (default)} +#' \code{'individual'}{filter out individuals based on their percentage of missing data} +#' Please notice that removing individuals with certain amount of data can change some marker parameters +#' (such as depth), and can also change the estimated genotypes for other individuals. +#' So be careful when removing individuals. +#' +#' @param filter.thres maximum percentage of missing data (default = 0.2) +#' +#' @param inter if \code{TRUE}, expects user-input to proceed with filtering +#' +#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} +#' @examples +#' plot(tetra.solcap) +#' dat.filt.mrk <- filter_missing(input.data = tetra.solcap, +#' type = "marker", +#' filter.thres = 0.1, +#' inter = TRUE) +#' plot(dat.filt.mrk) +#' @export +#' @importFrom magrittr "%>%" +#' @importFrom dplyr filter +#' @importFrom graphics axis +filter_missing <- function(input.data, + type = c("marker", "individual"), + filter.thres = 0.2, + inter = TRUE) +{ + if (!inherits(input.data, "mappoly.data")) { + stop(deparse(substitute(input.data)), " is not an object of class 'mappoly.data'") + } + type <- match.arg(type) + switch(type, + marker = filter_missing_mrk(input.data, + filter.thres = filter.thres, + inter = inter), + individual = filter_missing_ind(input.data, + filter.thres = filter.thres, + inter = inter) + ) +} + +#' Filter markers based on missing genotypes +#' +#' @param input.data an object of class \code{"mappoly.data"} +#' @param filter.thres maximum percentage of missing data +#' @param inter if \code{TRUE}, expects user-input to proceed with filtering +#' @keywords internal +filter_missing_mrk <- function(input.data, filter.thres = 0.2, inter = TRUE) +{ + op <- par(pty="s") + on.exit(par(op)) + ANSWER <- "flag" + if(interactive() && inter) + { + while(substr(ANSWER, 1, 1) != "y" && substr(ANSWER, 1, 1) != "yes" && substr(ANSWER, 1, 1) != "Y" && ANSWER != "") + { + na.num <- apply(input.data$geno.dose, 1, function(x,ploidy) sum(x == ploidy+1), ploidy = input.data$ploidy) + perc.na <- na.num/input.data$n.ind + x <- sort(perc.na) + plot(x, + xlab = "markers", + ylab = "frequency of missing data", + col = ifelse(x <= filter.thres, 4, 2), + pch =ifelse(x <= filter.thres, 1, 4)) + abline(h = filter.thres, lty = 2) + f<-paste0("Filtered out: ", sum(perc.na > filter.thres)) + i<-paste0("Included: ", sum(perc.na <= filter.thres)) + legend("topleft", c(f, i) , col = c(2,4), pch = c(4,1)) + ANSWER <- readline("Enter 'Y/n' to proceed or update the filter threshold: ") + if(substr(ANSWER, 1, 1) == "n" | substr(ANSWER, 1, 1) == "no" | substr(ANSWER, 1, 1) == "N") + stop("Stop function.") + if(substr(ANSWER, 1, 1) != "y" && substr(ANSWER, 1, 1) != "yes" && substr(ANSWER, 1, 1) != "Y" && ANSWER != "") + filter.thres <- as.numeric(ANSWER) + } + mrks.id <- which(perc.na <= filter.thres) + if(length(mrks.id) == input.data$n.mrk){ + return(input.data) + } + out.dat <- sample_data(input.data, type = "markers", selected.mrk = names(mrks.id)) + return(out.dat) + } + else{ + na.num <- apply(input.data$geno.dose, 1, function(x,ploidy) sum(x == ploidy+1), ploidy = input.data$ploidy) + perc.na <- na.num/input.data$n.ind + mrks.id <- which(perc.na <= filter.thres) + if(length(mrks.id) == input.data$n.mrk){ + return(input.data) + } + out.dat <- sample_data(input.data, type = "markers", selected.mrk = names(mrks.id)) + return(out.dat) + } +} + +#' Filter individuals based on missing genotypes +#' +#' @param input.data an object of class \code{"mappoly.data"} +#' @param filter.thres maximum percentage of missing data +#' @param inter if \code{TRUE}, expects user-input to proceed with filtering +#' @keywords internal +#' @importFrom magrittr "%>%" +#' @importFrom dplyr filter +#' @importFrom graphics axis +filter_missing_ind <- function(input.data, filter.thres = 0.2, inter = TRUE) +{ + op <- par(pty="s") + on.exit(par(op)) + ANSWER <- "flag" + ind <- NULL + if(interactive() && inter) + { + while(substr(ANSWER, 1, 1) != "y" && substr(ANSWER, 1, 1) != "yes" && substr(ANSWER, 1, 1) != "Y" && ANSWER != "") + { + na.num <- apply(input.data$geno.dose, 2, function(x,ploidy) sum(x == ploidy+1), ploidy = input.data$ploidy) + perc.na <- na.num/input.data$n.mrk + x <- sort(perc.na) + plot(x, + xlab = "offspring", + ylab = "frequency of missing data", + col = ifelse(x <= filter.thres, 4, 2), + pch =ifelse(x <= filter.thres, 1, 4)); + abline(h = filter.thres, lty = 2) + f<-paste0("Filtered out: ", sum(perc.na > filter.thres)) + i<-paste0("Included: ", sum(perc.na <= filter.thres)) + legend("topleft", c(f, i) , col = c(2,4), pch = c(4,1)) + ANSWER <- readline("Enter 'Y/n' to proceed or update the filter threshold: ") + if(substr(ANSWER, 1, 1) == "n" | substr(ANSWER, 1, 1) == "no" | substr(ANSWER, 1, 1) == "N") + stop("You decided to stop the function.") + if(substr(ANSWER, 1, 1) != "y" && substr(ANSWER, 1, 1) != "yes" && substr(ANSWER, 1, 1) != "Y" && ANSWER != "") + filter.thres <- as.numeric(ANSWER) + } + ind.id <- which(perc.na <= filter.thres) + if(length(ind.id) == input.data$n.ind){ + return(input.data) + } + ind <- names(ind.id) + out.dat <- sample_data(input.data, type = "individual", selected.ind = names(ind.id)) + return(out.dat) + } + else{ + na.num <- apply(input.data$geno.dose, 2, function(x,ploidy) sum(x == ploidy+1), ploidy = input.data$ploidy) + perc.na <- na.num/input.data$n.mrk + ind.id <- which(perc.na <= filter.thres) + if(length(ind.id) == input.data$n.ind){ + return(input.data) + } + ind <- names(ind.id) + out.dat <- sample_data(input.data, type = "individual", selected.ind = names(ind.id)) + return(out.dat) + } +} + +#' Filter markers based on chi-square test +#' +#' This function filter markers based on p-values of a chi-square test. +#' The chi-square test assumes that markers follow the expected segregation +#' patterns under Mendelian inheritance, random chromosome bivalent +#' pairing and no double reduction. +#' +#' @param input.obj name of input object (class \code{mappoly.data}) +#' +#' @param chisq.pval.thres p-value threshold used for chi-square tests +#' (default = Bonferroni aproximation with global alpha of 0.05, i.e., +#' 0.05/n.mrk) +#' +#' @param inter if TRUE (default), plots distorted vs. non-distorted markers +#' +#' @return An object of class \code{mappoly.chitest.seq} which contains a list with the following components: +#' \item{keep}{markers that follow Mendelian segregation pattern} +#' \item{exclude}{markers with distorted segregation} +#' \item{chisq.pval.thres}{threshold p-value used for chi-square tests} +#' \item{data.name}{input dataset used to perform the chi-square tests} +#' +#'@examples +#' mrks.chi.filt <- filter_segregation(input.obj = tetra.solcap, +#' chisq.pval.thres = 0.05/tetra.solcap$n.mrk, +#' inter = TRUE) +#' seq.init <- make_seq_mappoly(mrks.chi.filt) +#' +#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} +#' +#' @importFrom graphics axis +#' @export +filter_segregation <- function(input.obj, chisq.pval.thres = NULL, inter = TRUE){ + op <- par(pty="s") + on.exit(par(op)) + if(inherits(input.obj, c("mappoly.data"))){ + chisq.val <- input.obj$chisq.pval + n.mrk <- input.obj$n.mrk + data.name <- as.character(sys.call())[2] + } else if (inherits(input.obj, c("mappoly.sequence"))){ + chisq.val <- input.obj$chisq.pval[input.obj$seq.mrk.names] + n.mrk <- length(input.obj$seq.num) + data.name <- input.obj$data.name + } else { + stop(deparse(substitute(input.obj)), + " is not an object of class 'mappoly.data' or 'mappoly.sequence'") + } + ##Bonferroni approx + if(is.null(chisq.pval.thres)) + chisq.pval.thres <- 0.05/n.mrk + ANSWER <- "flag" + + if(interactive() && inter) + { + while(substr(ANSWER, 1, 1) != "y" && substr(ANSWER, 1, 1) != "yes" && substr(ANSWER, 1, 1) != "Y" && ANSWER != "") + { + x <- log10(sort(chisq.val, decreasing = TRUE)) + th <- log10(chisq.pval.thres) + plot(x, + xlab = "markers", + ylab = bquote(log[10](P)), + col = ifelse(x <= th, 2, 4), + pch =ifelse(x <= th, 4, 1)) + abline(h = th, lty = 2) + f<-paste0("Filtered out: ", sum(x < th)) + i<-paste0("Included: ", sum(x >= th)) + legend("bottomleft", c(f, i) , col = c(2,4), pch = c(4,1)) + ANSWER <- readline("Enter 'Y/n' to proceed or update the p value threshold: ") + if(substr(ANSWER, 1, 1) == "n" | substr(ANSWER, 1, 1) == "no" | substr(ANSWER, 1, 1) == "N") + stop("You decided to stop the function.") + if(substr(ANSWER, 1, 1) != "y" && substr(ANSWER, 1, 1) != "yes" && substr(ANSWER, 1, 1) != "Y" && ANSWER != "") + chisq.pval.thres <- as.numeric(ANSWER) + } + } + keep <- names(which(chisq.val >= chisq.pval.thres)) + exclude <- names(which(chisq.val < chisq.pval.thres)) + structure(list(keep = keep, exclude = exclude, chisq.pval.thres = chisq.pval.thres, + data.name = data.name), + class = "mappoly.chitest.seq") +} + +#' Filter out individuals +#' +#' This function removes individuals from the data set. Individuals can be +#' user-defined or can be accessed via interactive kinship analysis. +#' +#' @param input.data name of input object (class \code{mappoly.data}) +#' +#' @param ind.to.remove individuals to be removed. If \code{NULL} it opens +#' an interactive graphic to proceed with the individual +#' selection +#' @param inter if \code{TRUE}, expects user-input to proceed with filtering +#' +#' @param verbose if \code{TRUE} (default), shows the filtered out individuals +#' +#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} +#' +#' @export +#' +filter_individuals <- function(input.data, ind.to.remove = NULL, inter = TRUE, verbose = TRUE){ + if (!inherits(input.data, "mappoly.data")) { + stop(deparse(substitute(input.data)), " is not an object of class 'mappoly.data'") + } + op <- par(pty="s") + on.exit(par(op)) + # if (!require(AGHmatrix)) + # stop("Please install package 'AGHmatrix' to proceed") + D <- t(input.data$geno.dose) + D[D == input.data$ploidy+1] <- NA + D <- rbind(input.data$dosage.p1, input.data$dosage.p2, D) + rownames(D)[1:2] <- c("P1", "P2") + G <- AGHmatrix::Gmatrix(D, method = "VanRaden",ploidy = input.data$ploidy) + x <- G[1,] + y <- G[2,] + #a <- 2*atan(y/x)/pi + #b <- sqrt(x^2 + y^2) + df <- data.frame(x = x, y = y, type = c(2, 2, rep(4, length(x)-2))) + plot(df[,1:2], col = df$type, pch = 19, + xlab = "relationships between the offspring and P1", + ylab = "relationships between the offspring and P2") + abline(c(0,1), lty = 2) + abline(c(-0.4,1), lty = 2, col = "gray") + abline(c(0.4,1), lty = 2, col = "gray") + legend("topright", c("Parents", "Offspring") , col = c(2,4), pch = 19) + if(!is.null(ind.to.remove)){ + out.data <- sample_data(input.data, selected.ind = setdiff(input.data$ind.names, ind.to.remove)) + return(out.data) + } + if(interactive() && inter) + { + # if (!require(gatepoints)) + # stop("Please install package 'gatepoints' to proceed") + ANSWER <- readline("Enter 'Y/n' to proceed with interactive filtering or quit: ") + if(substr(ANSWER, 1, 1) == "y" | substr(ANSWER, 1, 1) == "yes" | substr(ANSWER, 1, 1) == "Y" | ANSWER == "") + { + ind.to.remove <- gatepoints::fhs(df, mark = TRUE) + ind.to.remove <- setdiff(ind.to.remove, c("P1", "P2")) + ind.to.include <- setdiff(rownames(df)[-c(1:2)], ind.to.remove) + if(verbose){ + cat("Removing individual(s): \n") + print(ind.to.remove) + cat("...\n") + } + if(length(ind.to.remove) == 0){ + warning("No individuals removed. Returning original data set.") + return(input.data) + } + out.data <- sample_data(input.data, selected.ind = ind.to.include) + return(out.data) + } else{ + warning("No individuals removed. Returning original data set.") + return(input.data) + } + } +} +#' Remove markers that do not meet a LOD criteria +#' +#' Remove markers that do not meet a LOD and recombination fraction +#' criteria for at least a percentage of the pairwise marker +#' combinations. It also removes markers with strong evidence of +#' linkage across the whole linkage group (false positive). +#' +#' \code{thresh.LOD.ph} should be set in order to only select +#' recombination fractions that have LOD scores associated to the +#' linkage phase configuration higher than \code{thresh_LOD_ph} +#' when compared to the second most likely linkage phase configuration. +#' That action usually eliminates markers that are unlinked to the +#' set of analyzed markers. +#' +#' @param input.twopt an object of class \code{mappoly.twopt} +#' +#' @param thresh.LOD.ph LOD score threshold for linkage phase configuration +#' (default = 5) +#' +#' @param thresh.LOD.rf LOD score threshold for recombination fraction +#' (default = 5) +#' +#' @param thresh.rf threshold for recombination fractions (default = 0.15) +#' +#' @param probs indicates the probability corresponding to the filtering +#' quantiles. (default = c(0.05, 1)) +#' +#' @param diag.markers A window where marker pairs should be considered. +#' If NULL (default), all markers are considered. +#' +#' @param mrk.order marker order. Only has effect if 'diag.markers' is not NULL +#' +#' @param ncpus number of parallel processes (i.e. cores) to spawn +#' (default = 1) +#' +#' @param diagnostic.plot if \code{TRUE} produces a diagnostic plot +#' +#' @param breaks number of cells for the histogram +#' +#' @return A filtered object of class \code{mappoly.sequence}. +#' See \code{\link[mappoly]{make_seq_mappoly}} for details +#' +#' @examples +#' all.mrk <- make_seq_mappoly(hexafake, 1:20) +#' red.mrk <- elim_redundant(all.mrk) +#' unique.mrks <- make_seq_mappoly(red.mrk) +#' all.pairs <- est_pairwise_rf(input.seq = unique.mrks, +#' ncpus = 1, +#' verbose = TRUE) +#' +#' ## Full recombination fraction matrix +#' mat.full <- rf_list_to_matrix(input.twopt = all.pairs) +#' plot(mat.full) +#' +#' ## Removing disruptive SNPs +#' tpt.filt <- rf_snp_filter(all.pairs, 2, 2, 0.07, probs = c(0.15, 1)) +#' p1.filt <- make_pairs_mappoly(input.seq = tpt.filt, input.twopt = all.pairs) +#' m1.filt <- rf_list_to_matrix(input.twopt = p1.filt) +#' plot(mat.full, main.text = "LG1") +#' plot(m1.filt, main.text = "LG1.filt") +#' +#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} with updates by Gabriel Gesteira, \email{gdesiqu@ncsu.edu} +#' +#' @references +#' Mollinari, M., and Garcia, A. A. F. (2019) Linkage +#' analysis and haplotype phasing in experimental autopolyploid +#' populations with high ploidy level using hidden Markov +#' models, _G3: Genes, Genomes, Genetics_. +#' \doi{10.1534/g3.119.400378} +#' +#' @export rf_snp_filter +#' @importFrom ggplot2 ggplot geom_histogram aes scale_fill_manual xlab ggtitle +#' @importFrom graphics hist +rf_snp_filter <- function(input.twopt, + thresh.LOD.ph = 5, + thresh.LOD.rf = 5, + thresh.rf = 0.15, + probs = c(0.05, 1), + diag.markers = NULL, + mrk.order = NULL, + ncpus = 1L, + diagnostic.plot = TRUE, + breaks = 100) +{ + + input_classes <- c("mappoly.twopt", "mappoly.twopt2") + if (!inherits(input.twopt, input_classes)) { + stop(deparse(substitute(input.twopt)), paste0(" is not an object of class ", paste0(input_classes, collapse = " or "))) + } + probs <- range(probs) + ## Getting filtered rf matrix + rf_mat <- rf_list_to_matrix(input.twopt = input.twopt, thresh.LOD.ph = thresh.LOD.ph, + thresh.LOD.rf = thresh.LOD.rf, thresh.rf = thresh.rf, + ncpus = ncpus, verbose = FALSE) + M <- rf_mat$rec.mat + if(!is.null(mrk.order)) + M <- M[mrk.order, mrk.order] + if(!is.null(diag.markers)) + M[abs(col(M) - row(M)) > diag.markers] <- NA + x <- apply(M, 1, function(x) sum(!is.na(x))) + w <- hist(x, breaks = breaks, plot = FALSE) + th <- quantile(x, probs = probs) + rem <- c(which(x < th[1]), which(x > th[2])) + ids <- names(which(x >= th[1] & x <= th[2])) + value <- type <- NULL + if(diagnostic.plot){ + d <- rbind(data.frame(type = "original", value = x), + data.frame(type = "filtered", value = x[ids])) + p <- ggplot2::ggplot(d, ggplot2::aes(value)) + + ggplot2::geom_histogram(ggplot2::aes(fill = type), + alpha = 0.4, position = "identity", binwidth = diff(w$mids)[1]) + + ggplot2::scale_fill_manual(values = c("#00AFBB", "#E7B800")) + + ggplot2::ggtitle( paste0("Filtering probs: [", probs[1], " : ", probs[2], "] - Non NA values by row in rf matrix - b width: ", diff(w$mids)[1])) + + ggplot2::xlab(paste0("Non 'NA' values at LOD.ph = ", thresh.LOD.ph, ", LOD.rf = ", thresh.LOD.rf, ", and thresh.rf = ", thresh.rf)) + print(p) + } + ## Returning sequence object + ch_filt <- make_seq_mappoly(input.obj = get(input.twopt$data.name, pos = 1), arg = ids, data.name = input.twopt$data.name) + return(ch_filt) +} + +#' Edit sequence ordered by reference genome positions +#' comparing to another set order +#' +#' @param input.seq object of class mappoly.sequence with alternative order (not genomic order) +#' @param invert vector of marker names to be inverted +#' @param remove vector of marker names to be removed +#' +#' @author Cristiane Taniguti, \email{chtaniguti@tamu.edu} +#' +#' @examples +#' \donttest{ +#' dat <- filter_segregation(tetra.solcap, inter = FALSE) +#' seq_dat <- make_seq_mappoly(dat) +#' seq_chr <- make_seq_mappoly(seq_dat, arg = seq_dat$seq.mrk.names[which(seq_dat$chrom=="1")]) +#' +#' tpt <- est_pairwise_rf(seq_chr) +#' seq.filt <- rf_snp_filter(tpt, probs = c(0.05, 0.95)) +#' mat <- rf_list_to_matrix(tpt) +#' mat2 <- make_mat_mappoly(mat, seq.filt) +#' +#' seq_test_mds <- mds_mappoly(mat2) +#' seq_mds <- make_seq_mappoly(seq_test_mds) +#' edit_seq <- edit_order(input.seq = seq_mds) +#' } +#' +#' @return object of class \code{mappoly.edit.order}: a list containing +#' vector of marker names ordered according to editions (`edited_order`); +#' vector of removed markers names (`removed`); +#' vector of inverted markers names (`inverted`). +#' +#' @export +edit_order <- function(input.seq, invert= NULL, remove = NULL){ + + if (!inherits(input.seq, "mappoly.sequence")) { + stop(deparse(substitute(input.seq)), " is not an object of class 'mappoly.sequence'") + } + + get_weird <- data.frame(x = 1:length(input.seq$genome.pos), + y = input.seq$genome.pos) + + rownames(get_weird) <- input.seq$seq.mrk.names + get_weird <- get_weird[order(get_weird$y),] + plot(get_weird$x, get_weird$y, xlab="input sequence order", ylab = "genomic position (bp)") + + inverted <- removed <- vector() + if(!is.null(invert) | !is.null(remove)){ + if(!is.null(invert)){ + inverted <- c(inverted, as.vector(invert)) + repl <- get_weird[rev(match(as.vector(invert),rownames(get_weird))),] + get_weird[match(as.vector(invert),rownames(get_weird)),2] <- repl[,2] + rownames(get_weird)[match(as.vector(invert), rownames(get_weird))] <- rownames(repl) + } + if(!is.null(remove)){ + removed <- c(removed, as.vector(remove)) + get_weird <- get_weird[-match(remove, rownames(get_weird)),] + } + plot(get_weird$x, get_weird$y, xlab="input sequence order", ylab = "genomic position (bp)") + } else { + cat("Mark at least three points on the plot and press `Esc` to continue.") + if(interactive()){ + ANSWER <- "Y" + while(substr(ANSWER, 1, 1) == "y" | substr(ANSWER, 1, 1) == "yes" | substr(ANSWER, 1, 1) == "Y" | ANSWER == ""){ + plot(get_weird$x, get_weird$y, xlab="input sequence order", ylab = "genomic position (bp)") + mks.to.remove <- gatepoints::fhs(get_weird, mark = TRUE) + if(length(which(rownames(get_weird) %in% mks.to.remove)) > 0){ + ANSWER2 <- readline("Enter 'invert/remove' to proceed with the edition: ") + if(ANSWER2 == "invert"){ + inverted <- c(inverted, as.vector(mks.to.remove)) + repl <- get_weird[rev(match(as.vector(mks.to.remove),rownames(get_weird))),] + get_weird[match(as.vector(mks.to.remove),rownames(get_weird)),2] <- repl[,2] + rownames(get_weird)[match(as.vector(mks.to.remove), rownames(get_weird))] <- rownames(repl) + } else { + removed <- c(removed, as.vector(mks.to.remove)) + get_weird <- get_weird[-match(mks.to.remove, rownames(get_weird)),] + } + } + ANSWER <- readline("Enter 'Y/n' to proceed with interactive edition or quit: ") + } + plot(get_weird$x, get_weird$y, xlab="input sequence order", ylab = "genomic position (bp)") + } + } + + return(structure(list(edited_order = rownames(get_weird), + removed = removed, + inverted = inverted, + data.name = input.seq$data.name), class = "mappoly.edit.order")) +} + +#' Filter aneuploid chromosomes from progeny individuals +#' +#' @param input.data name of input object (class \code{mappoly.data}) +#' +#' @param aneuploid.info data.frame with ploidy information by chromosome (columns) for each individual +#' in progeny (rows). The chromosome and individuals names must match the ones in the file used as input +#' in mappoly. +#' +#' @param ploidy main ploidy +#' +#' @param rm_missing remove also genotype information from chromosomes with missing data (NA) in the aneuploid.info file +#' +#' @examples +#' aneuploid.info <- matrix(4, nrow=tetra.solcap$n.ind, ncol = 12) +#' set.seed(8080) +#' aneuploid.info[sample(1:length(aneuploid.info), round((4*length(aneuploid.info))/100),0)] <- 3 +#' aneuploid.info[sample(1:length(aneuploid.info), round((4*length(aneuploid.info))/100),0)] <- 5 +#' +#' colnames(aneuploid.info) <- paste0(1:12) +#' aneuploid.info <- cbind(inds = tetra.solcap$ind.names, aneuploid.info) +#' +#' filt.dat <- filter_aneuploid(input.data = tetra.solcap, +#' aneuploid.info = aneuploid.info, ploidy = 4) +#' +#' @author Cristiane Taniguti, \email{chtaniguti@tamu.edu} +#' +#' @return object of class \code{mappoly.data} +#' +#' @export +filter_aneuploid <- function(input.data, aneuploid.info, ploidy, rm_missing = TRUE){ + + if (!inherits(input.data, "mappoly.data")) { + stop(deparse(substitute(input.data)), " is not an object of class 'mappoly.data'") + } + + aneu.chroms <- colnames(aneuploid.info)[-1] + + keep.ind <- match(input.data$ind.names, aneuploid.info[,1]) + + if(length(keep.ind) != length(input.data$ind.names) | any(is.na(keep.ind))) + stop("The aneuploid information ploidy is missing at least one sample.") + + aneuploid.info.filt <- aneuploid.info[keep.ind,] + + idx.list <- apply(aneuploid.info.filt[,-1], 1, function(x) which(x != ploidy)) + names(idx.list) <- aneuploid.info.filt[,1] + + idx.list <- idx.list[-which(!sapply(idx.list, function(x) length(x) > 0))] + + if(rm_missing){ + idx.list.na <- apply(aneuploid.info.filt[,-1], 1, function(x) which(is.na(x))) + if(length(idx.list.na) > 0){ + names(idx.list.na) <- aneuploid.info.filt[,1] + idx.list.na <- idx.list.na[-which(!sapply(idx.list.na, function(x) length(x) > 0))] + idx.list <- c(idx.list, idx.list.na) + } + cat(round((length(unlist(idx.list))/(dim(aneuploid.info.filt)[1]*(dim(aneuploid.info.filt)[2]-1)))*100,2), "% of the chromosomes x individuals are aneuploids or has missing ploidy information\n") + } else + cat(round((length(unlist(idx.list))/(dim(aneuploid.info.filt)[1]*(dim(aneuploid.info.filt)[2]-1)))*100,2), "% of the chromosomes x individuals are aneuploids\n") + + for(i in 1:length(idx.list)){ + idx <- which(input.data$chrom %in% names(idx.list[[i]])) + input.data$geno.dose[idx,names(idx.list[i])] <- ploidy + 1 + } + + return(input.data) +} + diff --git a/man/edit_order.Rd b/man/edit_order.Rd index 8115528c..0646aa9a 100644 --- a/man/edit_order.Rd +++ b/man/edit_order.Rd @@ -5,10 +5,14 @@ \title{Edit sequence ordered by reference genome positions comparing to another set order} \usage{ -edit_order(input.seq) +edit_order(input.seq, invert = NULL, remove = NULL) } \arguments{ \item{input.seq}{object of class mappoly.sequence with alternative order (not genomic order)} + +\item{invert}{vector of marker names to be inverted} + +\item{remove}{vector of marker names to be removed} } \value{ object of class \code{mappoly.edit.order}: a list containing diff --git a/man/filter_aneuploid.Rd b/man/filter_aneuploid.Rd index 8b3614e6..15a638d5 100644 --- a/man/filter_aneuploid.Rd +++ b/man/filter_aneuploid.Rd @@ -4,7 +4,7 @@ \alias{filter_aneuploid} \title{Filter aneuploid chromosomes from progeny individuals} \usage{ -filter_aneuploid(input.data, aneuploid.info, ploidy) +filter_aneuploid(input.data, aneuploid.info, ploidy, rm_missing = TRUE) } \arguments{ \item{input.data}{name of input object (class \code{mappoly.data})} @@ -14,6 +14,8 @@ in progeny (rows). The chromosome and individuals names must match the ones in t in mappoly.} \item{ploidy}{main ploidy} + +\item{rm_missing}{remove also genotype information from chromosomes with missing data (NA) in the aneuploid.info file} } \value{ object of class \code{mappoly.data} diff --git a/man/merge_parental_maps.Rd b/man/merge_parental_maps.Rd index 3debc8f6..7d0ed3e6 100644 --- a/man/merge_parental_maps.Rd +++ b/man/merge_parental_maps.Rd @@ -9,7 +9,8 @@ merge_parental_maps( map.p2, full.seq, full.mat, - method = c("ols", "hmm") + method = c("ols", "hmm"), + verbose = TRUE ) } \arguments{ diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 7e21fe7e..d9f0e2eb 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -1,106 +1,106 @@ -// Generated by using Rcpp::compileAttributes() -> do not edit by hand -// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 - -#include - -using namespace Rcpp; - -#ifdef RCPP_USE_GLOBAL_ROSTREAM -Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); -Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); -#endif - -// vcf_get_probabilities -Rcpp::List vcf_get_probabilities(Rcpp::StringMatrix& mat, int pl_pos); -RcppExport SEXP _mappoly_vcf_get_probabilities(SEXP matSEXP, SEXP pl_posSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< Rcpp::StringMatrix& >::type mat(matSEXP); - Rcpp::traits::input_parameter< int >::type pl_pos(pl_posSEXP); - rcpp_result_gen = Rcpp::wrap(vcf_get_probabilities(mat, pl_pos)); - return rcpp_result_gen; -END_RCPP -} -// vcf_transform_dosage -Rcpp::NumericMatrix vcf_transform_dosage(Rcpp::StringMatrix& mat, int gt_pos); -RcppExport SEXP _mappoly_vcf_transform_dosage(SEXP matSEXP, SEXP gt_posSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< Rcpp::StringMatrix& >::type mat(matSEXP); - Rcpp::traits::input_parameter< int >::type gt_pos(gt_posSEXP); - rcpp_result_gen = Rcpp::wrap(vcf_transform_dosage(mat, gt_pos)); - return rcpp_result_gen; -END_RCPP -} -// vcf_get_ploidy -Rcpp::NumericMatrix vcf_get_ploidy(Rcpp::StringMatrix& mat, int gt_pos); -RcppExport SEXP _mappoly_vcf_get_ploidy(SEXP matSEXP, SEXP gt_posSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< Rcpp::StringMatrix& >::type mat(matSEXP); - Rcpp::traits::input_parameter< int >::type gt_pos(gt_posSEXP); - rcpp_result_gen = Rcpp::wrap(vcf_get_ploidy(mat, gt_pos)); - return rcpp_result_gen; -END_RCPP -} -// vcf_get_depth -Rcpp::NumericMatrix vcf_get_depth(Rcpp::StringMatrix& mat, int dp_pos); -RcppExport SEXP _mappoly_vcf_get_depth(SEXP matSEXP, SEXP dp_posSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< Rcpp::StringMatrix& >::type mat(matSEXP); - Rcpp::traits::input_parameter< int >::type dp_pos(dp_posSEXP); - rcpp_result_gen = Rcpp::wrap(vcf_get_depth(mat, dp_pos)); - return rcpp_result_gen; -END_RCPP -} - -RcppExport SEXP calc_genoprob(void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP calc_genoprob_prior(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP calc_genprob_haplo(void *, void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP calc_genprob_haplo_highprec(void *, void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP calc_genprob_single_parent(void *, void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP est_haplotype_map(void *, void *, void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP est_haplotype_map_highprec(void *, void *, void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP est_hmm_map_single_parent(void *, void *, void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP est_map_hmm(void *, void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP est_map_hmm_highprec(void *, void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP get_counts_single_parent_cpp(void *, void *, void *, void *, void *, void *); -RcppExport SEXP loglike_hmm(void *, void *, void *, void *, void *, void *); -RcppExport SEXP pairwise_rf_estimation_disc(void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP pairwise_rf_estimation_disc_rcpp(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP pairwise_rf_estimation_prob(void *, void *, void *, void *, void *, void *, void *, void *); -RcppExport SEXP poly_hmm_est_CPP(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); - -static const R_CallMethodDef CallEntries[] = { - {"_mappoly_vcf_get_probabilities", (DL_FUNC) &_mappoly_vcf_get_probabilities, 2}, - {"_mappoly_vcf_transform_dosage", (DL_FUNC) &_mappoly_vcf_transform_dosage, 2}, - {"_mappoly_vcf_get_ploidy", (DL_FUNC) &_mappoly_vcf_get_ploidy, 2}, - {"_mappoly_vcf_get_depth", (DL_FUNC) &_mappoly_vcf_get_depth, 2}, - {"calc_genoprob", (DL_FUNC) &calc_genoprob, 7}, - {"calc_genoprob_prior", (DL_FUNC) &calc_genoprob_prior, 12}, - {"calc_genprob_haplo", (DL_FUNC) &calc_genprob_haplo, 8}, - {"calc_genprob_haplo_highprec", (DL_FUNC) &calc_genprob_haplo_highprec, 8}, - {"calc_genprob_single_parent", (DL_FUNC) &calc_genprob_single_parent, 8}, - {"est_haplotype_map", (DL_FUNC) &est_haplotype_map, 9}, - {"est_haplotype_map_highprec", (DL_FUNC) &est_haplotype_map_highprec, 9}, - {"est_hmm_map_single_parent", (DL_FUNC) &est_hmm_map_single_parent, 9}, - {"est_map_hmm", (DL_FUNC) &est_map_hmm, 8}, - {"est_map_hmm_highprec", (DL_FUNC) &est_map_hmm_highprec, 8}, - {"get_counts_single_parent_cpp", (DL_FUNC) &get_counts_single_parent_cpp, 6}, - {"loglike_hmm", (DL_FUNC) &loglike_hmm, 6}, - {"pairwise_rf_estimation_disc", (DL_FUNC) &pairwise_rf_estimation_disc, 7}, - {"pairwise_rf_estimation_disc_rcpp", (DL_FUNC) &pairwise_rf_estimation_disc_rcpp, 13}, - {"pairwise_rf_estimation_prob", (DL_FUNC) &pairwise_rf_estimation_prob, 8}, - {"poly_hmm_est_CPP", (DL_FUNC) &poly_hmm_est_CPP, 13}, - {NULL, NULL, 0} -}; - -RcppExport void R_init_mappoly(DllInfo *dll) { - R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); - R_useDynamicSymbols(dll, FALSE); -} +// Generated by using Rcpp::compileAttributes() -> do not edit by hand +// Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 + +#include + +using namespace Rcpp; + +#ifdef RCPP_USE_GLOBAL_ROSTREAM +Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); +Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); +#endif + +// vcf_get_probabilities +Rcpp::List vcf_get_probabilities(Rcpp::StringMatrix& mat, int pl_pos); +RcppExport SEXP _mappoly_vcf_get_probabilities(SEXP matSEXP, SEXP pl_posSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::StringMatrix& >::type mat(matSEXP); + Rcpp::traits::input_parameter< int >::type pl_pos(pl_posSEXP); + rcpp_result_gen = Rcpp::wrap(vcf_get_probabilities(mat, pl_pos)); + return rcpp_result_gen; +END_RCPP +} +// vcf_transform_dosage +Rcpp::NumericMatrix vcf_transform_dosage(Rcpp::StringMatrix& mat, int gt_pos); +RcppExport SEXP _mappoly_vcf_transform_dosage(SEXP matSEXP, SEXP gt_posSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::StringMatrix& >::type mat(matSEXP); + Rcpp::traits::input_parameter< int >::type gt_pos(gt_posSEXP); + rcpp_result_gen = Rcpp::wrap(vcf_transform_dosage(mat, gt_pos)); + return rcpp_result_gen; +END_RCPP +} +// vcf_get_ploidy +Rcpp::NumericMatrix vcf_get_ploidy(Rcpp::StringMatrix& mat, int gt_pos); +RcppExport SEXP _mappoly_vcf_get_ploidy(SEXP matSEXP, SEXP gt_posSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::StringMatrix& >::type mat(matSEXP); + Rcpp::traits::input_parameter< int >::type gt_pos(gt_posSEXP); + rcpp_result_gen = Rcpp::wrap(vcf_get_ploidy(mat, gt_pos)); + return rcpp_result_gen; +END_RCPP +} +// vcf_get_depth +Rcpp::NumericMatrix vcf_get_depth(Rcpp::StringMatrix& mat, int dp_pos); +RcppExport SEXP _mappoly_vcf_get_depth(SEXP matSEXP, SEXP dp_posSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< Rcpp::StringMatrix& >::type mat(matSEXP); + Rcpp::traits::input_parameter< int >::type dp_pos(dp_posSEXP); + rcpp_result_gen = Rcpp::wrap(vcf_get_depth(mat, dp_pos)); + return rcpp_result_gen; +END_RCPP +} + +RcppExport SEXP calc_genoprob(void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP calc_genoprob_prior(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP calc_genprob_haplo(void *, void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP calc_genprob_haplo_highprec(void *, void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP calc_genprob_single_parent(void *, void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP est_haplotype_map(void *, void *, void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP est_haplotype_map_highprec(void *, void *, void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP est_hmm_map_single_parent(void *, void *, void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP est_map_hmm(void *, void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP est_map_hmm_highprec(void *, void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP get_counts_single_parent_cpp(void *, void *, void *, void *, void *, void *); +RcppExport SEXP loglike_hmm(void *, void *, void *, void *, void *, void *); +RcppExport SEXP pairwise_rf_estimation_disc(void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP pairwise_rf_estimation_disc_rcpp(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP pairwise_rf_estimation_prob(void *, void *, void *, void *, void *, void *, void *, void *); +RcppExport SEXP poly_hmm_est_CPP(void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *, void *); + +static const R_CallMethodDef CallEntries[] = { + {"_mappoly_vcf_get_probabilities", (DL_FUNC) &_mappoly_vcf_get_probabilities, 2}, + {"_mappoly_vcf_transform_dosage", (DL_FUNC) &_mappoly_vcf_transform_dosage, 2}, + {"_mappoly_vcf_get_ploidy", (DL_FUNC) &_mappoly_vcf_get_ploidy, 2}, + {"_mappoly_vcf_get_depth", (DL_FUNC) &_mappoly_vcf_get_depth, 2}, + {"calc_genoprob", (DL_FUNC) &calc_genoprob, 7}, + {"calc_genoprob_prior", (DL_FUNC) &calc_genoprob_prior, 12}, + {"calc_genprob_haplo", (DL_FUNC) &calc_genprob_haplo, 8}, + {"calc_genprob_haplo_highprec", (DL_FUNC) &calc_genprob_haplo_highprec, 8}, + {"calc_genprob_single_parent", (DL_FUNC) &calc_genprob_single_parent, 8}, + {"est_haplotype_map", (DL_FUNC) &est_haplotype_map, 9}, + {"est_haplotype_map_highprec", (DL_FUNC) &est_haplotype_map_highprec, 9}, + {"est_hmm_map_single_parent", (DL_FUNC) &est_hmm_map_single_parent, 9}, + {"est_map_hmm", (DL_FUNC) &est_map_hmm, 8}, + {"est_map_hmm_highprec", (DL_FUNC) &est_map_hmm_highprec, 8}, + {"get_counts_single_parent_cpp", (DL_FUNC) &get_counts_single_parent_cpp, 6}, + {"loglike_hmm", (DL_FUNC) &loglike_hmm, 6}, + {"pairwise_rf_estimation_disc", (DL_FUNC) &pairwise_rf_estimation_disc, 7}, + {"pairwise_rf_estimation_disc_rcpp", (DL_FUNC) &pairwise_rf_estimation_disc_rcpp, 13}, + {"pairwise_rf_estimation_prob", (DL_FUNC) &pairwise_rf_estimation_prob, 8}, + {"poly_hmm_est_CPP", (DL_FUNC) &poly_hmm_est_CPP, 13}, + {NULL, NULL, 0} +}; + +RcppExport void R_init_mappoly(DllInfo *dll) { + R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); + R_useDynamicSymbols(dll, FALSE); +}