diff --git a/R/build_map_by_steps.R b/R/build_map_by_steps.R index 8cf6596d..5c27b8a4 100644 --- a/R/build_map_by_steps.R +++ b/R/build_map_by_steps.R @@ -197,7 +197,7 @@ update_framework_map <- function(input.map.list, method = "hmm", verbose = verbose) - a <- rem_mrk_clusters(cur.res$map, + a <- rem_mrk_clusters(input.map = cur.res$map, size.rem.cluster = size.rem.cluster, gap.threshold = gap.threshold) @@ -337,7 +337,7 @@ add_md_markers <- function(input.map, thresh = 500, extend.tail = 50, method = c("hmm", "wMDS_to_1D_pc"), - verbose = T){ + verbose = TRUE){ method <- match.arg(method) if(method == "wMDS_to_1D_pc" & is.null(input.mds)) stop("you must provide 'input.mds' when selecting method = 'wMDS_to_1D_pc'", call. = FALSE) @@ -435,6 +435,7 @@ rem_mrk_clusters <- function(input.map, id <- cbind(c(1, id+1), c(id, input.map$info$n.mrk)) ## Selecting map segments larger then the specified threshold segments <- id[apply(id, 1, diff) > size.rem.cluster - 1, , drop = FALSE] + if(dim(segments)[1] == 0) stop("All markers were discarted using the defined gap.threshold.") id<-NULL for(i in 1:nrow(segments)){ id <- c(id, segments[i,1]:segments[i,2]) diff --git a/R/est_map_hmm.R b/R/est_map_hmm.R index c48492b0..48bbc4a4 100755 --- a/R/est_map_hmm.R +++ b/R/est_map_hmm.R @@ -1,1134 +1,1139 @@ -#' Multipoint analysis using Hidden Markov Models in autopolyploids -#' -#' Performs the multipoint analysis proposed by \cite{Mollinari and -#' Garcia (2019)} in a sequence of markers -#' -#' This function first enumerates a set of linkage phase configurations -#' based on two-point recombination fraction information using a threshold -#' provided by the user (argument \code{thresh}). After that, for each -#' configuration, it reconstructs the genetic map using the -#' HMM approach described in Mollinari and Garcia (2019). As result, it returns -#' the multipoint likelihood for each configuration in form of LOD Score comparing -#' each configuration to the most likely one. It is recommended to use a small number -#' of markers (e.g. 50 markers for hexaploids) since the possible linkage -#' phase combinations bounded only by the two-point information can be huge. -#' Also, it can be quite sensible to small changes in \code{'thresh'}. -#' For a large number of markers, please see \code{\link[mappoly]{est_rf_hmm_sequential}}. -# -#' @param input.seq an object of class \code{mappoly.sequence} -#' -#' @param input.ph an object of class \code{two.pts.linkage.phases}. -#' If not available (default = NULL), it will be computed -#' -#' @param thres LOD Score threshold used to determine if the linkage phases -#' compared via two-point analysis should be considered. Smaller -#' values will result in smaller number of linkage phase -#' configurations to be evaluated by the multipoint algorithm. -#' -#' @param twopt an object of class \code{mappoly.twopt} -#' containing two-point information -#' -#' @param verbose if \code{TRUE}, current progress is shown; if -#' \code{FALSE} (default), no output is produced -#' -#' @param tol the desired accuracy (default = 1e-04) -#' -#' @param est.given.0.rf logical. If TRUE returns a map forcing all -#' recombination fractions equals to 0 (1e-5, for internal use only. -#' Default = FALSE) -#' -#' @param reestimate.single.ph.configuration logical. If \code{TRUE} -#' returns a map without re-estimating the map parameters for cases -#' where there is only one possible linkage phase configuration. -#' This argument is intended to be used in a sequential map construction -#' -#' @param high.prec logical. If \code{TRUE} (default) uses high precision -#' long double numbers in the HMM procedure -#' -#' @param x an object of the class \code{mappoly.map} -#' -#' @param detailed logical. if TRUE, prints the linkage phase configuration and the marker -#' position for all maps. If FALSE (default), prints a map summary -#' -#' @param left.lim the left limit of the plot (in cM, default = 0). -#' -#' @param right.lim the right limit of the plot (in cM, default = Inf, i.e., -#' will print the entire map) -#' -#' @param phase logical. If \code{TRUE} (default) plots the phase configuration -#' for both parents -#' -#' @param mrk.names if TRUE, marker names are displayed (default = FALSE) -#' -#' @param cex The magnification to be used for marker names -#' -#' @param config should be \code{'best'} or the position of the -#' configuration to be plotted. If \code{'best'}, plot the configuration -#' with the highest likelihood -#' -#' @param P a string containing the name of parent P -#' -#' @param Q a string containing the name of parent Q -#' -#' @param xlim range of the x-axis. If \code{xlim = NULL} (default) it uses the -#' map range. -#' -#' @param ... currently ignored -#' -#' @return A list of class \code{mappoly.map} with two elements: -#' -#' i) info: a list containing information about the map, regardless of the linkage phase configuration: -#' \item{ploidy}{the ploidy level} -#' \item{n.mrk}{number of markers} -#' \item{seq.num}{a vector containing the (ordered) indices of markers in the map, -#' according to the input file} -#' \item{mrk.names}{the names of markers in the map} -#' \item{seq.dose.p1}{a vector containing the dosage in parent 1 for all markers in the map} -#' \item{seq.dose.p2}{a vector containing the dosage in parent 2 for all markers in the map} -#' \item{chrom}{a vector indicating the sequence (usually chromosome) each marker belongs -#' as informed in the input file. If not available, -#' \code{chrom = NULL}} -#' \item{genome.pos}{physical position (usually in megabase) of the markers into the sequence} -#' \item{seq.ref}{reference base used for each marker (i.e. A, T, C, G). If not available, -#' \code{seq.ref = NULL}} -#' \item{seq.alt}{alternative base used for each marker (i.e. A, T, C, G). If not available, -#' \code{seq.ref = NULL}} -#' \item{chisq.pval}{a vector containing p-values of the chi-squared test of Mendelian -#' segregation for all markers in the map} -#' \item{data.name}{name of the dataset of class \code{mappoly.data}} -#' \item{ph.thres}{the LOD threshold used to define the linkage phase configurations to test} -#' -#' ii) a list of maps with possible linkage phase configuration. Each map in the list is also a -#' list containing -#' \item{seq.num}{a vector containing the (ordered) indices of markers in the map, -#' according to the input file} -#' \item{seq.rf}{a vector of size (\code{n.mrk - 1}) containing a sequence of recombination -#' fraction between the adjacent markers in the map} -#' \item{seq.ph}{linkage phase configuration for all markers in both parents} -#' \item{loglike}{the hmm-based multipoint likelihood} -#' -#' @examples -#' mrk.subset <- make_seq_mappoly(hexafake, 1:10) -#' red.mrk <- elim_redundant(mrk.subset) -#' unique.mrks <- make_seq_mappoly(red.mrk) -#' subset.pairs <- est_pairwise_rf(input.seq = unique.mrks, -#' ncpus = 1, -#' verbose = TRUE) -#' -#' ## Estimating subset map with a low tolerance for the E.M. procedure -#' ## for CRAN testing purposes -#' subset.map <- est_rf_hmm(input.seq = unique.mrks, -#' thres = 2, -#' twopt = subset.pairs, -#' verbose = TRUE, -#' tol = 0.1, -#' est.given.0.rf = FALSE) -#' subset.map -#' ## linkage phase configuration with highest likelihood -#' plot(subset.map, mrk.names = TRUE, config = "best") -#' ## the second one -#' plot(subset.map, mrk.names = TRUE, config = 2) -#' -#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} -#' -#' @references -#' Mollinari, M., and Garcia, A. A. F. (2019) Linkage -#' analysis and haplotype phasing in experimental autopolyploid -#' populations with high ploidy level using hidden Markov -#' models, _G3: Genes, Genomes, Genetics_. -#' https://doi.org/10.1534/g3.119.400378 -#' @rdname est_rf_hmm -#' @export est_rf_hmm -est_rf_hmm <- function(input.seq, - input.ph = NULL, - thres = 0.5, - twopt = NULL, - verbose = FALSE, - tol = 1e-04, - est.given.0.rf = FALSE, - reestimate.single.ph.configuration = TRUE, - high.prec = TRUE) { - - ## checking for correct object - input_classes <- c("mappoly.sequence", "two.pts.linkage.phases") - if (!inherits(input.seq, input_classes[1])) { - stop(deparse(substitute(input.seq)), " is not an object of class 'mappoly.sequence'") - } - if(length(input.seq$seq.num) == 1) - stop("Input sequence contains only one marker.", call. = FALSE) - if (is.null(input.ph)) { - if (is.null(twopt)) - stop("Two-point analysis not found!") - if (verbose) - cat("\nListing all configurations under threshold", thres, "using two-point information...\n") - input.ph <- ls_linkage_phases(input.seq = input.seq, thres = thres, twopt = twopt) - } - if (!inherits(input.ph, input_classes[2])) { - stop(deparse(substitute(input.ph)), " is not an object of class 'two.pts.linkage.phases'") - } - info.par <-detect_info_par(input.seq) - if(length(input.seq$seq.num) == 2 & !est.given.0.rf){ - maps <- vector("list", 1) - res.temp <- twopt$pairwise[[paste(sort(input.seq$seq.num), collapse = "-")]] - dp <- get(input.seq$data.name, pos = 1)$dosage.p1[input.seq$seq.num] - dq <- get(input.seq$data.name, pos = 1)$dosage.p2[input.seq$seq.num] - sh <- as.numeric(unlist(strsplit(rownames(res.temp)[1], split = "-"))) - res.ph <- list(P = c(list(0),list(0)), Q = c(list(0),list(0))) - if(dp[1] != 0) - res.ph$P[[1]] <- 1:dp[1] - if(dq[1] != 0) - res.ph$Q[[1]] <- 1:dq[1] - if(dp[2] != 0) - { - v <- (rev(res.ph$P[[1]])[1]+1):(dp[2]+rev(res.ph$P[[1]])[1]) - res.ph$P[[2]] <- v-sh[1] - } - if(dq[2] != 0) - { - v <- (rev(res.ph$Q[[1]])[1]+1):(dq[2]+rev(res.ph$Q[[1]])[1]) - res.ph$Q[[2]] <- v-sh[2] - } - names(res.ph$P) <- names(res.ph$Q) <- input.seq$seq.num - maps[[1]] <- list(seq.num = input.seq$seq.num, - seq.rf = res.temp[1,"rf"], - seq.ph = res.ph, - loglike = 0) - return(structure(list(info = list(ploidy = input.seq$ploidy, - n.mrk = length(input.seq$seq.num), - seq.num = input.seq$seq.num, - mrk.names = input.seq$seq.mrk.names, - seq.dose.p1 = get(input.seq$data.name, pos = 1)$dosage.p1[input.seq$seq.mrk.names], - seq.dose.p2 = get(input.seq$data.name, pos = 1)$dosage.p2[input.seq$seq.mrk.names], - chrom = get(input.seq$data.name, pos = 1)$chrom[input.seq$seq.mrk.names], - genome.pos = get(input.seq$data.name, pos = 1)$genome.pos[input.seq$seq.mrk.names], - seq.ref = get(input.seq$data.name, pos = 1)$seq.ref[input.seq$seq.mrk.names], - seq.alt = get(input.seq$data.name, pos = 1)$seq.alt[input.seq$seq.mrk.names], - chisq.pval = get(input.seq$data.name, pos = 1)$chisq.pval[input.seq$seq.mrk.names], - data.name = input.seq$data.name, - ph.thresh = abs(res.temp[2,1])), - maps = maps), - class = "mappoly.map")) - } - n.ph <- length(input.ph$config.to.test) - ret.map.no.rf.estimation <- FALSE - if(n.ph == 1 && !reestimate.single.ph.configuration) - ret.map.no.rf.estimation <- TRUE - maps <- vector("list", n.ph) - if (verbose){ - txt <- paste0(" ",n.ph, " phase(s): ") - cat(txt) - } - for (i in 1:n.ph) { - if (verbose) { - if((i+1)%%40 == 0) - cat("\n", paste0(rep(" ", nchar(txt)), collapse = ""), sep = "") - cat(". ") - } - if(est.given.0.rf){ - rf.temp <- rep(1e-5, length(input.seq$seq.num) - 1) - tol = 1 - } - else{ - rf.temp <- rep(0.001, length(input.seq$seq.num) - 1) - } - - ph <- list(P = input.ph$config.to.test[[i]]$P, - Q = input.ph$config.to.test[[i]]$Q) - - if(info.par == "both") - { - maps[[i]] <- est_rf_hmm_single_phase(input.seq = input.seq, - input.ph.single = ph, - rf.temp = rf.temp, - tol = tol, - verbose = FALSE, - ret.map.no.rf.estimation = ret.map.no.rf.estimation, - high.prec = high.prec) - } - else if (info.par == "p1") - { - maps[[i]] <- est_rf_hmm_single_phase_single_parent(input.seq, - input.ph.single = ph, - info.parent = 1, - uninfo.parent = 2, - rf.vec = rf.temp, - tol = tol, - verbose = FALSE, - ret.map.no.rf.estimation = ret.map.no.rf.estimation) - } - else if (info.par == "p2") - { - maps[[i]] <- est_rf_hmm_single_phase_single_parent(input.seq, - input.ph.single = ph, - info.parent = 2, - uninfo.parent = 1, - rf.vec = rf.temp, - tol = tol, - verbose = FALSE, - ret.map.no.rf.estimation = ret.map.no.rf.estimation) - - } - else stop("Should not get here.") - } - #cat("\n") - id <- order(sapply(maps, function(x) x$loglike), decreasing = TRUE) - maps <- maps[id] - if (verbose) - cat("\n") - structure(list(info = list(ploidy = input.seq$ploidy, - n.mrk = length(input.seq$seq.num), - seq.num = input.seq$seq.num, - mrk.names = input.seq$seq.mrk.names, - seq.dose.p1 = get(input.seq$data.name, pos = 1)$dosage.p1[input.seq$seq.mrk.names], - seq.dose.p2 = get(input.seq$data.name, pos = 1)$dosage.p2[input.seq$seq.mrk.names], - chrom = get(input.seq$data.name, pos = 1)$chrom[input.seq$seq.mrk.names], - genome.pos = get(input.seq$data.name, pos = 1)$genome.pos[input.seq$seq.mrk.names], - seq.ref = get(input.seq$data.name, pos = 1)$seq.ref[input.seq$seq.mrk.names], - seq.alt = get(input.seq$data.name, pos = 1)$seq.alt[input.seq$seq.mrk.names], - chisq.pval = get(input.seq$data.name, pos = 1)$chisq.pval[input.seq$seq.mrk.names], - data.name = input.seq$data.name, ph.thresh = input.ph$thres), - maps = maps), - class = "mappoly.map") -} - -#' Multipoint analysis using Hidden Markov Models: Sequential phase elimination -#' -#' Performs the multipoint analysis proposed by \cite{Mollinari and -#' Garcia (2019)} in a sequence of markers removing unlikely phases -#' using sequential multipoint information. -#' -#' This function sequentially includes markers into a map given an -#' ordered sequence. It uses two-point information to eliminate -#' unlikely linkage phase configurations given \code{thres.twopt}. The -#' search is made within a window of size \code{extend.tail}. For the -#' remaining configurations, the HMM-based likelihood is computed and -#' the ones that pass the HMM threshold (\code{thres.hmm}) are eliminated. -#' -#' @param input.seq an object of class \code{mappoly.sequence} -#' -#' @param twopt an object of class \code{mappoly.twopt} -#' containing the two-point information -#' -#' @param start.set number of markers to start the phasing procedure (default = 4) -#' -#' @param thres.twopt the LOD threshold used to determine if the linkage -#' phases compared via two-point analysis should be considered -#' for the search space reduction (A.K.A. \eqn{\eta} in -#' \cite{Mollinari and Garcia (2019)}, default = 5) -#' -#' @param thres.hmm the LOD threshold used to determine if the linkage -#' phases compared via hmm analysis should be evaluated in the -#' next round of marker inclusion (default = 50) -#' -#' @param extend.tail the length of the chain's tail that should -#' be used to calculate the likelihood of the map. If \code{NULL} (default), -#' the function uses all markers positioned. Even if \code{info.tail = TRUE}, -#' it uses at least \code{extend.tail} as the tail length -#' -#' @param phase.number.limit the maximum number of linkage phases of the sub-maps defined -#' by arguments \code{info.tail} and \code{extend.tail}. Default is 20. If the -#' size exceeds this limit, the marker will not be inserted. If -#' \code{Inf}, then it will insert all markers. -#' -#' @param sub.map.size.diff.limit the maximum accepted length -#' difference between the current and the previous sub-map defined -#' by arguments \code{info.tail} and \code{extend.tail}. If the -#' size exceeds this limit, the marker will not be inserted. If -#' \code{NULL}(default), then it will insert all markers. -#' -#' @param info.tail if \code{TRUE} (default), it uses the complete informative tail -#' of the chain (i.e. number of markers where all homologous -#' (\eqn{ploidy x 2}) can be distinguished) to calculate the map likelihood -#' -#'@param reestimate.single.ph.configuration logical. If \code{FALSE} (default) -#' returns a map without re-estimating the map parameters in cases -#' where there are only one possible linkage phase configuration -#' -#' @param tol the desired accuracy during the sequential phase (default = 10e-02) -#' -#' @param tol.final the desired accuracy for the final map (default = 10e-04) -#' -#' @param verbose If \code{TRUE} (default), current progress is shown; if -#' \code{FALSE}, no output is produced -#' -#' @param detailed.verbose If \code{TRUE}, the expansion of the current -#' submap is shown; -#' -#' @param high.prec logical. If \code{TRUE} uses high precision -#' (long double) numbers in the HMM procedure implemented in C++, -#' which can take a long time to perform (default = FALSE) -#' -#' @return A list of class \code{mappoly.map} with two elements: -#' -#' i) info: a list containing information about the map, regardless of the linkage phase configuration: -#' \item{ploidy}{the ploidy level} -#' \item{n.mrk}{number of markers} -#' \item{seq.num}{a vector containing the (ordered) indices of markers in the map, -#' according to the input file} -#' \item{mrk.names}{the names of markers in the map} -#' \item{seq.dose.p1}{a vector containing the dosage in parent 1 for all markers in the map} -#' \item{seq.dose.p2}{a vector containing the dosage in parent 2 for all markers in the map} -#' \item{chrom}{a vector indicating the sequence (usually chromosome) each marker belongs -#' as informed in the input file. If not available, -#' \code{chrom = NULL}} -#' \item{genome.pos}{physical position (usually in megabase) of the markers into the sequence} -#' \item{seq.ref}{reference base used for each marker (i.e. A, T, C, G). If not available, -#' \code{seq.ref = NULL}} -#' \item{seq.alt}{alternative base used for each marker (i.e. A, T, C, G). If not available, -#' \code{seq.ref = NULL}} -#' \item{chisq.pval}{a vector containing p-values of the chi-squared test of Mendelian -#' segregation for all markers in the map} -#' \item{data.name}{name of the dataset of class \code{mappoly.data}} -#' \item{ph.thres}{the LOD threshold used to define the linkage phase configurations to test} -#' -#' ii) a list of maps with possible linkage phase configuration. Each map in the list is also a -#' list containing -#' \item{seq.num}{a vector containing the (ordered) indices of markers in the map, -#' according to the input file} -#' \item{seq.rf}{a vector of size (\code{n.mrk - 1}) containing a sequence of recombination -#' fraction between the adjacent markers in the map} -#' \item{seq.ph}{linkage phase configuration for all markers in both parents} -#' \item{loglike}{the hmm-based multipoint likelihood} -#' -#' @examples -#' \donttest{ -#' mrk.subset <- make_seq_mappoly(hexafake, 1:20) -#' red.mrk <- elim_redundant(mrk.subset) -#' unique.mrks <- make_seq_mappoly(red.mrk) -#' subset.pairs <- est_pairwise_rf(input.seq = unique.mrks, -#' ncpus = 1, -#' verbose = TRUE) -#' subset.map <- est_rf_hmm_sequential(input.seq = unique.mrks, -#' thres.twopt = 5, -#' thres.hmm = 10, -#' extend.tail = 10, -#' tol = 0.1, -#' tol.final = 10e-3, -#' phase.number.limit = 5, -#' twopt = subset.pairs, -#' verbose = TRUE) -#' print(subset.map, detailed = TRUE) -#' plot(subset.map) -#' plot(subset.map, left.lim = 0, right.lim = 1, mrk.names = TRUE) -#' plot(subset.map, phase = FALSE) -#' -#' ## Retrieving simulated linkage phase -#' ph.P <- maps.hexafake[[1]]$maps[[1]]$seq.ph$P -#' ph.Q <- maps.hexafake[[1]]$maps[[1]]$seq.ph$Q -#' ## Estimated linkage phase -#' ph.P.est <- subset.map$maps[[1]]$seq.ph$P -#' ph.Q.est <- subset.map$maps[[1]]$seq.ph$Q -#' compare_haplotypes(ploidy = 6, h1 = ph.P[names(ph.P.est)], h2 = ph.P.est) -#' compare_haplotypes(ploidy = 6, h1 = ph.Q[names(ph.Q.est)], h2 = ph.Q.est) -#' } -#' -#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} -#' -#' @references -#' Mollinari, M., and Garcia, A. A. F. (2019) Linkage -#' analysis and haplotype phasing in experimental autopolyploid -#' populations with high ploidy level using hidden Markov -#' models, _G3: Genes, Genomes, Genetics_. -#' \doi{10.1534/g3.119.400378} -#' -#' @importFrom utils head -#' @importFrom cli rule -#' @export est_rf_hmm_sequential -est_rf_hmm_sequential <- function(input.seq, - twopt, - start.set = 4, - thres.twopt = 5, - thres.hmm = 50, - extend.tail = NULL, - phase.number.limit = 20, - sub.map.size.diff.limit = Inf, - info.tail = TRUE, - reestimate.single.ph.configuration = FALSE, - tol = 10e-2, - tol.final = 10e-4, - verbose = TRUE, - detailed.verbose = FALSE, - high.prec = FALSE) -{## checking for correct object - input_classes <- c("mappoly.sequence", "mappoly.twopt") - if (!inherits(input.seq, input_classes[1])) { - stop(deparse(substitute(input.seq)), - " is not an object of class 'mappoly.sequence'") - } - if (!inherits(twopt, input_classes[2])) { - stop(deparse(substitute(twopt)), - " is not an object of class 'mappoly.twopt'") - } - ## Information about the map - if(verbose) { - cli::cat_line("Number of markers: ", length(input.seq$seq.num)) - msg("Initial sequence", line = 2) - } - if(is.null(extend.tail)) - extend.tail <- length(input.seq$seq.num) - ##### Map in case of two markers ##### - if(length(input.seq$seq.num) == 2) - { - cur.map <- est_rf_hmm(input.seq = input.seq, thres = thres.twopt, - twopt = twopt, tol = tol.final, high.prec = high.prec, - verbose = verbose, - reestimate.single.ph.configuration = reestimate.single.ph.configuration) - return(filter_map_at_hmm_thres(cur.map, thres.hmm)) - } - ##### More than 3 markers #### - ##Starting sequential algorithm for maps with more than 3 markers - ## First step: test all possible phase configurations under - ## a 'thres.twopt' threshold for a sequence of size 'start.set' - if(start.set > length(input.seq$seq.num)) - start.set <- length(input.seq$seq.num) - if(verbose) cat(start.set, " markers...\n", sep = "") - cte <- 0 - rf.temp <- c(Inf, Inf) - while(any(rf.temp >= sub.map.size.diff.limit)) - { - cte <- cte+1 - if(length(rf.temp) == 1) { - ## cat("Impossible build a map using the given thresholds\n") - ## stop() - stop("Impossible build a map using the given thresholds\n") - } - if(verbose) cat(cli::symbol$bullet, " Trying sequence:", cte:(start.set+cte-1), ":\n") - cur.seq <- make_seq_mappoly(input.obj = get(input.seq$data.name, pos = 1), na.omit(input.seq$seq.num[cte:(start.set+cte-1)]), data.name = input.seq$data.name) - input.ph <- ls_linkage_phases(input.seq = cur.seq, - thres = thres.twopt, - twopt = twopt) - cur.map <- est_rf_hmm(input.seq = cur.seq, thres = thres.twopt, - twopt = twopt, tol = tol, high.prec = high.prec, - verbose = verbose, input.ph = input.ph, - reestimate.single.ph.configuration = reestimate.single.ph.configuration) - cur.map <- filter_map_at_hmm_thres(cur.map, thres.hmm) - cur.map <- reest_rf(cur.map, tol = tol.final, verbose = FALSE) - rf.temp <- imf_h(cur.map$maps[[1]]$seq.rf) - } - if(verbose) - msg("Done with initial sequence", line = 2) - if(start.set >= length(input.seq$seq.num)){ - if(verbose) cat("\nDone phasing", cur.map$info$n.mrk, "markers\nReestimating final recombination fractions.") - return(reest_rf(cur.map, tol = tol.final)) - } - ##### More than 'start.set' markers ##### - ## For maps with more markers than 'start.set', include - ## next makers in a sequential fashion - ct <- start.set+cte - all.ph <- update_ph_list_at_hmm_thres(cur.map, thres.hmm) - if(sub.map.size.diff.limit != Inf & !reestimate.single.ph.configuration){ - if (verbose) message("Making 'reestimate.single.ph.configuration = TRUE' to use map expansion") - reestimate.single.ph.configuration <- TRUE - } - while(ct <= length(input.seq$seq.num)) - { - ## Number of multipoint phases evaluated in the - ## previous round of marker insertion - hmm.phase.number <- length(cur.map$maps) - ## Get the map tail containing sufficient markers to - ## distinguish among the ploidy*2 possible homologs. - if(info.tail) - tail.temp <- get_full_info_tail(input.obj = cur.map) - input.ph <- NULL - ## for each linkage phase inherited from HMM - ## analysis from previous round, evaluate the - ## possible linkage phases for the marker inserted - ## in the current round using two-point information - ## under the threshold of thres.twopt - for(i in 1:length(all.ph$config.to.test)) - { - seq.num <- NULL - if(info.tail) - seq.num <- tail.temp$maps[[i]]$seq.num - if(length(seq.num) < extend.tail) - seq.num <- tail(cur.map$maps[[i]]$seq.num, extend.tail) - ## If the tail do not contain the marker responsible for carrying - ## multiple linkage phases through the rounds of insertion, - ## extend the tail so it contains that marker - if(max(check_ls_phase(all.ph)) >= length(seq.num)){ - seq.num <- tail(cur.map$maps[[i]]$seq.num, max(check_ls_phase(all.ph)) + start.set) - } - seq.cur <- make_seq_mappoly(input.obj = get(input.seq$data.name, pos = 1), - arg = seq.num, - data.name = input.seq$data.name) - all.ph.temp <- get_ph_list_subset(all.ph, seq.num, i) - input.ph.temp <- ls_linkage_phases(input.seq = seq.cur, - thres = thres.twopt, - twopt = twopt, - mrk.to.add = input.seq$seq.num[ct], - prev.info = all.ph.temp) - input.ph <- concatenate_ph_list(ph.list.1 = input.ph, ph.list.2 = input.ph.temp) - } - ## This is the number of linkage phases to be tested - ## combining HMM phases from previous round of mrk - ## insertion and the linkage phases from the two-point - ## information obtained in the current round - twopt.phase.number <- length(input.ph$config.to.test) - ## If this number is higher than phase.number.limit, - ## proceed to the next iteration - if(length(input.ph$config.to.test) > phase.number.limit) { - if(verbose) - cat(crayon::italic$yellow(paste0(ct ,": not included (*linkage phases*)\n", sep = ""))) - ct <- ct + 1 - next() - } - ## Appending the marker to the numeric - ## sequence and making a new sequence - seq.num <- c(seq.num, input.seq$seq.num[ct]) - seq.cur <- make_seq_mappoly(input.obj = get(input.seq$data.name, pos = 1), - arg = seq.num, - data.name = input.seq$data.name) - if(verbose) - cat_phase(input.seq, input.ph, all.ph, ct, seq.num, twopt.phase.number, hmm.phase.number) - ## Evaluation all maps using HMM - cur.map.temp <- est_rf_hmm(input.seq = seq.cur, - input.ph = input.ph, - twopt = twopt, - tol = tol, - verbose = FALSE, - reestimate.single.ph.configuration = reestimate.single.ph.configuration, - high.prec = high.prec) - cur.map.temp$maps <- unique(cur.map.temp$maps) - ## Filtering linkage phase configurations under a HMM LOD threshold - cur.map.temp <- filter_map_at_hmm_thres(cur.map.temp, thres.hmm) - - ## Gathering linkage phases of the current map, excluding the marker inserted - ## in the current round - ph.new <- lapply(cur.map.temp$maps, function(x) list(P = head(x$seq.ph$P, -1), - Q = head(x$seq.ph$Q, -1))) - ## Gathering linkage phases of the previous map, excluding the marker inserted - ## in the current round - ph.old <- lapply(cur.map$maps, function(x, id) list(P = x$seq.ph$P[id], - Q = x$seq.ph$Q[id]), id = names(ph.new[[1]]$P)) - - ## Check in which whole phase configurations the new - ## HMM tail should be appended - MQ <- MP <- matrix(NA, length(ph.old), length(ph.new)) - for(j1 in 1:nrow(MP)){ - for(j2 in 1:ncol(MP)){ - MP[j1, j2] <- identical(ph.old[[j1]]$P, ph.new[[j2]]$P) - MQ[j1, j2] <- identical(ph.old[[j1]]$Q, ph.new[[j2]]$Q) - } - } - M <- which(MP & MQ, arr.ind = TRUE) - colnames(M) <- c("old", "new") - ## Checking for quality (map size and LOD Score) - submap.length.old <- sapply(cur.map$maps, function(x) sum(imf_h(x$seq.rf)))[M[,1]] - last.dist.old <- sapply(cur.map$maps, function(x) tail(imf_h(x$seq.rf),1))[M[,1]] - submap.length.new <- sapply(cur.map.temp$maps, function(x) sum(imf_h(x$seq.rf)))[M[,2]] - last.dist.new <- sapply(cur.map.temp$maps, function(x) tail(imf_h(x$seq.rf),1))[M[,2]] - last.mrk.expansion <- submap.expansion <- numeric(nrow(M)) - submap.expansion <- submap.length.new - submap.length.old - last.mrk.expansion <- last.dist.new - last.dist.old - - - cur.map.temp$maps <- cur.map.temp$maps[M[,2]] - LOD <- get_LOD(cur.map.temp, sorted = FALSE) - if(sub.map.size.diff.limit != Inf){ - selected.map <- submap.expansion < sub.map.size.diff.limit & LOD < thres.hmm - if(detailed.verbose){ - x <- round(cbind(submap.length.new, submap.expansion, last.mrk.expansion),2) - for(j1 in 1:nrow(x)){ - cat(" ", x[j1,1], ": (", x[j1,2], "/", x[j1,3],")", sep = "") - if(j1 != nrow(x)) cat("\n") - } - if(all(!selected.map)) cat(paste0(crayon::red(cli::symbol$cross), "\n")) - else cat(paste0(crayon::green(cli::symbol$tick), "\n")) - } - if(all(!selected.map)){ - if(verbose) cat(crayon::italic$yellow(paste0(ct ,": not included (~map extension~)\n", sep = ""))) - ct <- ct + 1 - next() - } - } else { selected.map <- LOD < thres.hmm } - all.ph.temp <- update_ph_list_at_hmm_thres(cur.map.temp, Inf) - id <- which(selected.map & which(!sapply(cur.map.temp$maps, is.null))) - if(length(id) == 0){ - if(verbose) cat(crayon::italic$yellow(paste0(ct ,": not included (-linkage phases-)\n", sep = ""))) - ct <- ct + 1 - next() - } - cur.map.temp$maps <- cur.map.temp$maps[id] - if(any(unique(M[id,2,drop = FALSE]) > length(all.ph.temp$config.to.test))){ - if(verbose) cat(crayon::italic$yellow(paste0(ct ,": not included (-linkage phases-)\n", sep = ""))) - ct <- ct + 1 - next() - } - if(length(id) > nrow(M)) - all.ph.temp <- add_mrk_at_tail_ph_list(all.ph, all.ph.temp, M[,,drop = FALSE]) - else if(length(id) == nrow(M)){ - M[,2] <- id - all.ph.temp <- add_mrk_at_tail_ph_list(all.ph, all.ph.temp, M[,,drop = FALSE]) - } else { - all.ph.temp <- add_mrk_at_tail_ph_list(all.ph, all.ph.temp, M[id,,drop = FALSE]) - } - all.ph <- all.ph.temp - cur.map <- cur.map.temp - ct <- ct + 1 - } - ##### Re-estimating final map #### - ## Re-estimating final map with higher tolerance - seq.final <- make_seq_mappoly(input.obj = get(input.seq$data.name, pos = 1), - arg = as.numeric(names(all.ph$config.to.test[[1]]$P)), - data.name = input.seq$data.name) - #msg(paste("Done phasing", length(seq.final$seq.num), "markers")) - if (verbose){ - msg("Reestimating final recombination fractions", line = 2) - cat("") - } - final.map <- est_rf_hmm(input.seq = seq.final, - input.ph = all.ph, - twopt = twopt, - tol = tol.final, - verbose = FALSE, - reestimate.single.ph.configuration = TRUE, - high.prec = TRUE) - if(verbose) { - #cat("\n------------------------------------------") - cat("Markers in the initial sequence: ", length(input.seq$seq.num), sep = "") - cat("\nMapped markers : ", final.map$info$n.mrk, " (", - round(100*final.map$info$n.mrk/length(input.seq$seq.num),1) ,"%)\n", sep = "") - msg("", line = 2) - } - return(final.map) -} - -#' @rdname est_rf_hmm -#' @export -print.mappoly.map <- function(x, detailed = FALSE, ...) { - cat("This is an object of class 'mappoly.map'\n") - cat(" Ploidy level:\t", x$info$ploidy, "\n") - cat(" No. individuals:\t", get(x$info$data.name)$n.ind, "\n") - cat(" No. markers:\t", x$info$n.mrk, "\n") - cat(" No. linkage phases:\t", length(x$maps), "\n") - l <- sapply(x$maps, function(x) x$loglike) - LOD <- round(l - max(l), 2) - if(detailed) - { - for (j in 1:length(x$maps)) { - cat("\n ---------------------------------------------\n") - cat(" Linkage phase configuration: ", j) - cat("\n log-likelihood:\t", x$map[[j]]$loglike) - cat("\n LOD:\t\t", format(round(LOD[j], 2), digits = 2)) - cat("\n\n") - M <- matrix("|", x$info$n.mrk, x$info$ploidy * 2) - M <- cbind(M, " ") - M <- cbind(M, format(round(cumsum(c(0, imf_h(x$maps[[j]]$seq.rf))), 1), digits = 1)) - for (i in 1:x$info$n.mrk) { - if (all(x$maps[[j]]$seq.ph$Q[[i]] != 0)) - M[i, c(x$maps[[j]]$seq.ph$P[[i]], x$maps[[j]]$seq.ph$Q[[i]] + x$info$ploidy)] <- "o" else M[i, x$maps[[j]]$seq.ph$P[[i]]] <- "o" - } - M <- cbind(get(x$info$data.name)$mrk.names[x$maps[[j]]$seq.num], M) - big.name <- max(nchar(M[,1])) - format_name <- function(y, big.name){ - paste0(y, paste0(rep(" ", big.name-nchar(y)), collapse = "")) - } - M <- rbind(c("", letters[1:(ncol(M)-3)], "", ""), M) - format(apply(M, 1, function(y) cat(c("\t", format_name(y[1], big.name), "\t", y[2:(x$info$ploidy + 1)], rep(" ", 4), y[(x$info$ploidy + 2):(x$info$ploidy * 2 + 3)], "\n"), collapse = ""))) - } - } else { - cat("\n ---------------------------------------------\n") - cat(" Number of linkage phase configurations: ", length(x$maps)) - cat("\n ---------------------------------------------\n") - for (j in 1:length(x$maps)) { - cat(" Linkage phase configuration: ", j) - cat("\n map length:\t", format(round(sum(imf_h(x$map[[j]]$seq.rf)), 2))) - cat("\n log-likelihood:\t", format(round(x$map[[j]]$loglike, 2))) - cat("\n LOD:\t\t", format(round(LOD[j], 2), digits = 2)) - cat("\n ~~~~~~~~~~~~~~~~~~\n") - } - } -} - -#' @rdname est_rf_hmm -#' @importFrom grDevices rgb -#' @importFrom graphics rect -#' @export -plot.mappoly.map <- function(x, left.lim = 0, right.lim = Inf, - phase = TRUE, mrk.names = FALSE, - cex = 1, config = "best", P = "Parent 1", - Q = "Parent 2", xlim = NULL, ...) { - oldpar <- par(no.readonly = TRUE) - on.exit(par(oldpar)) - if(phase){ - map.info <- prepare_map(x, config) - if(any(map.info$ph.p == "B")){ - var.col <- c("black", "darkgray") - var.col <- var.col[1:2] - names(var.col) <- c("A", "B") - } else { - var.col <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3") - names(var.col) <- c("A", "T", "C", "G") - } - ploidy <- map.info$ploidy - if(ploidy == 2) { - d.col <- c(NA, "#1B9E77", "#D95F02") - }else if(ploidy == 4){ - d.col <- c(NA, "#1B9E77", "#D95F02", "#7570B3", "#E7298A") - }else if(ploidy == 6){ - d.col <- c(NA, "#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02") - }else if(ploidy == 8){ - d.col <- c(NA, "#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666") - } else d.col <- c(NA, gg_color_hue(ploidy)) - names(d.col) <- 0:ploidy - d.col[1] <- NA - x <- map.info$map - lab <- names(x) - zy <- seq(0, 0.5, length.out = ploidy) + 1.5 - pp <- map.info$ph.p - pq <- map.info$ph.q - dp <- map.info$dp - dq <- map.info$dq - x1 <- abs(left.lim - x) - x2 <- abs(right.lim - x) - id.left <- which(x1 == min(x1))[1] - id.right <- rev(which(x2 == min(x2)))[1] - par(mai = c(1,0.15,0,0), mar = c(4.5,2,1,2)) - curx <- x[id.left:id.right] - layout(mat = matrix(c(4,2,3, 1), ncol = 2), heights = c(2, 10), widths = c(1, 10)) - if(is.null(xlim)) - xlim <- range(curx) - plot(x = curx, - y = rep(.5,length(curx)), - type = "n" , - ylim = c(.25, 4.5), - axes = FALSE, - xlab = "Distance (cM)", - ylab = "", - xlim = xlim) - lines(c(x[id.left], x[id.right]), c(.5, .5), lwd = 15, col = "gray") - points(x = curx, - y = rep(.5,length(curx)), - xlab = "", ylab = "", - pch = "|", cex = 1.5, - ylim = c(0,2)) - axis(side = 1) - diplocol1 <- c("#4DAF4A", "#984EA3") - diplocol2 <- c("#E41A1C", "#377EB8") - #Parent 2 - x1 <- seq(x[id.left], x[id.right], length.out = length(curx)) - x.control <- diff(x1[1:2])/2 - if(length(x1) < 150) - x.control <- x.control * .8 - if(length(x1) < 100) - x.control <- x.control * .8 - if(length(x1) < 75) - x.control <- x.control * .8 - if(length(x1) < 50) - x.control <- x.control * .8 - if(length(x1) < 25) - x.control <- x.control * .8 - for(i in 1:ploidy) - { - if(ploidy == 2 && any(map.info$ph.p == "B")){ - lines(range(x1), c(zy[i], zy[i]), lwd = 15, col = diplocol1[i]) - } else { - lines(range(x1), c(zy[i], zy[i]), lwd = 8, col = "gray") - } - y1 <- rep(zy[i], length(curx)) - pal <- var.col[pq[id.left:id.right,i]] - rect(xleft = x1 - x.control, ybottom = y1 -.05, xright = x1 + x.control, ytop = y1 +.05, col = pal, border = NA) - } - #connecting allelic variants to markers - for(i in 1:length(x1)) - lines(c(curx[i], x1[i]), c(0.575, zy[1]-.05), lwd = 0.2) - points(x = x1, - y = zy[ploidy]+0.05+dq[id.left:id.right]/20, - col = d.col[as.character(dq[id.left:id.right])], - pch = 19, cex = .7) - #Parent 1 - zy <- zy + 1.1 - for(i in 1:ploidy) - { - if(ploidy == 2 && any(map.info$ph.p == "B")){ - lines(range(x1), c(zy[i], zy[i]), lwd = 12, col = diplocol2[i]) - } else { - lines(range(x1), c(zy[i], zy[i]), lwd = 8, col = "gray") - } - y1 <- rep(zy[i], length(curx)) - pal <- var.col[pp[id.left:id.right,i]] - rect(xleft = x1 - x.control, ybottom = y1 -.05, xright = x1 + x.control, ytop = y1 +.05, col = pal, border = NA) - } - points(x = x1, - y = zy[ploidy]+0.05+dp[id.left:id.right]/20, - col = d.col[as.character(dp[id.left:id.right])], - pch = 19, cex = .7) - if(mrk.names) - text(x = x1, - y = rep(zy[ploidy]+0.05+.3, length(curx)), - labels = names(curx), - srt = 90, adj = 0, cex = cex) - par(mar = c(4.5,1,1,0), xpd = TRUE) - plot(x = 0, - y = 0, - type = "n" , - axes = FALSE, - ylab = "", - xlab = "", - ylim = c(.25, 4.5)) - zy <- zy - 1.1 - mtext(text = Q, side = 4, at = mean(zy), line = -1, font = 4) - for(i in 1:ploidy) - mtext(letters[(ploidy+1):(2*ploidy)][i], line = 0, at = zy[i], side = 4) - zy <- zy + 1.1 - mtext(text = P, side = 4, at = mean(zy), line = -1, font = 4) - for(i in 1:ploidy) - mtext(letters[1:ploidy][i], line = 0, at = zy[i], side = 4) - par(mar = c(0,1,2,4), xpd = FALSE) - plot(x = curx, - y = rep(.5,length(curx)), - type = "n" , - axes = FALSE, - xlab = "", - ylab = "") - if(any(map.info$ph.p == "B")){ - legend("bottomleft", legend = c("A", "B"), - fill = c(var.col), title = "Variants", - box.lty = 0, bg = "transparent", ncol = 2) - } else { - legend("bottomleft", legend = c("A", "T", "C", "G", "-"), - fill = c(var.col, "white"), title = "Nucleotides", - box.lty = 0, bg = "transparent", ncol = 4) - } - legend("bottomright", legend = names(d.col)[-1], title = "Doses" , - col = d.col[-1], ncol = ploidy/2, pch = 19, - box.lty = 0) - } else { - plot_map_list(x) - } -} - -#' prepare maps for plot -#' @param void internal function to be documented -#' @keywords internal -prepare_map <- function(input.map, config = "best"){ - if (!inherits(input.map, "mappoly.map")) { - stop(deparse(substitute(input.map)), " is not an object of class 'mappoly.map'") - } - ## Choosing the linkage phase configuration - LOD.conf <- get_LOD(input.map, sorted = FALSE) - if(config == "best") { - i.lpc <- which.min(LOD.conf) - } else if(config == "all"){ - i.lpc <- seq_along(LOD.conf) } else if (config > length(LOD.conf)) { - stop("invalid linkage phase configuration") - } else i.lpc <- config - ## Gathering marker positions - map <- cumsum(imf_h(c(0, input.map$maps[[i.lpc]]$seq.rf))) - names(map) <- input.map$info$mrk.names - ## - ph.p <- ph_list_to_matrix(input.map$maps[[i.lpc]]$seq.ph$P, input.map$info$ploidy) - ph.q <- ph_list_to_matrix(input.map$maps[[i.lpc]]$seq.ph$Q, input.map$info$ploidy) - dimnames(ph.p) <- list(names(map), letters[1:input.map$info$ploidy]) - dimnames(ph.q) <- list(names(map), letters[(1+input.map$info$ploidy):(2*input.map$info$ploidy)]) - if(is.null(input.map$info$seq.alt)) - { - ph.p[ph.p == 1] <- ph.q[ph.q == 1] <- "A" - ph.p[ph.p == 0] <- ph.q[ph.q == 0] <- "B" - } else { - for(i in input.map$info$mrk.names){ - ph.p[i, ph.p[i,] == 1] <- input.map$info$seq.alt[i] - ph.p[i, ph.p[i,] == 0] <- input.map$info$seq.ref[i] - ph.q[i, ph.q[i,] == 1] <- input.map$info$seq.alt[i] - ph.q[i, ph.q[i,] == 0] <- input.map$info$seq.ref[i] - } - } - dp <- input.map$info$seq.dose.p1 - dq <- input.map$info$seq.dose.p2 - list(ploidy = input.map$info$ploidy, map = map, ph.p = ph.p, ph.q = ph.q, dp = dp, dq = dq) -} - -#' Get the tail of a marker sequence up to the point where the markers -#' provide no additional information. -#' -#' @param void internal function to be documented -#' @keywords internal -get_full_info_tail <- function(input.obj, extend = NULL) { - ## checking for correct object - if(!inherits(input.obj, "mappoly.map")) - stop(deparse(substitute(input.obj)), " is not an object of class 'mappoly.map''") - if (!is.null(extend)) - if (extend > input.obj$info$n.mrk) - return(input.obj) - ploidy <- input.obj$info$ploidy - i <- ploidy/2 - w1 <- ph_list_to_matrix(input.obj$maps[[1]]$seq.ph$P, ploidy) - max1 <- length(unique(apply(w1, 2, paste0, collapse = ""))) - w2 <- ph_list_to_matrix(input.obj$maps[[1]]$seq.ph$Q, ploidy) - max2 <- length(unique(apply(w2, 2, paste0, collapse = ""))) - while (i < input.obj$info$n.mrk) { - wp <- ph_list_to_matrix(tail(input.obj$maps[[1]]$seq.ph$P, i), ploidy) - xp <- apply(wp, 2, paste, collapse = "-") - wq <- ph_list_to_matrix(tail(input.obj$maps[[1]]$seq.ph$Q, i), ploidy) - xq <- apply(wq, 2, paste, collapse = "-") - if (length(unique(xp)) == max1 && length(unique(xq)) == max2) - break() - i <- i + 1 - } - if (!is.null(extend)) - if (i < extend) - i <- extend - input.obj$info$n.mrk <- i - - - - input.obj$info$seq.num <- tail(input.obj$info$seq.num, i) - input.obj$info$mrk.names <- tail(input.obj$info$mrk.names, i) - input.obj$info$seq.dose.p1 <- tail(input.obj$info$seq.dose.p1, i) - input.obj$info$seq.dose.p2 <- tail(input.obj$info$seq.dose.p2, i) - input.obj$info$chrom <- tail(input.obj$info$chrom, i) - input.obj$info$genome.pos <- tail(input.obj$info$genome.pos, i) - input.obj$info$chisq.pval <- tail(input.obj$info$chisq.pval, i) - - for (j in 1:length(input.obj$maps)) { - input.obj$maps[[j]]$loglike <- 0 - input.obj$maps[[j]]$seq.ph$P <- tail(input.obj$maps[[j]]$seq.ph$P, n = i) - input.obj$maps[[j]]$seq.ph$Q <- tail(input.obj$maps[[j]]$seq.ph$Q, n = i) - input.obj$maps[[j]]$seq.num <- tail(input.obj$maps[[j]]$seq.num, n = i) - input.obj$maps[[j]]$seq.rf <- tail(input.obj$maps[[j]]$seq.rf, n = (i - 1)) - } - return(input.obj) -} - -#' remove maps under a certain threshold -#' @param void internal function to be documented -#' @keywords internal -#' @export filter_map_at_hmm_thres -filter_map_at_hmm_thres <- function(map, thres.hmm){ - map$info$ph.thresh <- NULL - map$maps <- map$maps[get_LOD(map, sorted = FALSE) < thres.hmm] - map -} - -#' makes a phase list from map, selecting only -#' configurations under a certain threshold -#' @param void internal function to be documented -#' @keywords internal -update_ph_list_at_hmm_thres <- function(map, thres.hmm){ - temp.map <- filter_map_at_hmm_thres(map, thres.hmm) - config.to.test <- lapply(temp.map$maps, function(x) x$seq.ph) - structure(list(config.to.test = config.to.test, - ploidy = map$info$ploidy, seq.num = map$maps[[1]]$seq.num, - thres = map$info$ph.thresh, data.name = map$info$data.name, - thres.hmm = thres.hmm), - class = "two.pts.linkage.phases") -} - -#' subset of a linkage phase list -#' @param void internal function to be documented -#' @keywords internal -get_ph_list_subset <- function(ph.list, seq.num, conf){ - config.to.test <- list(lapply(ph.list$config.to.test[[conf]], function(x, seq.num) x[as.character(seq.num)], seq.num)) - structure(list(config.to.test = config.to.test, - ploidy = ph.list$ploidy, seq.num = ph.list$seq.num, - thres = ph.list$ph.thresh, data.name = ph.list$data.name, - thres.hmm = ph.list$thres.hmm), - class = "two.pts.linkage.phases") -} - -#' concatenate two linkage phase lists -#' @param void internal function to be documented -#' @keywords internal -concatenate_ph_list <- function(ph.list.1, ph.list.2){ - if(length(ph.list.1) == 0) - return(ph.list.2) - config.to.test <- c(ph.list.1$config.to.test, ph.list.2$config.to.test) - id <- which(!duplicated(config.to.test)) - config.to.test <- config.to.test[id] - structure(list(config.to.test = config.to.test, - ploidy = ph.list.1$ploidy, seq.num = ph.list.1$seq.num, - thres = ph.list.1$ph.thresh, data.name = ph.list.1$data.name, - thres.hmm = ph.list.1$thres.hmm), - class = "two.pts.linkage.phases") -} - -#' add a single marker at the tail of a linkage phase list -#' @param void internal function -#' @keywords internal -add_mrk_at_tail_ph_list <- function(ph.list.1, ph.list.2, cor.index){ - config.to.test <- vector("list", length = nrow(cor.index)) - for(i in 1:nrow(cor.index)){ - config.to.test[[i]] <- list(P = c(ph.list.1$config.to.test[[cor.index[i,1]]]$P, tail(ph.list.2$config.to.test[[cor.index[i,2]]]$P,1)), - Q = c(ph.list.1$config.to.test[[cor.index[i,1]]]$Q, tail(ph.list.2$config.to.test[[cor.index[i,2]]]$Q,1))) - } - structure(list(config.to.test = config.to.test, - ploidy = ph.list.1$ploidy, seq.num = ph.list.1$seq.num, - thres = ph.list.1$ph.thresh, data.name = ph.list.1$data.name, - thres.hmm = ph.list.1$thres.hmm), - class = "two.pts.linkage.phases") -} - -#' Compare a list of linkage phases and return the -#' markers for which they are different. -#' @param void internal function to be documented -#' @keywords internal -check_ls_phase <- function(ph){ - if(length(ph$config.to.test) == 1) return(0) - id <- rep(1, length(ph$config.to.test[[1]]$P)) - for(i in 1:length(ph$config.to.test[[1]]$P)){ - w <- ph$config.to.test[[1]]$P[i] - for(j in 2:length(ph$config.to.test)) - id[i] <- id[i] + identical(w, ph$config.to.test[[j]]$P[i]) - } - names(id) <- names((ph$config.to.test[[1]]$P)) - w <- length(ph$config.to.test[[1]]$P) - which(id < length(ph$config.to.test)) - if(length(w) == 0) return(0) - w -} - -#' cat for graphical representation of the phases -#' @param void internal function to be documented -#' @keywords internal -print_ph <- function(input.ph){ - phs.P <- lapply(input.ph$config.to.test, - function(x, ploidy) { - M <- matrix("|", nrow = 1, ncol = ploidy) - M[unlist(tail(x$P, 1))] <- crayon::red(cli::symbol$bullet) - paste(M, collapse = "")}, - ploidy = input.ph$ploidy) - phs.Q <- lapply(input.ph$config.to.test, - function(x, ploidy) { - M <- matrix("|", nrow = 1, ncol = ploidy) - M[unlist(tail(x$Q, 1))] <- crayon::cyan(cli::symbol$bullet) - paste(M, collapse = "")}, - ploidy = input.ph$ploidy) - if(length(phs.P) == 1) - return(paste(unlist(phs.P)[1], unlist(phs.Q)[1], " ")) - if(length(phs.P) == 2) - return(paste(unlist(phs.P)[1], unlist(phs.Q)[1], " ", unlist(phs.P)[2], unlist(phs.Q)[2])) - if(length(phs.P) > 2) - return(paste(unlist(phs.P)[1], unlist(phs.Q)[1], " ... ", unlist(phs.P)[2], unlist(phs.Q)[2])) -} - -#' cat for phase information -#' @param void internal function to be documented -#' @keywords internal -cat_phase <- function(input.seq, - input.ph, - all.ph, - ct, - seq.num, - twopt.phase.number, - hmm.phase.number){ - pc <- round(100*ct/length(input.seq$seq.num),1) - xmax11 <- nchar(length(input.seq$seq.num)) - x11 <- ct - x11 <- paste0(x11, paste(rep(" ", xmax11-nchar(x11)), collapse = "")) - x12 <- length(all.ph$config.to.test[[1]]$P) + 1 - x12 <- paste0(x12, paste(rep(" ", xmax11-nchar(x12)), collapse = "")) - x13 <- pc - x13 <- paste0(":(",pc,"%)", paste(rep(" ", 5-nchar(x13)), collapse = "")) - x1 <- paste0(x12,"/",x11, x13) - xmax21 <- nchar(max(input.seq$seq.num)) - x21 <- input.seq$seq.num[ct] - x21 <- paste0(x21, paste(rep(" ", xmax21-nchar(x21)), collapse = "")) - x22 <- length(input.ph$config.to.test) - x22 <- paste0(x22, " ph ", paste(rep(" ", 5-nchar(x22)), collapse = "")) - x23 <- paste0("(", hmm.phase.number, "/", twopt.phase.number,") ") - x23 <- paste0(x23, paste(rep(" ", 10-nchar(x23)), collapse = "")) - x2 <- paste0( x21, ": ", x22, x23) - x31 <- length(seq.num)-1 - x31 <- paste0(x31, paste(rep(" ", xmax11-nchar(x31)), collapse = "")) - x3 <- paste0(" -- tail: ", x31) - cat(x1, x2, x3, print_ph(input.ph), "\n") -} +#' Multipoint analysis using Hidden Markov Models in autopolyploids +#' +#' Performs the multipoint analysis proposed by \cite{Mollinari and +#' Garcia (2019)} in a sequence of markers +#' +#' This function first enumerates a set of linkage phase configurations +#' based on two-point recombination fraction information using a threshold +#' provided by the user (argument \code{thresh}). After that, for each +#' configuration, it reconstructs the genetic map using the +#' HMM approach described in Mollinari and Garcia (2019). As result, it returns +#' the multipoint likelihood for each configuration in form of LOD Score comparing +#' each configuration to the most likely one. It is recommended to use a small number +#' of markers (e.g. 50 markers for hexaploids) since the possible linkage +#' phase combinations bounded only by the two-point information can be huge. +#' Also, it can be quite sensible to small changes in \code{'thresh'}. +#' For a large number of markers, please see \code{\link[mappoly]{est_rf_hmm_sequential}}. +# +#' @param input.seq an object of class \code{mappoly.sequence} +#' +#' @param input.ph an object of class \code{two.pts.linkage.phases}. +#' If not available (default = NULL), it will be computed +#' +#' @param thres LOD Score threshold used to determine if the linkage phases +#' compared via two-point analysis should be considered. Smaller +#' values will result in smaller number of linkage phase +#' configurations to be evaluated by the multipoint algorithm. +#' +#' @param twopt an object of class \code{mappoly.twopt} +#' containing two-point information +#' +#' @param verbose if \code{TRUE}, current progress is shown; if +#' \code{FALSE} (default), no output is produced +#' +#' @param tol the desired accuracy (default = 1e-04) +#' +#' @param est.given.0.rf logical. If TRUE returns a map forcing all +#' recombination fractions equals to 0 (1e-5, for internal use only. +#' Default = FALSE) +#' +#' @param reestimate.single.ph.configuration logical. If \code{TRUE} +#' returns a map without re-estimating the map parameters for cases +#' where there is only one possible linkage phase configuration. +#' This argument is intended to be used in a sequential map construction +#' +#' @param high.prec logical. If \code{TRUE} (default) uses high precision +#' long double numbers in the HMM procedure +#' +#' @param x an object of the class \code{mappoly.map} +#' +#' @param detailed logical. if TRUE, prints the linkage phase configuration and the marker +#' position for all maps. If FALSE (default), prints a map summary +#' +#' @param left.lim the left limit of the plot (in cM, default = 0). +#' +#' @param right.lim the right limit of the plot (in cM, default = Inf, i.e., +#' will print the entire map) +#' +#' @param phase logical. If \code{TRUE} (default) plots the phase configuration +#' for both parents +#' +#' @param mrk.names if TRUE, marker names are displayed (default = FALSE) +#' +#' @param cex The magnification to be used for marker names +#' +#' @param config should be \code{'best'} or the position of the +#' configuration to be plotted. If \code{'best'}, plot the configuration +#' with the highest likelihood +#' +#' @param P a string containing the name of parent P +#' +#' @param Q a string containing the name of parent Q +#' +#' @param xlim range of the x-axis. If \code{xlim = NULL} (default) it uses the +#' map range. +#' +#' @param ... currently ignored +#' +#' @return A list of class \code{mappoly.map} with two elements: +#' +#' i) info: a list containing information about the map, regardless of the linkage phase configuration: +#' \item{ploidy}{the ploidy level} +#' \item{n.mrk}{number of markers} +#' \item{seq.num}{a vector containing the (ordered) indices of markers in the map, +#' according to the input file} +#' \item{mrk.names}{the names of markers in the map} +#' \item{seq.dose.p1}{a vector containing the dosage in parent 1 for all markers in the map} +#' \item{seq.dose.p2}{a vector containing the dosage in parent 2 for all markers in the map} +#' \item{chrom}{a vector indicating the sequence (usually chromosome) each marker belongs +#' as informed in the input file. If not available, +#' \code{chrom = NULL}} +#' \item{genome.pos}{physical position (usually in megabase) of the markers into the sequence} +#' \item{seq.ref}{reference base used for each marker (i.e. A, T, C, G). If not available, +#' \code{seq.ref = NULL}} +#' \item{seq.alt}{alternative base used for each marker (i.e. A, T, C, G). If not available, +#' \code{seq.ref = NULL}} +#' \item{chisq.pval}{a vector containing p-values of the chi-squared test of Mendelian +#' segregation for all markers in the map} +#' \item{data.name}{name of the dataset of class \code{mappoly.data}} +#' \item{ph.thres}{the LOD threshold used to define the linkage phase configurations to test} +#' +#' ii) a list of maps with possible linkage phase configuration. Each map in the list is also a +#' list containing +#' \item{seq.num}{a vector containing the (ordered) indices of markers in the map, +#' according to the input file} +#' \item{seq.rf}{a vector of size (\code{n.mrk - 1}) containing a sequence of recombination +#' fraction between the adjacent markers in the map} +#' \item{seq.ph}{linkage phase configuration for all markers in both parents} +#' \item{loglike}{the hmm-based multipoint likelihood} +#' +#' @examples +#' mrk.subset <- make_seq_mappoly(hexafake, 1:10) +#' red.mrk <- elim_redundant(mrk.subset) +#' unique.mrks <- make_seq_mappoly(red.mrk) +#' subset.pairs <- est_pairwise_rf(input.seq = unique.mrks, +#' ncpus = 1, +#' verbose = TRUE) +#' +#' ## Estimating subset map with a low tolerance for the E.M. procedure +#' ## for CRAN testing purposes +#' subset.map <- est_rf_hmm(input.seq = unique.mrks, +#' thres = 2, +#' twopt = subset.pairs, +#' verbose = TRUE, +#' tol = 0.1, +#' est.given.0.rf = FALSE) +#' subset.map +#' ## linkage phase configuration with highest likelihood +#' plot(subset.map, mrk.names = TRUE, config = "best") +#' ## the second one +#' plot(subset.map, mrk.names = TRUE, config = 2) +#' +#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} +#' +#' @references +#' Mollinari, M., and Garcia, A. A. F. (2019) Linkage +#' analysis and haplotype phasing in experimental autopolyploid +#' populations with high ploidy level using hidden Markov +#' models, _G3: Genes, Genomes, Genetics_. +#' https://doi.org/10.1534/g3.119.400378 +#' @rdname est_rf_hmm +#' @export est_rf_hmm +est_rf_hmm <- function(input.seq, + input.ph = NULL, + thres = 0.5, + twopt = NULL, + verbose = FALSE, + tol = 1e-04, + est.given.0.rf = FALSE, + reestimate.single.ph.configuration = TRUE, + high.prec = TRUE) { + + ## checking for correct object + input_classes <- c("mappoly.sequence", "two.pts.linkage.phases") + if (!inherits(input.seq, input_classes[1])) { + stop(deparse(substitute(input.seq)), " is not an object of class 'mappoly.sequence'") + } + if(length(input.seq$seq.num) == 1) + stop("Input sequence contains only one marker.", call. = FALSE) + if (is.null(input.ph)) { + if (is.null(twopt)) + stop("Two-point analysis not found!") + if (verbose) + cat("\nListing all configurations under threshold", thres, "using two-point information...\n") + input.ph <- ls_linkage_phases(input.seq = input.seq, thres = thres, twopt = twopt) + } + if (!inherits(input.ph, input_classes[2])) { + stop(deparse(substitute(input.ph)), " is not an object of class 'two.pts.linkage.phases'") + } + info.par <-detect_info_par(input.seq) + if(length(input.seq$seq.num) == 2 & !est.given.0.rf){ + maps <- vector("list", 1) + res.temp <- twopt$pairwise[[paste(sort(input.seq$seq.num), collapse = "-")]] + dp <- get(input.seq$data.name, pos = 1)$dosage.p1[input.seq$seq.num] + dq <- get(input.seq$data.name, pos = 1)$dosage.p2[input.seq$seq.num] + sh <- as.numeric(unlist(strsplit(rownames(res.temp)[1], split = "-"))) + res.ph <- list(P = c(list(0),list(0)), Q = c(list(0),list(0))) + if(dp[1] != 0) + res.ph$P[[1]] <- 1:dp[1] + if(dq[1] != 0) + res.ph$Q[[1]] <- 1:dq[1] + if(dp[2] != 0) + { + v <- (rev(res.ph$P[[1]])[1]+1):(dp[2]+rev(res.ph$P[[1]])[1]) + res.ph$P[[2]] <- v-sh[1] + } + if(dq[2] != 0) + { + v <- (rev(res.ph$Q[[1]])[1]+1):(dq[2]+rev(res.ph$Q[[1]])[1]) + res.ph$Q[[2]] <- v-sh[2] + } + names(res.ph$P) <- names(res.ph$Q) <- input.seq$seq.num + maps[[1]] <- list(seq.num = input.seq$seq.num, + seq.rf = res.temp[1,"rf"], + seq.ph = res.ph, + loglike = 0) + return(structure(list(info = list(ploidy = input.seq$ploidy, + n.mrk = length(input.seq$seq.num), + seq.num = input.seq$seq.num, + mrk.names = input.seq$seq.mrk.names, + seq.dose.p1 = get(input.seq$data.name, pos = 1)$dosage.p1[input.seq$seq.mrk.names], + seq.dose.p2 = get(input.seq$data.name, pos = 1)$dosage.p2[input.seq$seq.mrk.names], + chrom = get(input.seq$data.name, pos = 1)$chrom[input.seq$seq.mrk.names], + genome.pos = get(input.seq$data.name, pos = 1)$genome.pos[input.seq$seq.mrk.names], + seq.ref = get(input.seq$data.name, pos = 1)$seq.ref[input.seq$seq.mrk.names], + seq.alt = get(input.seq$data.name, pos = 1)$seq.alt[input.seq$seq.mrk.names], + chisq.pval = get(input.seq$data.name, pos = 1)$chisq.pval[input.seq$seq.mrk.names], + data.name = input.seq$data.name, + ph.thresh = abs(res.temp[2,1])), + maps = maps), + class = "mappoly.map")) + } + n.ph <- length(input.ph$config.to.test) + ret.map.no.rf.estimation <- FALSE + if(n.ph == 1 && !reestimate.single.ph.configuration) + ret.map.no.rf.estimation <- TRUE + maps <- vector("list", n.ph) + if (verbose){ + txt <- paste0(" ",n.ph, " phase(s): ") + cat(txt) + } + for (i in 1:n.ph) { + if (verbose) { + if((i+1)%%40 == 0) + cat("\n", paste0(rep(" ", nchar(txt)), collapse = ""), sep = "") + cat(". ") + } + if(est.given.0.rf){ + rf.temp <- rep(1e-5, length(input.seq$seq.num) - 1) + tol = 1 + } + else{ + rf.temp <- rep(0.001, length(input.seq$seq.num) - 1) + } + + ph <- list(P = input.ph$config.to.test[[i]]$P, + Q = input.ph$config.to.test[[i]]$Q) + + if(info.par == "both") + { + maps[[i]] <- est_rf_hmm_single_phase(input.seq = input.seq, + input.ph.single = ph, + rf.temp = rf.temp, + tol = tol, + verbose = FALSE, + ret.map.no.rf.estimation = ret.map.no.rf.estimation, + high.prec = high.prec) + } + else if (info.par == "p1") + { + maps[[i]] <- est_rf_hmm_single_phase_single_parent(input.seq, + input.ph.single = ph, + info.parent = 1, + uninfo.parent = 2, + rf.vec = rf.temp, + tol = tol, + verbose = FALSE, + ret.map.no.rf.estimation = ret.map.no.rf.estimation) + } + else if (info.par == "p2") + { + maps[[i]] <- est_rf_hmm_single_phase_single_parent(input.seq, + input.ph.single = ph, + info.parent = 2, + uninfo.parent = 1, + rf.vec = rf.temp, + tol = tol, + verbose = FALSE, + ret.map.no.rf.estimation = ret.map.no.rf.estimation) + + } + else stop("Should not get here.") + } + #cat("\n") + id <- order(sapply(maps, function(x) x$loglike), decreasing = TRUE) + maps <- maps[id] + if (verbose) + cat("\n") + structure(list(info = list(ploidy = input.seq$ploidy, + n.mrk = length(input.seq$seq.num), + seq.num = input.seq$seq.num, + mrk.names = input.seq$seq.mrk.names, + seq.dose.p1 = get(input.seq$data.name, pos = 1)$dosage.p1[input.seq$seq.mrk.names], + seq.dose.p2 = get(input.seq$data.name, pos = 1)$dosage.p2[input.seq$seq.mrk.names], + chrom = get(input.seq$data.name, pos = 1)$chrom[input.seq$seq.mrk.names], + genome.pos = get(input.seq$data.name, pos = 1)$genome.pos[input.seq$seq.mrk.names], + seq.ref = get(input.seq$data.name, pos = 1)$seq.ref[input.seq$seq.mrk.names], + seq.alt = get(input.seq$data.name, pos = 1)$seq.alt[input.seq$seq.mrk.names], + chisq.pval = get(input.seq$data.name, pos = 1)$chisq.pval[input.seq$seq.mrk.names], + data.name = input.seq$data.name, ph.thresh = input.ph$thres), + maps = maps), + class = "mappoly.map") +} + +#' Multipoint analysis using Hidden Markov Models: Sequential phase elimination +#' +#' Performs the multipoint analysis proposed by \cite{Mollinari and +#' Garcia (2019)} in a sequence of markers removing unlikely phases +#' using sequential multipoint information. +#' +#' This function sequentially includes markers into a map given an +#' ordered sequence. It uses two-point information to eliminate +#' unlikely linkage phase configurations given \code{thres.twopt}. The +#' search is made within a window of size \code{extend.tail}. For the +#' remaining configurations, the HMM-based likelihood is computed and +#' the ones that pass the HMM threshold (\code{thres.hmm}) are eliminated. +#' +#' @param input.seq an object of class \code{mappoly.sequence} +#' +#' @param twopt an object of class \code{mappoly.twopt} +#' containing the two-point information +#' +#' @param start.set number of markers to start the phasing procedure (default = 4) +#' +#' @param thres.twopt the LOD threshold used to determine if the linkage +#' phases compared via two-point analysis should be considered +#' for the search space reduction (A.K.A. \eqn{\eta} in +#' \cite{Mollinari and Garcia (2019)}, default = 5) +#' +#' @param thres.hmm the LOD threshold used to determine if the linkage +#' phases compared via hmm analysis should be evaluated in the +#' next round of marker inclusion (default = 50) +#' +#' @param extend.tail the length of the chain's tail that should +#' be used to calculate the likelihood of the map. If \code{NULL} (default), +#' the function uses all markers positioned. Even if \code{info.tail = TRUE}, +#' it uses at least \code{extend.tail} as the tail length +#' +#' @param phase.number.limit the maximum number of linkage phases of the sub-maps defined +#' by arguments \code{info.tail} and \code{extend.tail}. Default is 20. If the +#' size exceeds this limit, the marker will not be inserted. If +#' \code{Inf}, then it will insert all markers. +#' +#' @param sub.map.size.diff.limit the maximum accepted length +#' difference between the current and the previous sub-map defined +#' by arguments \code{info.tail} and \code{extend.tail}. If the +#' size exceeds this limit, the marker will not be inserted. If +#' \code{NULL}(default), then it will insert all markers. +#' +#' @param info.tail if \code{TRUE} (default), it uses the complete informative tail +#' of the chain (i.e. number of markers where all homologous +#' (\eqn{ploidy x 2}) can be distinguished) to calculate the map likelihood +#' +#'@param reestimate.single.ph.configuration logical. If \code{FALSE} (default) +#' returns a map without re-estimating the map parameters in cases +#' where there are only one possible linkage phase configuration +#' +#' @param tol the desired accuracy during the sequential phase (default = 10e-02) +#' +#' @param tol.final the desired accuracy for the final map (default = 10e-04) +#' +#' @param verbose If \code{TRUE} (default), current progress is shown; if +#' \code{FALSE}, no output is produced +#' +#' @param detailed.verbose If \code{TRUE}, the expansion of the current +#' submap is shown; +#' +#' @param high.prec logical. If \code{TRUE} uses high precision +#' (long double) numbers in the HMM procedure implemented in C++, +#' which can take a long time to perform (default = FALSE) +#' +#' @return A list of class \code{mappoly.map} with two elements: +#' +#' i) info: a list containing information about the map, regardless of the linkage phase configuration: +#' \item{ploidy}{the ploidy level} +#' \item{n.mrk}{number of markers} +#' \item{seq.num}{a vector containing the (ordered) indices of markers in the map, +#' according to the input file} +#' \item{mrk.names}{the names of markers in the map} +#' \item{seq.dose.p1}{a vector containing the dosage in parent 1 for all markers in the map} +#' \item{seq.dose.p2}{a vector containing the dosage in parent 2 for all markers in the map} +#' \item{chrom}{a vector indicating the sequence (usually chromosome) each marker belongs +#' as informed in the input file. If not available, +#' \code{chrom = NULL}} +#' \item{genome.pos}{physical position (usually in megabase) of the markers into the sequence} +#' \item{seq.ref}{reference base used for each marker (i.e. A, T, C, G). If not available, +#' \code{seq.ref = NULL}} +#' \item{seq.alt}{alternative base used for each marker (i.e. A, T, C, G). If not available, +#' \code{seq.ref = NULL}} +#' \item{chisq.pval}{a vector containing p-values of the chi-squared test of Mendelian +#' segregation for all markers in the map} +#' \item{data.name}{name of the dataset of class \code{mappoly.data}} +#' \item{ph.thres}{the LOD threshold used to define the linkage phase configurations to test} +#' +#' ii) a list of maps with possible linkage phase configuration. Each map in the list is also a +#' list containing +#' \item{seq.num}{a vector containing the (ordered) indices of markers in the map, +#' according to the input file} +#' \item{seq.rf}{a vector of size (\code{n.mrk - 1}) containing a sequence of recombination +#' fraction between the adjacent markers in the map} +#' \item{seq.ph}{linkage phase configuration for all markers in both parents} +#' \item{loglike}{the hmm-based multipoint likelihood} +#' +#' @examples +#' \donttest{ +#' mrk.subset <- make_seq_mappoly(hexafake, 1:20) +#' red.mrk <- elim_redundant(mrk.subset) +#' unique.mrks <- make_seq_mappoly(red.mrk) +#' subset.pairs <- est_pairwise_rf(input.seq = unique.mrks, +#' ncpus = 1, +#' verbose = TRUE) +#' subset.map <- est_rf_hmm_sequential(input.seq = unique.mrks, +#' thres.twopt = 5, +#' thres.hmm = 10, +#' extend.tail = 10, +#' tol = 0.1, +#' tol.final = 10e-3, +#' phase.number.limit = 5, +#' twopt = subset.pairs, +#' verbose = TRUE) +#' print(subset.map, detailed = TRUE) +#' plot(subset.map) +#' plot(subset.map, left.lim = 0, right.lim = 1, mrk.names = TRUE) +#' plot(subset.map, phase = FALSE) +#' +#' ## Retrieving simulated linkage phase +#' ph.P <- maps.hexafake[[1]]$maps[[1]]$seq.ph$P +#' ph.Q <- maps.hexafake[[1]]$maps[[1]]$seq.ph$Q +#' ## Estimated linkage phase +#' ph.P.est <- subset.map$maps[[1]]$seq.ph$P +#' ph.Q.est <- subset.map$maps[[1]]$seq.ph$Q +#' compare_haplotypes(ploidy = 6, h1 = ph.P[names(ph.P.est)], h2 = ph.P.est) +#' compare_haplotypes(ploidy = 6, h1 = ph.Q[names(ph.Q.est)], h2 = ph.Q.est) +#' } +#' +#' @author Marcelo Mollinari, \email{mmollin@ncsu.edu} +#' +#' @references +#' Mollinari, M., and Garcia, A. A. F. (2019) Linkage +#' analysis and haplotype phasing in experimental autopolyploid +#' populations with high ploidy level using hidden Markov +#' models, _G3: Genes, Genomes, Genetics_. +#' \doi{10.1534/g3.119.400378} +#' +#' @importFrom utils head +#' @importFrom cli rule +#' @export est_rf_hmm_sequential +est_rf_hmm_sequential <- function(input.seq, + twopt, + start.set = 4, + thres.twopt = 5, + thres.hmm = 50, + extend.tail = NULL, + phase.number.limit = 20, + sub.map.size.diff.limit = Inf, + info.tail = TRUE, + reestimate.single.ph.configuration = FALSE, + tol = 10e-2, + tol.final = 10e-4, + verbose = TRUE, + detailed.verbose = FALSE, + high.prec = FALSE) +{## checking for correct object + input_classes <- c("mappoly.sequence", "mappoly.twopt") + if (!inherits(input.seq, input_classes[1])) { + stop(deparse(substitute(input.seq)), + " is not an object of class 'mappoly.sequence'") + } + if (!inherits(twopt, input_classes[2])) { + stop(deparse(substitute(twopt)), + " is not an object of class 'mappoly.twopt'") + } + ## Information about the map + if(verbose) { + cli::cat_line("Number of markers: ", length(input.seq$seq.num)) + msg("Initial sequence", line = 2) + } + if(is.null(extend.tail)) + extend.tail <- length(input.seq$seq.num) + ##### Map in case of two markers ##### + if(length(input.seq$seq.num) == 2) + { + cur.map <- est_rf_hmm(input.seq = input.seq, thres = thres.twopt, + twopt = twopt, tol = tol.final, high.prec = high.prec, + verbose = verbose, + reestimate.single.ph.configuration = reestimate.single.ph.configuration) + return(filter_map_at_hmm_thres(cur.map, thres.hmm)) + } + ##### More than 3 markers #### + ##Starting sequential algorithm for maps with more than 3 markers + ## First step: test all possible phase configurations under + ## a 'thres.twopt' threshold for a sequence of size 'start.set' + if(start.set > length(input.seq$seq.num)) + start.set <- length(input.seq$seq.num) + if(verbose) cat(start.set, " markers...\n", sep = "") + cte <- 0 + rf.temp <- c(Inf, Inf) + while(any(rf.temp >= sub.map.size.diff.limit)) + { + cte <- cte+1 + if(length(rf.temp) == 1) { + ## cat("Impossible build a map using the given thresholds\n") + ## stop() + stop("Impossible build a map using the given thresholds\n") + } + if(verbose) cat(cli::symbol$bullet, " Trying sequence:", cte:(start.set+cte-1), ":\n") + cur.seq <- make_seq_mappoly(input.obj = get(input.seq$data.name, pos = 1), as.vector(na.omit(input.seq$seq.num[cte:(start.set+cte-1)])), data.name = input.seq$data.name) + input.ph <- ls_linkage_phases(input.seq = cur.seq, + thres = thres.twopt, + twopt = twopt) + cur.map <- est_rf_hmm(input.seq = cur.seq, thres = thres.twopt, + twopt = twopt, tol = tol, high.prec = high.prec, + verbose = verbose, input.ph = input.ph, + reestimate.single.ph.configuration = reestimate.single.ph.configuration) + cur.map <- filter_map_at_hmm_thres(cur.map, thres.hmm) + cur.map <- reest_rf(cur.map, tol = tol.final, verbose = FALSE) + rf.temp <- imf_h(cur.map$maps[[1]]$seq.rf) + } + if(verbose) + msg("Done with initial sequence", line = 2) + if(start.set >= length(input.seq$seq.num)){ + if(verbose) cat("\nDone phasing", cur.map$info$n.mrk, "markers\nReestimating final recombination fractions.") + return(reest_rf(cur.map, tol = tol.final)) + } + ##### More than 'start.set' markers ##### + ## For maps with more markers than 'start.set', include + ## next makers in a sequential fashion + ct <- start.set+cte + all.ph <- update_ph_list_at_hmm_thres(cur.map, thres.hmm) + if(sub.map.size.diff.limit != Inf & !reestimate.single.ph.configuration){ + if (verbose) message("Making 'reestimate.single.ph.configuration = TRUE' to use map expansion") + reestimate.single.ph.configuration <- TRUE + } + while(ct <= length(input.seq$seq.num)) + { + ## Number of multipoint phases evaluated in the + ## previous round of marker insertion + hmm.phase.number <- length(cur.map$maps) + ## Get the map tail containing sufficient markers to + ## distinguish among the ploidy*2 possible homologs. + if(info.tail) + tail.temp <- get_full_info_tail(input.obj = cur.map) + input.ph <- NULL + ## for each linkage phase inherited from HMM + ## analysis from previous round, evaluate the + ## possible linkage phases for the marker inserted + ## in the current round using two-point information + ## under the threshold of thres.twopt + for(i in 1:length(all.ph$config.to.test)) + { + seq.num <- NULL + if(info.tail) + seq.num <- tail.temp$maps[[i]]$seq.num + if(length(seq.num) < extend.tail) + seq.num <- tail(cur.map$maps[[i]]$seq.num, extend.tail) + ## If the tail do not contain the marker responsible for carrying + ## multiple linkage phases through the rounds of insertion, + ## extend the tail so it contains that marker + if(max(check_ls_phase(all.ph)) >= length(seq.num)){ + seq.num <- tail(cur.map$maps[[i]]$seq.num, max(check_ls_phase(all.ph)) + start.set) + } + seq.cur <- make_seq_mappoly(input.obj = get(input.seq$data.name, pos = 1), + arg = seq.num, + data.name = input.seq$data.name) + all.ph.temp <- get_ph_list_subset(all.ph, seq.num, i) + input.ph.temp <- ls_linkage_phases(input.seq = seq.cur, + thres = thres.twopt, + twopt = twopt, + mrk.to.add = input.seq$seq.num[ct], + prev.info = all.ph.temp) + input.ph <- concatenate_ph_list(ph.list.1 = input.ph, ph.list.2 = input.ph.temp) + } + ## This is the number of linkage phases to be tested + ## combining HMM phases from previous round of mrk + ## insertion and the linkage phases from the two-point + ## information obtained in the current round + twopt.phase.number <- length(input.ph$config.to.test) + ## If this number is higher than phase.number.limit, + ## proceed to the next iteration + if(length(input.ph$config.to.test) > phase.number.limit) { + if(verbose) + cat(crayon::italic$yellow(paste0(ct ,": not included (*linkage phases*)\n", sep = ""))) + ct <- ct + 1 + next() + } + ## Appending the marker to the numeric + ## sequence and making a new sequence + seq.num <- c(seq.num, input.seq$seq.num[ct]) + seq.cur <- make_seq_mappoly(input.obj = get(input.seq$data.name, pos = 1), + arg = seq.num, + data.name = input.seq$data.name) + if(verbose) + cat_phase(input.seq, input.ph, all.ph, ct, seq.num, twopt.phase.number, hmm.phase.number) + ## Evaluation all maps using HMM + cur.map.temp <- est_rf_hmm(input.seq = seq.cur, + input.ph = input.ph, + twopt = twopt, + tol = tol, + verbose = FALSE, + reestimate.single.ph.configuration = reestimate.single.ph.configuration, + high.prec = high.prec) + cur.map.temp$maps <- unique(cur.map.temp$maps) + ## Filtering linkage phase configurations under a HMM LOD threshold + cur.map.temp <- filter_map_at_hmm_thres(cur.map.temp, thres.hmm) + + ## Gathering linkage phases of the current map, excluding the marker inserted + ## in the current round + ph.new <- lapply(cur.map.temp$maps, function(x) list(P = head(x$seq.ph$P, -1), + Q = head(x$seq.ph$Q, -1))) + ## Gathering linkage phases of the previous map, excluding the marker inserted + ## in the current round + ph.old <- lapply(cur.map$maps, function(x, id) list(P = x$seq.ph$P[id], + Q = x$seq.ph$Q[id]), id = names(ph.new[[1]]$P)) + + ## Check in which whole phase configurations the new + ## HMM tail should be appended + MQ <- MP <- matrix(NA, length(ph.old), length(ph.new)) + for(j1 in 1:nrow(MP)){ + for(j2 in 1:ncol(MP)){ + MP[j1, j2] <- identical(ph.old[[j1]]$P, ph.new[[j2]]$P) + MQ[j1, j2] <- identical(ph.old[[j1]]$Q, ph.new[[j2]]$Q) + } + } + M <- which(MP & MQ, arr.ind = TRUE) + colnames(M) <- c("old", "new") + ## Checking for quality (map size and LOD Score) + submap.length.old <- sapply(cur.map$maps, function(x) sum(imf_h(x$seq.rf)))[M[,1]] + last.dist.old <- sapply(cur.map$maps, function(x) tail(imf_h(x$seq.rf),1))[M[,1]] + submap.length.new <- sapply(cur.map.temp$maps, function(x) sum(imf_h(x$seq.rf)))[M[,2]] + last.dist.new <- sapply(cur.map.temp$maps, function(x) tail(imf_h(x$seq.rf),1))[M[,2]] + last.mrk.expansion <- submap.expansion <- numeric(nrow(M)) + submap.expansion <- submap.length.new - submap.length.old + last.mrk.expansion <- last.dist.new - last.dist.old + + if(length(M[,2]) == 0) { + if(verbose) cat(crayon::italic$yellow(paste0(ct ,": not included (~map extension~)\n", sep = ""))) + ct <- ct + 1 + next() + } + + cur.map.temp$maps <- cur.map.temp$maps[M[,2]] + LOD <- get_LOD(cur.map.temp, sorted = FALSE) + if(sub.map.size.diff.limit != Inf){ + selected.map <- submap.expansion < sub.map.size.diff.limit & LOD < thres.hmm + if(detailed.verbose){ + x <- round(cbind(submap.length.new, submap.expansion, last.mrk.expansion),2) + for(j1 in 1:nrow(x)){ + cat(" ", x[j1,1], ": (", x[j1,2], "/", x[j1,3],")", sep = "") + if(j1 != nrow(x)) cat("\n") + } + if(all(!selected.map)) cat(paste0(crayon::red(cli::symbol$cross), "\n")) + else cat(paste0(crayon::green(cli::symbol$tick), "\n")) + } + if(all(!selected.map)){ + if(verbose) cat(crayon::italic$yellow(paste0(ct ,": not included (~map extension~)\n", sep = ""))) + ct <- ct + 1 + next() + } + } else { selected.map <- LOD < thres.hmm } + all.ph.temp <- update_ph_list_at_hmm_thres(cur.map.temp, Inf) + id <- which(selected.map & which(!sapply(cur.map.temp$maps, is.null))) + if(length(id) == 0){ + if(verbose) cat(crayon::italic$yellow(paste0(ct ,": not included (-linkage phases-)\n", sep = ""))) + ct <- ct + 1 + next() + } + cur.map.temp$maps <- cur.map.temp$maps[id] + if(any(unique(M[id,2,drop = FALSE]) > length(all.ph.temp$config.to.test))){ + if(verbose) cat(crayon::italic$yellow(paste0(ct ,": not included (-linkage phases-)\n", sep = ""))) + ct <- ct + 1 + next() + } + if(length(id) > nrow(M)) + all.ph.temp <- add_mrk_at_tail_ph_list(all.ph, all.ph.temp, M[,,drop = FALSE]) + else if(length(id) == nrow(M)){ + M[,2] <- id + all.ph.temp <- add_mrk_at_tail_ph_list(all.ph, all.ph.temp, M[,,drop = FALSE]) + } else { + all.ph.temp <- add_mrk_at_tail_ph_list(all.ph, all.ph.temp, M[id,,drop = FALSE]) + } + all.ph <- all.ph.temp + cur.map <- cur.map.temp + ct <- ct + 1 + } + ##### Re-estimating final map #### + ## Re-estimating final map with higher tolerance + seq.final <- make_seq_mappoly(input.obj = get(input.seq$data.name, pos = 1), + arg = as.numeric(names(all.ph$config.to.test[[1]]$P)), + data.name = input.seq$data.name) + #msg(paste("Done phasing", length(seq.final$seq.num), "markers")) + if (verbose){ + msg("Reestimating final recombination fractions", line = 2) + cat("") + } + final.map <- est_rf_hmm(input.seq = seq.final, + input.ph = all.ph, + twopt = twopt, + tol = tol.final, + verbose = FALSE, + reestimate.single.ph.configuration = TRUE, + high.prec = TRUE) + if(verbose) { + #cat("\n------------------------------------------") + cat("Markers in the initial sequence: ", length(input.seq$seq.num), sep = "") + cat("\nMapped markers : ", final.map$info$n.mrk, " (", + round(100*final.map$info$n.mrk/length(input.seq$seq.num),1) ,"%)\n", sep = "") + msg("", line = 2) + } + return(final.map) +} + +#' @rdname est_rf_hmm +#' @export +print.mappoly.map <- function(x, detailed = FALSE, ...) { + cat("This is an object of class 'mappoly.map'\n") + cat(" Ploidy level:\t", x$info$ploidy, "\n") + cat(" No. individuals:\t", get(x$info$data.name)$n.ind, "\n") + cat(" No. markers:\t", x$info$n.mrk, "\n") + cat(" No. linkage phases:\t", length(x$maps), "\n") + l <- sapply(x$maps, function(x) x$loglike) + LOD <- round(l - max(l), 2) + if(detailed) + { + for (j in 1:length(x$maps)) { + cat("\n ---------------------------------------------\n") + cat(" Linkage phase configuration: ", j) + cat("\n log-likelihood:\t", x$map[[j]]$loglike) + cat("\n LOD:\t\t", format(round(LOD[j], 2), digits = 2)) + cat("\n\n") + M <- matrix("|", x$info$n.mrk, x$info$ploidy * 2) + M <- cbind(M, " ") + M <- cbind(M, format(round(cumsum(c(0, imf_h(x$maps[[j]]$seq.rf))), 1), digits = 1)) + for (i in 1:x$info$n.mrk) { + if (all(x$maps[[j]]$seq.ph$Q[[i]] != 0)) + M[i, c(x$maps[[j]]$seq.ph$P[[i]], x$maps[[j]]$seq.ph$Q[[i]] + x$info$ploidy)] <- "o" else M[i, x$maps[[j]]$seq.ph$P[[i]]] <- "o" + } + M <- cbind(get(x$info$data.name)$mrk.names[x$maps[[j]]$seq.num], M) + big.name <- max(nchar(M[,1])) + format_name <- function(y, big.name){ + paste0(y, paste0(rep(" ", big.name-nchar(y)), collapse = "")) + } + M <- rbind(c("", letters[1:(ncol(M)-3)], "", ""), M) + format(apply(M, 1, function(y) cat(c("\t", format_name(y[1], big.name), "\t", y[2:(x$info$ploidy + 1)], rep(" ", 4), y[(x$info$ploidy + 2):(x$info$ploidy * 2 + 3)], "\n"), collapse = ""))) + } + } else { + cat("\n ---------------------------------------------\n") + cat(" Number of linkage phase configurations: ", length(x$maps)) + cat("\n ---------------------------------------------\n") + for (j in 1:length(x$maps)) { + cat(" Linkage phase configuration: ", j) + cat("\n map length:\t", format(round(sum(imf_h(x$map[[j]]$seq.rf)), 2))) + cat("\n log-likelihood:\t", format(round(x$map[[j]]$loglike, 2))) + cat("\n LOD:\t\t", format(round(LOD[j], 2), digits = 2)) + cat("\n ~~~~~~~~~~~~~~~~~~\n") + } + } +} + +#' @rdname est_rf_hmm +#' @importFrom grDevices rgb +#' @importFrom graphics rect +#' @export +plot.mappoly.map <- function(x, left.lim = 0, right.lim = Inf, + phase = TRUE, mrk.names = FALSE, + cex = 1, config = "best", P = "Parent 1", + Q = "Parent 2", xlim = NULL, ...) { + oldpar <- par(no.readonly = TRUE) + on.exit(par(oldpar)) + if(phase){ + map.info <- prepare_map(x, config) + if(any(map.info$ph.p == "B")){ + var.col <- c("black", "darkgray") + var.col <- var.col[1:2] + names(var.col) <- c("A", "B") + } else { + var.col <- c("#E41A1C", "#377EB8", "#4DAF4A", "#984EA3") + names(var.col) <- c("A", "T", "C", "G") + } + ploidy <- map.info$ploidy + if(ploidy == 2) { + d.col <- c(NA, "#1B9E77", "#D95F02") + }else if(ploidy == 4){ + d.col <- c(NA, "#1B9E77", "#D95F02", "#7570B3", "#E7298A") + }else if(ploidy == 6){ + d.col <- c(NA, "#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02") + }else if(ploidy == 8){ + d.col <- c(NA, "#1B9E77", "#D95F02", "#7570B3", "#E7298A", "#66A61E", "#E6AB02", "#A6761D", "#666666") + } else d.col <- c(NA, gg_color_hue(ploidy)) + names(d.col) <- 0:ploidy + d.col[1] <- NA + x <- map.info$map + lab <- names(x) + zy <- seq(0, 0.5, length.out = ploidy) + 1.5 + pp <- map.info$ph.p + pq <- map.info$ph.q + dp <- map.info$dp + dq <- map.info$dq + x1 <- abs(left.lim - x) + x2 <- abs(right.lim - x) + id.left <- which(x1 == min(x1))[1] + id.right <- rev(which(x2 == min(x2)))[1] + par(mai = c(1,0.15,0,0), mar = c(4.5,2,1,2)) + curx <- x[id.left:id.right] + layout(mat = matrix(c(4,2,3, 1), ncol = 2), heights = c(2, 10), widths = c(1, 10)) + if(is.null(xlim)) + xlim <- range(curx) + plot(x = curx, + y = rep(.5,length(curx)), + type = "n" , + ylim = c(.25, 4.5), + axes = FALSE, + xlab = "Distance (cM)", + ylab = "", + xlim = xlim) + lines(c(x[id.left], x[id.right]), c(.5, .5), lwd = 15, col = "gray") + points(x = curx, + y = rep(.5,length(curx)), + xlab = "", ylab = "", + pch = "|", cex = 1.5, + ylim = c(0,2)) + axis(side = 1) + diplocol1 <- c("#4DAF4A", "#984EA3") + diplocol2 <- c("#E41A1C", "#377EB8") + #Parent 2 + x1 <- seq(x[id.left], x[id.right], length.out = length(curx)) + x.control <- diff(x1[1:2])/2 + if(length(x1) < 150) + x.control <- x.control * .8 + if(length(x1) < 100) + x.control <- x.control * .8 + if(length(x1) < 75) + x.control <- x.control * .8 + if(length(x1) < 50) + x.control <- x.control * .8 + if(length(x1) < 25) + x.control <- x.control * .8 + for(i in 1:ploidy) + { + if(ploidy == 2 && any(map.info$ph.p == "B")){ + lines(range(x1), c(zy[i], zy[i]), lwd = 15, col = diplocol1[i]) + } else { + lines(range(x1), c(zy[i], zy[i]), lwd = 8, col = "gray") + } + y1 <- rep(zy[i], length(curx)) + pal <- var.col[pq[id.left:id.right,i]] + rect(xleft = x1 - x.control, ybottom = y1 -.05, xright = x1 + x.control, ytop = y1 +.05, col = pal, border = NA) + } + #connecting allelic variants to markers + for(i in 1:length(x1)) + lines(c(curx[i], x1[i]), c(0.575, zy[1]-.05), lwd = 0.2) + points(x = x1, + y = zy[ploidy]+0.05+dq[id.left:id.right]/20, + col = d.col[as.character(dq[id.left:id.right])], + pch = 19, cex = .7) + #Parent 1 + zy <- zy + 1.1 + for(i in 1:ploidy) + { + if(ploidy == 2 && any(map.info$ph.p == "B")){ + lines(range(x1), c(zy[i], zy[i]), lwd = 12, col = diplocol2[i]) + } else { + lines(range(x1), c(zy[i], zy[i]), lwd = 8, col = "gray") + } + y1 <- rep(zy[i], length(curx)) + pal <- var.col[pp[id.left:id.right,i]] + rect(xleft = x1 - x.control, ybottom = y1 -.05, xright = x1 + x.control, ytop = y1 +.05, col = pal, border = NA) + } + points(x = x1, + y = zy[ploidy]+0.05+dp[id.left:id.right]/20, + col = d.col[as.character(dp[id.left:id.right])], + pch = 19, cex = .7) + if(mrk.names) + text(x = x1, + y = rep(zy[ploidy]+0.05+.3, length(curx)), + labels = names(curx), + srt = 90, adj = 0, cex = cex) + par(mar = c(4.5,1,1,0), xpd = TRUE) + plot(x = 0, + y = 0, + type = "n" , + axes = FALSE, + ylab = "", + xlab = "", + ylim = c(.25, 4.5)) + zy <- zy - 1.1 + mtext(text = Q, side = 4, at = mean(zy), line = -1, font = 4) + for(i in 1:ploidy) + mtext(letters[(ploidy+1):(2*ploidy)][i], line = 0, at = zy[i], side = 4) + zy <- zy + 1.1 + mtext(text = P, side = 4, at = mean(zy), line = -1, font = 4) + for(i in 1:ploidy) + mtext(letters[1:ploidy][i], line = 0, at = zy[i], side = 4) + par(mar = c(0,1,2,4), xpd = FALSE) + plot(x = curx, + y = rep(.5,length(curx)), + type = "n" , + axes = FALSE, + xlab = "", + ylab = "") + if(any(map.info$ph.p == "B")){ + legend("bottomleft", legend = c("A", "B"), + fill = c(var.col), title = "Variants", + box.lty = 0, bg = "transparent", ncol = 2) + } else { + legend("bottomleft", legend = c("A", "T", "C", "G", "-"), + fill = c(var.col, "white"), title = "Nucleotides", + box.lty = 0, bg = "transparent", ncol = 4) + } + legend("bottomright", legend = names(d.col)[-1], title = "Doses" , + col = d.col[-1], ncol = ploidy/2, pch = 19, + box.lty = 0) + } else { + plot_map_list(x) + } +} + +#' prepare maps for plot +#' @param void internal function to be documented +#' @keywords internal +prepare_map <- function(input.map, config = "best"){ + if (!inherits(input.map, "mappoly.map")) { + stop(deparse(substitute(input.map)), " is not an object of class 'mappoly.map'") + } + ## Choosing the linkage phase configuration + LOD.conf <- get_LOD(input.map, sorted = FALSE) + if(config == "best") { + i.lpc <- which.min(LOD.conf) + } else if(config == "all"){ + i.lpc <- seq_along(LOD.conf) } else if (config > length(LOD.conf)) { + stop("invalid linkage phase configuration") + } else i.lpc <- config + ## Gathering marker positions + map <- cumsum(imf_h(c(0, input.map$maps[[i.lpc]]$seq.rf))) + names(map) <- input.map$info$mrk.names + ## + ph.p <- ph_list_to_matrix(input.map$maps[[i.lpc]]$seq.ph$P, input.map$info$ploidy) + ph.q <- ph_list_to_matrix(input.map$maps[[i.lpc]]$seq.ph$Q, input.map$info$ploidy) + dimnames(ph.p) <- list(names(map), letters[1:input.map$info$ploidy]) + dimnames(ph.q) <- list(names(map), letters[(1+input.map$info$ploidy):(2*input.map$info$ploidy)]) + if(is.null(input.map$info$seq.alt)) + { + ph.p[ph.p == 1] <- ph.q[ph.q == 1] <- "A" + ph.p[ph.p == 0] <- ph.q[ph.q == 0] <- "B" + } else { + for(i in input.map$info$mrk.names){ + ph.p[i, ph.p[i,] == 1] <- input.map$info$seq.alt[i] + ph.p[i, ph.p[i,] == 0] <- input.map$info$seq.ref[i] + ph.q[i, ph.q[i,] == 1] <- input.map$info$seq.alt[i] + ph.q[i, ph.q[i,] == 0] <- input.map$info$seq.ref[i] + } + } + dp <- input.map$info$seq.dose.p1 + dq <- input.map$info$seq.dose.p2 + list(ploidy = input.map$info$ploidy, map = map, ph.p = ph.p, ph.q = ph.q, dp = dp, dq = dq) +} + +#' Get the tail of a marker sequence up to the point where the markers +#' provide no additional information. +#' +#' @param void internal function to be documented +#' @keywords internal +get_full_info_tail <- function(input.obj, extend = NULL) { + ## checking for correct object + if(!inherits(input.obj, "mappoly.map")) + stop(deparse(substitute(input.obj)), " is not an object of class 'mappoly.map''") + if (!is.null(extend)) + if (extend > input.obj$info$n.mrk) + return(input.obj) + ploidy <- input.obj$info$ploidy + i <- ploidy/2 + w1 <- ph_list_to_matrix(input.obj$maps[[1]]$seq.ph$P, ploidy) + max1 <- length(unique(apply(w1, 2, paste0, collapse = ""))) + w2 <- ph_list_to_matrix(input.obj$maps[[1]]$seq.ph$Q, ploidy) + max2 <- length(unique(apply(w2, 2, paste0, collapse = ""))) + while (i < input.obj$info$n.mrk) { + wp <- ph_list_to_matrix(tail(input.obj$maps[[1]]$seq.ph$P, i), ploidy) + xp <- apply(wp, 2, paste, collapse = "-") + wq <- ph_list_to_matrix(tail(input.obj$maps[[1]]$seq.ph$Q, i), ploidy) + xq <- apply(wq, 2, paste, collapse = "-") + if (length(unique(xp)) == max1 && length(unique(xq)) == max2) + break() + i <- i + 1 + } + if (!is.null(extend)) + if (i < extend) + i <- extend + input.obj$info$n.mrk <- i + + + + input.obj$info$seq.num <- tail(input.obj$info$seq.num, i) + input.obj$info$mrk.names <- tail(input.obj$info$mrk.names, i) + input.obj$info$seq.dose.p1 <- tail(input.obj$info$seq.dose.p1, i) + input.obj$info$seq.dose.p2 <- tail(input.obj$info$seq.dose.p2, i) + input.obj$info$chrom <- tail(input.obj$info$chrom, i) + input.obj$info$genome.pos <- tail(input.obj$info$genome.pos, i) + input.obj$info$chisq.pval <- tail(input.obj$info$chisq.pval, i) + + for (j in 1:length(input.obj$maps)) { + input.obj$maps[[j]]$loglike <- 0 + input.obj$maps[[j]]$seq.ph$P <- tail(input.obj$maps[[j]]$seq.ph$P, n = i) + input.obj$maps[[j]]$seq.ph$Q <- tail(input.obj$maps[[j]]$seq.ph$Q, n = i) + input.obj$maps[[j]]$seq.num <- tail(input.obj$maps[[j]]$seq.num, n = i) + input.obj$maps[[j]]$seq.rf <- tail(input.obj$maps[[j]]$seq.rf, n = (i - 1)) + } + return(input.obj) +} + +#' remove maps under a certain threshold +#' @param void internal function to be documented +#' @keywords internal +#' @export filter_map_at_hmm_thres +filter_map_at_hmm_thres <- function(map, thres.hmm){ + map$info$ph.thresh <- NULL + map$maps <- map$maps[get_LOD(map, sorted = FALSE) < thres.hmm] + map +} + +#' makes a phase list from map, selecting only +#' configurations under a certain threshold +#' @param void internal function to be documented +#' @keywords internal +update_ph_list_at_hmm_thres <- function(map, thres.hmm){ + temp.map <- filter_map_at_hmm_thres(map, thres.hmm) + config.to.test <- lapply(temp.map$maps, function(x) x$seq.ph) + structure(list(config.to.test = config.to.test, + ploidy = map$info$ploidy, seq.num = map$maps[[1]]$seq.num, + thres = map$info$ph.thresh, data.name = map$info$data.name, + thres.hmm = thres.hmm), + class = "two.pts.linkage.phases") +} + +#' subset of a linkage phase list +#' @param void internal function to be documented +#' @keywords internal +get_ph_list_subset <- function(ph.list, seq.num, conf){ + config.to.test <- list(lapply(ph.list$config.to.test[[conf]], function(x, seq.num) x[as.character(seq.num)], seq.num)) + structure(list(config.to.test = config.to.test, + ploidy = ph.list$ploidy, seq.num = ph.list$seq.num, + thres = ph.list$ph.thresh, data.name = ph.list$data.name, + thres.hmm = ph.list$thres.hmm), + class = "two.pts.linkage.phases") +} + +#' concatenate two linkage phase lists +#' @param void internal function to be documented +#' @keywords internal +concatenate_ph_list <- function(ph.list.1, ph.list.2){ + if(length(ph.list.1) == 0) + return(ph.list.2) + config.to.test <- c(ph.list.1$config.to.test, ph.list.2$config.to.test) + id <- which(!duplicated(config.to.test)) + config.to.test <- config.to.test[id] + structure(list(config.to.test = config.to.test, + ploidy = ph.list.1$ploidy, seq.num = ph.list.1$seq.num, + thres = ph.list.1$ph.thresh, data.name = ph.list.1$data.name, + thres.hmm = ph.list.1$thres.hmm), + class = "two.pts.linkage.phases") +} + +#' add a single marker at the tail of a linkage phase list +#' @param void internal function +#' @keywords internal +add_mrk_at_tail_ph_list <- function(ph.list.1, ph.list.2, cor.index){ + config.to.test <- vector("list", length = nrow(cor.index)) + for(i in 1:nrow(cor.index)){ + config.to.test[[i]] <- list(P = c(ph.list.1$config.to.test[[cor.index[i,1]]]$P, tail(ph.list.2$config.to.test[[cor.index[i,2]]]$P,1)), + Q = c(ph.list.1$config.to.test[[cor.index[i,1]]]$Q, tail(ph.list.2$config.to.test[[cor.index[i,2]]]$Q,1))) + } + structure(list(config.to.test = config.to.test, + ploidy = ph.list.1$ploidy, seq.num = ph.list.1$seq.num, + thres = ph.list.1$ph.thresh, data.name = ph.list.1$data.name, + thres.hmm = ph.list.1$thres.hmm), + class = "two.pts.linkage.phases") +} + +#' Compare a list of linkage phases and return the +#' markers for which they are different. +#' @param void internal function to be documented +#' @keywords internal +check_ls_phase <- function(ph){ + if(length(ph$config.to.test) == 1) return(0) + id <- rep(1, length(ph$config.to.test[[1]]$P)) + for(i in 1:length(ph$config.to.test[[1]]$P)){ + w <- ph$config.to.test[[1]]$P[i] + for(j in 2:length(ph$config.to.test)) + id[i] <- id[i] + identical(w, ph$config.to.test[[j]]$P[i]) + } + names(id) <- names((ph$config.to.test[[1]]$P)) + w <- length(ph$config.to.test[[1]]$P) - which(id < length(ph$config.to.test)) + if(length(w) == 0) return(0) + w +} + +#' cat for graphical representation of the phases +#' @param void internal function to be documented +#' @keywords internal +print_ph <- function(input.ph){ + phs.P <- lapply(input.ph$config.to.test, + function(x, ploidy) { + M <- matrix("|", nrow = 1, ncol = ploidy) + M[unlist(tail(x$P, 1))] <- crayon::red(cli::symbol$bullet) + paste(M, collapse = "")}, + ploidy = input.ph$ploidy) + phs.Q <- lapply(input.ph$config.to.test, + function(x, ploidy) { + M <- matrix("|", nrow = 1, ncol = ploidy) + M[unlist(tail(x$Q, 1))] <- crayon::cyan(cli::symbol$bullet) + paste(M, collapse = "")}, + ploidy = input.ph$ploidy) + if(length(phs.P) == 1) + return(paste(unlist(phs.P)[1], unlist(phs.Q)[1], " ")) + if(length(phs.P) == 2) + return(paste(unlist(phs.P)[1], unlist(phs.Q)[1], " ", unlist(phs.P)[2], unlist(phs.Q)[2])) + if(length(phs.P) > 2) + return(paste(unlist(phs.P)[1], unlist(phs.Q)[1], " ... ", unlist(phs.P)[2], unlist(phs.Q)[2])) +} + +#' cat for phase information +#' @param void internal function to be documented +#' @keywords internal +cat_phase <- function(input.seq, + input.ph, + all.ph, + ct, + seq.num, + twopt.phase.number, + hmm.phase.number){ + pc <- round(100*ct/length(input.seq$seq.num),1) + xmax11 <- nchar(length(input.seq$seq.num)) + x11 <- ct + x11 <- paste0(x11, paste(rep(" ", xmax11-nchar(x11)), collapse = "")) + x12 <- length(all.ph$config.to.test[[1]]$P) + 1 + x12 <- paste0(x12, paste(rep(" ", xmax11-nchar(x12)), collapse = "")) + x13 <- pc + x13 <- paste0(":(",pc,"%)", paste(rep(" ", 5-nchar(x13)), collapse = "")) + x1 <- paste0(x12,"/",x11, x13) + xmax21 <- nchar(max(input.seq$seq.num)) + x21 <- input.seq$seq.num[ct] + x21 <- paste0(x21, paste(rep(" ", xmax21-nchar(x21)), collapse = "")) + x22 <- length(input.ph$config.to.test) + x22 <- paste0(x22, " ph ", paste(rep(" ", 5-nchar(x22)), collapse = "")) + x23 <- paste0("(", hmm.phase.number, "/", twopt.phase.number,") ") + x23 <- paste0(x23, paste(rep(" ", 10-nchar(x23)), collapse = "")) + x2 <- paste0( x21, ": ", x22, x23) + x31 <- length(seq.num)-1 + x31 <- paste0(x31, paste(rep(" ", xmax11-nchar(x31)), collapse = "")) + x3 <- paste0(" -- tail: ", x31) + cat(x1, x2, x3, print_ph(input.ph), "\n") +}