Merge branch 'baps-format' into develop

* baps-format: Increment version number to 0.0.0.9025 Skips `greedyMix()` example until #24 is fixed Updated documentation (#24) Fails for FAST input to `greedyMix()` (#24) Fixed calculation og `ninds` (#24) Bugfixes to `indMix()` and subfunctions (#24) Improved handling of BAPS format (#24) Fixed calculation of `ninds` `ekat` (#24) Added processing of BAPS format
ocbe-uio · Apr 8, 2024 · 9b8da3a · 9b8da3a
2 parents d2ec589 + 6d875df
commit 9b8da3a
Show file tree

Hide file tree

Showing 14 changed files with 134 additions and 57 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: rBAPS
 Title: Bayesian Analysis of Population Structure
-Version: 0.0.0.9024
+Version: 0.0.0.9025
 Date: 2020-11-09
 Authors@R:
     c(
@@ -36,7 +36,7 @@ Description: Partial R implementation of the BAPS software
 License: GPL-3
 BugReports: https://github.com/ocbe-uio/rBAPS/issues
 Encoding: UTF-8
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Suggests:
     testthat (>= 2.1.0)
 Imports:

diff --git a/NAMESPACE b/NAMESPACE
@@ -38,5 +38,6 @@ importFrom(methods,is)
 importFrom(stats,runif)
 importFrom(stats,sd)
 importFrom(utils,read.delim)
+importFrom(utils,read.table)
 importFrom(vcfR,read.vcfR)
 importFrom(zeallot,"%<-%")
diff --git a/R/greedyMix.R b/R/greedyMix.R
@@ -2,7 +2,6 @@
 #' @param data data file
 #' @param format Data format. Format supported: "FASTA", "VCF" ,"BAM", "GenePop"
 #' @param partitionCompare a list of partitions to compare
-#' @param ninds number of individuals
 #' @param npops number of populations
 #' @param counts counts
 #' @param sumcounts sumcounts
@@ -20,36 +19,56 @@
 #' with high-throughput sequencing data. <http://www.htslib.org/>
 #' @export
 #' @examples
-#' data <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS")
-#' greedyMix(data, "fasta")
+#' \dontrun{ # TEMP: unwrap once #24 is resolved
+#' data <- system.file("extdata", "BAPS_format_clustering_diploid.txt", package = "rBAPS")
+#' greedyMix(data, "baps")
+#' } # TEMP: unwrap once #24 is resolved
 greedyMix <- function(
-  data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, ninds = 1L, npops = 1L,
+  data, format = gsub("^.*\\.", "", data), partitionCompare = NULL, npops = 1L,
   counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL,
   inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE
 ) {
   # Importing and handling data ================================================
-  data <- importFile(data, format, verbose)
-  data <- handleData(data, tolower(format))
-  c <- list(
-    noalle = data[["noalle"]],
-    data = data[["newData"]],
-    adjprior = data[["adjprior"]],
-    priorTerm = data[["priorTerm"]],
-    rowsFromInd = data[["rowsFromInd"]]
-  )
+  if (tolower(format) %in% "fasta") {
+    stop("FASTA format not yet supported on greedyMix")
+  }
+  if (tolower(format) %in% "baps") {
+    data <- process_BAPS_data(data, NULL)
+    c <- list(
+      noalle = data[["noalle"]],
+      data = data[["data"]],
+      adjprior = data[["adjprior"]],
+      priorTerm = data[["priorTerm"]],
+      rowsFromInd = data[["rowsFromInd"]],
+      Z = data[["Z"]],
+      dist = data[["dist"]]
+    )
+  } else {
+    data <- importFile(data, format, verbose)
+    data <- handleData(data, tolower(format))
+    c <- list(
+      noalle = data[["noalle"]],
+      data = data[["newData"]],
+      adjprior = data[["adjprior"]],
+      priorTerm = data[["priorTerm"]],
+      rowsFromInd = data[["rowsFromInd"]],
+      Z = data[["Z"]],
+      dist = data[["dist"]]
+    )
+  }
 
   # Comparing partitions =======================================================
+  ninds <- length(unique(c[["data"]][, ncol(c[["data"]])]))
   if (!is.null(partitionCompare)) {
     logmls <- comparePartitions(
       c[["data"]], nrow(c[["data"]]), partitionCompare[["partitions"]], ninds,
       c[["rowsFromInd"]], c[["noalle"]], c[["adjprior"]]
     )
   }
 
-
   # Generating partition summary ===============================================
-  ekat <- seq(1L, c[["rowsFromInd"]], ninds * c[["rowsFromInd"]]) # ekat = (1:rowsFromInd:ninds*rowsFromInd)';
-  c[["rows"]] <- c(ekat, ekat + c[["rowsFromInd"]] - 1L) # c.rows = [ekat ekat+rowsFromInd-1]
+  ekat <- seq(1L, ninds * c[["rowsFromInd"]], c[["rowsFromInd"]])
+  c[["rows"]] <- cbind(ekat, ekat + c[["rowsFromInd"]] - 1L)
   logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose)
   logml <- logml_npops_partitionSummary[["logml"]]
   npops <- logml_npops_partitionSummary[["npops"]]

diff --git a/R/handleData.R b/R/handleData.R
@@ -30,25 +30,25 @@ handleData <- function(raw_data, format = "Genepop") {
     "bam"     = stop("BAM format not supported for processing yet")
   )
   data <- as.matrix(raw_data)
-  dataApu <- data[, 1:nloci]
+  dataApu <- data[, seq_len(nloci)]
   nollat <- matlab2r::find(dataApu == 0)
   if (!isempty(nollat)) {
     isoinAlleeli <- base::max(base::max(dataApu))
     dataApu[nollat] <- isoinAlleeli + 1
-    data[, 1:nloci] <- dataApu
+    data[, seq_len(nloci)] <- dataApu
   }
 
   noalle <- zeros(1, nloci)
   alleelitLokuksessa <- cell(nloci, 1, expandable = TRUE)
-  for (i in 1:nloci) {
+  for (i in seq_len(nloci)) {
     alleelitLokuksessaI <- unique(data[, i])
     alleelitLokuksessa[[i]] <- sort(alleelitLokuksessaI[
       matlab2r::find(alleelitLokuksessaI >= 0)
     ])
     noalle[i] <- length(alleelitLokuksessa[[i]])
   }
   alleleCodes <- zeros(base::max(noalle), nloci)
-  for (i in 1:nloci) {
+  for (i in seq_len(nloci)) {
     alleelitLokuksessaI <- alleelitLokuksessa[[i]]
     puuttuvia <- base::max(noalle) - length(alleelitLokuksessaI)
     alleleCodes[, i] <- as.matrix(c(alleelitLokuksessaI, zeros(puuttuvia, 1)))
@@ -83,7 +83,7 @@ handleData <- function(raw_data, format = "Genepop") {
 
   adjprior <- zeros(base::max(noalle), nloci)
   priorTerm <- 0
-  for (j in 1:nloci) {
+  for (j in seq_len(nloci)) {
     adjprior[, j] <- as.matrix(c(
       repmat(1 / noalle[j], c(noalle[j], 1)),
       ones(base::max(noalle) - noalle[j], 1)

diff --git a/R/indMix.R b/R/indMix.R
@@ -21,11 +21,9 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
     Z <- NULL
   }
 
-
-  rm(c)
-  nargin <- length(as.list(match.call())) - 1
-  if (nargin < 2) {
-    dispText <- 1
+  if (missing(npops)) {
+    # Recreate npopsTaulu from objects other than npops
+    dispText <- TRUE
     npopstext <- matrix()
     ready <- FALSE
     teksti <- "Input upper bound to the number of populations (possibly multiple values)"
@@ -75,7 +73,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
   partitionSummary[, 1] <- zeros(30, 1)
   worstLogml <- -1e50
   worstIndex <- 1
-  for (run in seq_along(nruns)) {
+  for (run in seq_len(nruns)) {
     npops <- npopsTaulu[[run]]
     if (dispText) {
       dispLine()
@@ -110,7 +108,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
     # PARHAAN MIXTURE-PARTITION ETSIMINEN
     nRoundTypes <- 7
     kokeiltu <- zeros(nRoundTypes, 1)
-    roundTypes <- c(1, 1) # Ykk�svaiheen sykli kahteen kertaan.
+    roundTypes <- t(c(1, 1)) # Ykk�svaiheen sykli kahteen kertaan.
     ready <- 0
     vaihe <- 1
 
@@ -143,9 +141,9 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
         if (kokeiltu[round] == 1) { # Askelta kokeiltu viime muutoksen j�lkeen
         } else if (round == 0 | round == 1) { # Yksil�n siirt�minen toiseen populaatioon.
           inds <- seq_len(ninds)
-          aputaulu <- cbind(t(inds), rand(ninds, 1))
+          aputaulu <- cbind(inds, rand(ninds, 1))
           aputaulu <- matrix(sortrows(aputaulu, 2), nrow = nrow(aputaulu))
-          inds <- t(aputaulu[, 1])
+          inds <- aputaulu[, 1]
           muutosNyt <- 0
 
           for (ind in inds) {
@@ -324,7 +322,7 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
             if (round == 5) {
               aputaulu <- c(inds, rand(length(inds), 1))
               aputaulu <- sortrows(aputaulu, 2)
-              inds <- t(aputaulu[, 1])
+              inds <- aputaulu[, 1]
             } else if (round == 6) {
               inds <- returnInOrder(
                 inds, pop, rows, data, adjprior, priorTerm
@@ -525,15 +523,15 @@ indMix <- function(c, npops, counts = NULL, sumcounts = NULL, max_iter = 100L, d
 
       if (ready == 0) {
         if (vaihe == 1) {
-          roundTypes <- 1
+          roundTypes <- t(1)
         } else if (vaihe == 2) {
-          roundTypes <- c(2, 1)
+          roundTypes <- t(c(2, 1))
         } else if (vaihe == 3) {
-          roundTypes <- c(5, 5, 7)
+          roundTypes <- t(c(5, 5, 7))
         } else if (vaihe == 4) {
-          roundTypes <- c(4, 3, 1)
+          roundTypes <- t(c(4, 3, 1))
         } else if (vaihe == 5) {
-          roundTypes <- c(6, 7, 2, 3, 4, 1)
+          roundTypes <- t(c(6, 7, 2, 3, 4, 1))
         }
       }
     }

diff --git a/R/initialCounts.R b/R/initialCounts.R
@@ -7,8 +7,8 @@ initialCounts <- function(partition, data, npops, rows, noalle, adjprior) {
 
   counts <- zeros(base::max(noalle), nloci, npops)
   sumcounts <- zeros(npops, nloci)
-  for (i in 1:npops) {
-    for (j in 1:nloci) {
+  for (i in seq_len(npops)) {
+    for (j in seq_len(nloci)) {
       havainnotLokuksessa <- matlab2r::find(partition == i & data[, j] >= 0)
       sumcounts[i, j] <- length(havainnotLokuksessa)
       for (k in 1:noalle[j]) {

diff --git a/R/linkage.R b/R/linkage.R
@@ -26,11 +26,11 @@ linkage <- function(Y, method = "co") {
   monotonic <- 1
   Z <- zeros(m - 1, 3) # allocate the output matrix.
   N <- zeros(1, 2 * m - 1)
-  N[1:m] <- 1
+  N[seq_len(m)] <- 1
   n <- m # since m is changing, we need to save m in n.
-  R <- 1:n
+  R <- seq_len(n)
   for (s in 1:(n - 1)) {
-    X <- as.matrix(as.vector(Y), ncol = 1)
+    X <- as.matrix(as.vector(Y), nrow = 1)
     v <- matlab2r::min(X)$mins
     k <- matlab2r::min(X)$idx
 
@@ -83,11 +83,14 @@ linkage <- function(Y, method = "co") {
     )
     J <- c(J, i * (m - (i + 1) / 2) - m + j)
     Y <- Y[-J] # no need for the cluster information about j
+
     # update m, N, R
     m <- m - 1
     N[n + s] <- N[R[i]] + N[R[j]]
     R[i] <- n + s
-    R[j:(n - 1)] <- R[(j + 1):n]
+    if (j < n) {
+      R[j:(n - 1)] <- R[(j + 1):n]
+    }
   }
   return(Z)
 }
diff --git a/R/logml2String.R b/R/logml2String.R
@@ -1,7 +1,6 @@
 #' @title Logml to string
 #' @description Returns a string representation of a logml
 #' @param logml input Logml
-#' @param
 #' @param leading_zeros_replacement string to replace leading zeros with
 #' @return String version of logml
 logml2String <- function(logml, leading_zeros_replacement = " ") {

diff --git a/R/newGetDistances.R b/R/newGetDistances.R
@@ -35,7 +35,7 @@ newGetDistances <- function(data, rowsFromInd) {
   y <- zeros(size(toka))
   y <- apply(y, 2, as.integer)
 
-  for (j in 1:nloci) {
+  for (j in seq_len(nloci)) {
     for (k in 1:rowsFromInd) {
       x[, k] <- data[eka[, k], j]
       y[, k] <- data[toka[, k], j]

diff --git a/R/process_BAPS_data.R b/R/process_BAPS_data.R
@@ -0,0 +1,37 @@
+process_BAPS_data <- function(file, partitionCompare) {
+  if (!is.null(partitionCompare)) {
+    cat('Data:', file, '\n')
+  }
+  data <- read.table(file)
+  ninds <- testaaOnkoKunnollinenBapsData(data)  # for testing purposes?
+  if (ninds == 0) {
+    warning('Incorrect Data-file.')
+    return(NULL)
+  }
+  popnames <- NULL  # Dropped specification of population names (from BAPS 6)
+
+  result <- handleData(data, format = "BAPS")
+  data <- result$newData
+  rowsFromInd <- result$rowsFromInd
+  alleleCodes <- result$alleleCodes
+  noalle <- result$noalle
+  adjprior <- result$adjprior
+  priorTerm <- result$priorTerm
+  result <- newGetDistances(data, rowsFromInd)
+  Z <- result$Z
+  dist <- result$dist
+
+  # Forming and saving pre-processed data
+  processed_data <- list(
+    data = data,
+    rowsFromInd = rowsFromInd,
+    alleleCodes = alleleCodes,
+    noalle = noalle,
+    adjprior = adjprior,
+    priorTerm = priorTerm,
+    dist = dist,
+    popnames = popnames,
+    Z = Z
+  )
+  return(processed_data)
+}
diff --git a/R/rBAPS-package.R b/R/rBAPS-package.R
@@ -1,6 +1,5 @@
 #' @title Bayesian Analysis of Population Structure
 #' @description This is a partial implementation of the BAPS software
-#' @docType package
 #' @name rBAPS
 #' @note Found a bug? Want to suggest a feature? Contribute to the scientific
 #' and open source communities by opening an issue on our home page.
@@ -11,4 +10,5 @@
 #' @importFrom stats runif
 #' @importFrom zeallot %<-%
 #' @importFrom matlab2r nargin log2
-NULL
+#' @importFrom utils read.table
+"_PACKAGE"
diff --git a/man/greedyMix.Rd b/man/greedyMix.Rd
diff --git a/man/rBAPS.Rd b/man/rBAPS.Rd