Merge branch 'issue-25' into develop

* issue-25: (38 commits) Adjusted unit tests for #25 Fixed sink() usage Fixed docs Exporting importFile() Improved handling of supported formats for greedyMix() (#25) Fixed basic parsing of FASTA files (#25) Increment version number to 0.0.0.9022 Fixed syntax (#25) Improved printing (#25) Partial reversion of b034158 (#25) Fixed to indMix (#25) Incorporated handleData() on greedyMix() (#25) Improved handleData() to handle FASTA (#25) Added numeric output option to load_fasta() (#25) Fixed test text (#25) Added missing documentation for arguments (#25) Syntax fix (#25) Delayed resolution of FIXMEs (#25) Workaround for usage of MATLAB any() (#25) Fixed argument passing (#25) ...
ocbe-uio · Sep 11, 2023 · 59fbb0a · 59fbb0a
2 parents 449982a + ce954bf
commit 59fbb0a
Show file tree

Hide file tree

Showing 95 changed files with 45,168 additions and 45,030 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -4,7 +4,6 @@ PITFALLS.md
 CHANGELOG.md
 CITATION.cff
 .travis.yml
-inst/ext/ExamplesDataFormatting
+inst/testdata
 .github
 aux
-
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: rBAPS
 Title: Bayesian Analysis of Population Structure
-Version: 0.0.0.9020
+Version: 0.0.0.9022
 Date: 2020-11-09
 Authors@R:
     c(

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,45 +1,8 @@
 # Generated by roxygen2: do not edit by hand
 
-export(addAlleles)
-export(admix1)
-export(calculatePopLogml)
-export(computeAllFreqs2)
-export(computeIndLogml)
-export(computePersonalAllFreqs)
-export(computeRows)
-export(etsiParas)
-export(fgetl)
-export(fopen)
 export(greedyMix)
-export(greedyPopMix)
 export(handleData)
-export(handlePopData)
-export(initPopNames)
-export(learn_partition_modified)
-export(learn_simple_partition)
-export(linkage)
-export(load_fasta)
-export(logml2String)
-export(lueGenePopData)
-export(lueGenePopDataPop)
-export(lueNimi)
-export(noIndex)
-export(ownNum2Str)
-export(poistaLiianPienet)
-export(proportion2str)
-export(randdir)
-export(rivinSisaltamienMjonojenLkm)
-export(selvitaDigitFormat)
-export(simulateAllFreqs)
-export(simulateIndividuals)
-export(simuloiAlleeli)
-export(suoritaMuutos)
-export(takeLine)
-export(testaaKoordinaatit)
-export(testaaOnkoKunnollinenBapsData)
-export(testaaPop)
-export(writeMixtureInfo)
-export(writeMixtureInfoPop)
+export(importFile)
 importFrom(R6,R6Class)
 importFrom(Rsamtools,scanBam)
 importFrom(adegenet,.readExt)

diff --git a/NEWS.md b/NEWS.md
@@ -0,0 +1,6 @@
+# rBAPS (development version)
+
+# rBAPS 0.0.0.9021
+
+* Added a `NEWS.md` file to track changes to the package.
+* Exported `greedyMix()` and `load_fasta()` functions.
diff --git a/R/addAlleles.R b/R/addAlleles.R
@@ -4,7 +4,6 @@
 #' @param line line
 #' @param divider divider
 #' @return data (after alleles were added)
-#' @export
 addAlleles <- function(data, ind, line, divider) {
   # Lisaa BAPS-formaatissa olevaan datataulukkoon
   # yksil�� ind vastaavat rivit. Yksil�n alleelit

diff --git a/R/admix1.R b/R/admix1.R
@@ -6,7 +6,6 @@
 #' alleleCodes, adjprior, popnames, rowsFromInd, data, npops, noalle
 #' @param tietue tietue
 #' @importFrom methods is
-#' @export
 admix1 <- function(tietue) {
   if (!is.list(tietue)) {
     message("Load mixture result file. These are the files in this directory:")

diff --git a/R/calculatePopLogml.R b/R/calculatePopLogml.R
@@ -4,7 +4,6 @@
 #' for the mean parameter.
 #' @param points points
 #' @param fii fii
-#' @export
 calculatePopLogml <- function(points, fii) {
   n <- length(points)
   fuzzy_ones <- sum(points)

diff --git a/R/comparePartitions.R b/R/comparePartitions.R
@@ -0,0 +1,23 @@
+comparePartitions <- function(data, c.rows, partitionCompare.partitions, ninds, rowsFromInd, noalle, adjprior) {
+  stop("Comparing partitions not yet implemented") # TODO: implement
+  # nsamplingunits = size(c.rows,1);
+  # partitions = partitionCompare.partitions;
+  # npartitions = size(partitions,2);
+  # partitionLogml = zeros(1,npartitions);
+  # for i = 1:npartitions
+  #     % number of unique partition lables
+  #     npops = length(unique(partitions(:,i)));
+
+  #     partitionInd = zeros(ninds*rowsFromInd,1);
+  #     partitionSample = partitions(:,i);
+  #     for j = 1:nsamplingunits
+  #         partitionInd([c.rows(j,1):c.rows(j,2)]) = partitionSample(j);
+  #     end
+  #     partitionLogml(i) = initialCounts(partitionInd, data(:,1:end-1), npops, c.rows, noalle, adjprior);
+
+  # end
+  # % return the logml result
+  # partitionCompare.logmls = partitionLogml;
+  # set(h1, 'userdata', partitionCompare);
+  # return
+}
diff --git a/R/computeAllFreqs2.R b/R/computeAllFreqs2.R
@@ -2,7 +2,6 @@
 #' @description Lisää a priori jokaista alleelia joka populaation joka lokukseen
 #' j 1/noalle(j) verran.
 #' @param noalle noalle
-#' @export
 computeAllFreqs2 <- function(noalle) {
   COUNTS <- ifelse(isGlobalEmpty(COUNTS), vector(), COUNTS)
   SUMCOUNTS <- ifelse(isGlobalEmpty(SUMCOUNTS), vector(), COUNTS)

diff --git a/R/computeIndLogml.R b/R/computeIndLogml.R
@@ -3,7 +3,6 @@
 #' määritellyiksi kuten osuusTaulu:ssa.
 #' @param omaFreqs own Freqs?
 #' @param osuusTaulu Percentage table?
-#' @export
 computeIndLogml <- function(omaFreqs, osuusTaulu) {
   omaFreqs <- as.matrix(omaFreqs)
   osuusTaulu <- as.matrix(osuusTaulu)

diff --git a/R/computePersonalAllFreqs.R b/R/computePersonalAllFreqs.R
@@ -7,8 +7,6 @@
 #' @param data data
 #' @param allFreqs allFreqs
 #' @param rowsFromInd rowsFromInd
-#' @export
-
 computePersonalAllFreqs <- function(ind, data, allFreqs, rowsFromInd) {
   if (isGlobalEmpty(COUNTS)) {
     nloci <- npops <- 1

diff --git a/R/computePopulationLogml.R b/R/computePopulationLogml.R
@@ -4,9 +4,13 @@ computePopulationLogml <- function(pops, adjprior, priorTerm) {
   # ======================================================== #
   # Limiting COUNTS size                                     #
   # ======================================================== #
-  COUNTS <- COUNTS[
-    seq_len(nrow(adjprior)), seq_len(ncol(adjprior)), pops, drop = FALSE
-  ]
+  if (!is.null(adjprior)) {
+    nr <- seq_len(nrow(adjprior))
+    nc <- seq_len(ncol(adjprior))
+    COUNTS <- COUNTS[nr, nc, pops, drop = FALSE]
+  } else {
+    COUNTS <- NA
+  }
 
   x <- size(COUNTS, 1)
   y <- size(COUNTS, 2)
@@ -15,25 +19,24 @@ computePopulationLogml <- function(pops, adjprior, priorTerm) {
   # ======================================================== #
   # Computation                                              #
   # ======================================================== #
-  isarray <- length(dim(repmat(adjprior, c(1, 1, length(pops))))) > 2
-  term1 <- squeeze(
-    sum(
+  term1 <- NULL
+  if (!is.null(adjprior)) {
+    isarray <- length(dim(repmat(adjprior, c(1, 1, length(pops))))) > 2
+    term1 <- squeeze(
       sum(
-        reshape(
-          lgamma(
-            repmat(adjprior, c(1, 1, length(pops))) +
-              COUNTS[
-                seq_len(nrow(adjprior)), seq_len(ncol(adjprior)), pops,
-                drop = !isarray
-              ]
+        sum(
+          reshape(
+            lgamma(
+              repmat(adjprior, c(1, 1, length(pops))) + COUNTS[nr, nc, pops, drop = !isarray]
+            ),
+            c(x, y, z)
           ),
-          c(x, y, z)
+          1
         ),
-        1
-      ),
-      2
+        2
+      )
     )
-  )
+  }
   if (is.null(priorTerm)) priorTerm <- 0
   popLogml <- term1 - sum(lgamma(1 + SUMCOUNTS[pops, ]), 2) - priorTerm
   return(popLogml)

diff --git a/R/computeRows.R b/R/computeRows.R
@@ -4,7 +4,6 @@
 #' @param rowsFromInd rowsFromInd
 #' @param inds matrix
 #' @param ninds ninds
-#' @export
 computeRows <- function(rowsFromInd, inds, ninds) {
   if (!is(inds, "matrix")) inds <- as.matrix(inds)
   if (identical(dim(inds), c(nrow(inds), 1L))) {

diff --git a/R/etsiParas.R b/R/etsiParas.R
@@ -1,4 +1,3 @@
-#' @export
 #' @title Etsi Paras
 #' @description Search for the best?
 #' @param osuus Percentages?

diff --git a/R/fgetl-fopen.R b/R/fgetl-fopen.R
@@ -6,7 +6,6 @@
 #' fgetl returns tline as a numeric value -1.
 #' @author Waldir Leoncio
 #' @seealso fopen
-#' @export
 fgetl <- function(file) {
   # ==========================================================================
   # Validation
@@ -27,5 +26,4 @@ fgetl <- function(file) {
 #' @return The same as `readLines(filename)`
 #' @author Waldir Leoncio
 #' @seealso fgetl
-#' @export
 fopen <- function(filename) readLines(filename)
diff --git a/R/greedyMix.R b/R/greedyMix.R
@@ -1,6 +1,16 @@
 #' @title Clustering of individuals
 #' @param data data file
 #' @param format Data format. Format supported: "FASTA", "VCF" ,"BAM", "GenePop"
+#' @param partitionCompare a list of partitions to compare
+#' @param ninds number of individuals
+#' @param npops number of populations
+#' @param counts counts
+#' @param sumcounts sumcounts
+#' @param max_iter maximum number of iterations
+#' @param alleleCodes allele codes
+#' @param inp input file
+#' @param popnames population names
+#' @param fixedK if \code{TRUE}, the number of populations is fixed
 #' @param verbose if \code{TRUE}, prints extra output information
 #' @importFrom utils read.delim
 #' @importFrom vcfR read.vcfR
@@ -9,41 +19,59 @@
 #' @references Samtools: a suite of programs for interacting
 #' with high-throughput sequencing data. <http://www.htslib.org/>
 #' @export
-greedyMix <- function(data, format, verbose = TRUE) {
-  # Parsing data format ------------------------------------------------------
+#' @examples
+#' data <- system.file("extdata", "FASTA_clustering_haploid.fasta", package = "rBAPS")
+#' greedyMix(data, "fasta")
+greedyMix <- function(
+  data, format, partitionCompare = NULL, ninds = 1L, npops = 1L,
+  counts = NULL, sumcounts = NULL, max_iter = 100L, alleleCodes = NULL,
+  inp = NULL, popnames = NULL, fixedK = FALSE, verbose = FALSE
+) {
+  # Importing and handling data ================================================
+  data <- importFile(data, format, verbose)
+  data <- handleData(data, tolower(format))
+  c <- list(
+    noalle = data[["noalle"]],
+    data = data[["newData"]],
+    adjprior = data[["adjprior"]],
+    priorTerm = data[["priorTerm"]],
+    rowsFromInd = data[["rowsFromInd"]]
+  )
 
-  if (missing(format)) {
-    format <- gsub(".*\\.(.+)$", "\\1", data)
-    message("Format not provided. Guessing from file extension: ", format)
+  # Comparing partitions =======================================================
+  if (!is.null(partitionCompare)) {
+    logmls <- comparePartitions(
+      c[["data"]], nrow(c[["data"]]), partitionCompare[["partitions"]], ninds,
+      c[["rowsFromInd"]], c[["noalle"]], c[["adjprior"]]
+    )
   }
-  format <- tolower(format)
 
-  # Dispatching to proper loading function -----------------------------------
 
-  if (format == "fasta") {
-    out <- load_fasta(data)
-  } else if (format == "vcf") {
-    out <- vcfR::read.vcfR(data, verbose = verbose)
-  } else if (format == "sam") {
-    stop(
-      "SAM files not directly supported. ",
-      "Install the samtools software and execute\n\n",
-      "samtools view -b ", data, " > out_file.bam\n\nto convert to BAM ",
-      "and try running this function again with 'format=BAM'"
-    )
-  } else if (format == "bam") {
-    out <- Rsamtools::scanBam(data)
-  } else if (format == "genepop") {
-    if (toupper(adegenet::.readExt(data)) == "TXT") {
-      message("Creating a copy of the file with the .gen extension")
-      dataGen <- gsub("txt", "gen", data)
-      file.copy(data, dataGen)
-      out <- adegenet::read.genepop(dataGen)
-    } else {
-      out <- adegenet::read.genepop(data)
-    }
-  } else {
-    stop("Format not supported.")
+  # Generating partition summary ===============================================
+  ekat <- seq(1L, c[["rowsFromInd"]], ninds * c[["rowsFromInd"]]) # ekat = (1:rowsFromInd:ninds*rowsFromInd)';
+  c[["rows"]] <- c(ekat, ekat + c[["rowsFromInd"]] - 1L) # c.rows = [ekat ekat+rowsFromInd-1]
+  logml_npops_partitionSummary <- indMixWrapper(c, npops, counts, sumcounts, max_iter, fixedK, verbose)
+  logml <- logml_npops_partitionSummary[["logml"]]
+  npops <- logml_npops_partitionSummary[["npops"]]
+  partitionSummary <- logml_npops_partitionSummary[["partitionSummary"]]
+
+  # Generating output object ===================================================
+  out <- list(
+    "alleleCodes" = alleleCodes, "adjprior" = c[["adjprior"]],
+    "popnames" = popnames, "rowsFromInd" = c[["rowsFromInd"]],
+    "data" = c[["data"]], "npops" = npops, "noalle" = c[["noalle"]],
+    "mixtureType" = "mix", "logml" = logml
+  )
+  if (logml == 1) {
+    return(out)
   }
-  return(out)
+
+  # Writing mixture info =======================================================
+  changesInLogml <- writeMixtureInfo(
+    logml, c[["rowsFromInd"]], c[["data"]], c[["adjprior"]], c[["priorTerm"]],
+    NULL, inp, partitionSummary, popnames, fixedK
+  )
+
+  # Updateing results ==========================================================
+  return(c(out, "changesInLogml" = changesInLogml))
 }
diff --git a/R/greedyPopMix.R b/R/greedyPopMix.R
@@ -11,7 +11,6 @@
 #' @importFrom matlab2r uiputfile
 #' @references Samtools: a suite of programs for interacting
 #' with high-throughput sequencing data. <http://www.htslib.org/>
-#' @export
 greedyPopMix <- function(data, format, partitionCompare = NULL, verbose = TRUE
 ) {
   # Replacing original file reading code with greedyMix()