From 605194722f3e79854d89df971c3710376106070e Mon Sep 17 00:00:00 2001 From: Alexey Sergushichev Date: Mon, 9 Oct 2023 14:17:07 -0500 Subject: [PATCH 1/2] fixes for generating meta/index files --- R/updateAndCreateMetaLocal.R | 684 +++++++++++---------- R/updateAndCreateMetaRemote.R | 114 ++-- tests/testthat/test-loadCountsFromH5file.R | 28 +- 3 files changed, 419 insertions(+), 407 deletions(-) diff --git a/R/updateAndCreateMetaLocal.R b/R/updateAndCreateMetaLocal.R index 13d4c45..caffe7a 100644 --- a/R/updateAndCreateMetaLocal.R +++ b/R/updateAndCreateMetaLocal.R @@ -1,340 +1,344 @@ -#' Creates metafiles for HDF5-files -#' @param data, contains metadata -#' @param file, contains file name -#' @param dataset_name, contains dataset name -#' @return NULL -#' @import rhdf5 -createH5 <- function(data, file, dataset_name) { - stopifnot(requireNamespace("rhdf5")) - if (file.exists(file)) { - stop(sprintf("File %s already exist", file)) - # unlink(file, recursive = FALSE) - } - h5createFile(file) - h5write(data, file, dataset_name) - H5close() - return(invisible(NULL)) -} - -#' Converts collection meta.txt files to meta.h5, putting them to the respective -#' collection folders -#' @param counts_dir, contains directory name -#' @return Returns NULL -createMetaH5 <- function(counts_dir){ - stopifnot(requireNamespace("rhdf5")) - collections <- list.dirs(counts_dir, full.names = FALSE) - collections <- collections[-1] - for (collection in collections) { - destdir <- paste0(counts_dir, '/', collection) - meta <- data.table() - filename <- paste0("meta.txt") - h5_meta <- fread(file.path(destdir, filename), index = "file_name") - h5filename <- file.path(destdir, 'meta.h5') - createH5(h5_meta, h5filename, 'meta') - return(invisible(NULL)) - } -} - -#' Creates HDF5-File with priority -#' @param counts_dir, contains counts directory -#' @param force logical value which lets function replace existing priority file -#' @param verbose logical value which determines a content of the output. -#' @return Returns NULL -createPriorityH5 <- function(counts_dir, force = FALSE, verbose = FALSE){ - stopifnot(requireNamespace("rhdf5")) - if (!dir.exists(counts_dir)) { - message(paste0('Counts directory ', counts_dir, " does not extist" )) - return() - } - h5_files <- list.files(file.path(counts_dir), "\\.h5$", full.names = TRUE, recursive = TRUE) - list_dirs <- list.dirs(counts_dir, full.names = FALSE, recursive = TRUE) - list_dirs <- c(".", list_dirs) - priority_file <- file.path(counts_dir, "counts_priority.txt") - need_create <- TRUE - if (file.exists(priority_file)) { - priority <- fread(priority_file) - if (!(setequal(priority$directory,list_dirs) && length(unique(priority$priority)) == length(priority$priority))) { - message(paste0("!!! Priority file ", priority_file , " is invalid and will be replaced")) - } else { - need_create <- FALSE - } - } - if (need_create) { - priority <- data.table(directory = list_dirs, priority = seq_along(list_dirs)) - write.table(x = priority, file = priority_file, sep = "\t", eol = "\n", row.names = FALSE, col.names = TRUE, quote = FALSE) - } - createH5(priority, 'priority.h5', 'priority') - return(invisible(NULL)) -} - -#' Updates indexes from HDF5-files -#' @param counts_dir, contains counts directory -#' @param force logical value which lets function replace existing index file -#' @param verbose logical value which determines a content of the output. 
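For context on the tables these helpers shuffle around: each collection folder keeps a tab-separated meta.txt that createMetaH5() reads with fread() and stores as the 'meta' dataset of a meta.h5 via createH5(). A minimal sketch of such a table, assuming ARCHS4-style datasets; the file name and the values are illustrative, not taken from this patch:

    library(data.table)
    meta <- data.table(
        file_name    = "human_gene_v2.2.h5",           # illustrative ARCHS4 matrix name
        sample_id    = "/meta/samples/geo_accession",  # HDF5 dataset holding GSM accessions
        gene_id      = "/meta/genes/genes",            # HDF5 dataset holding gene identifiers
        gene_id_type = "Gene Symbol"
    )
    fwrite(meta, "meta.txt", sep = "\t")
    # createH5(meta, "meta.h5", "meta") then persists it next to the collection's .h5 files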
-#' @return Returns NULL -updateIndexH5 <- function(counts_dir, force = FALSE, verbose = FALSE){ - stopifnot(requireNamespace("rhdf5")) - if (!dir.exists(counts_dir)) { - message(paste0('Counts directory ', counts_dir, " does not extist" )) - return() - } - meta_name <- file.path(counts_dir, "meta.rda") - h5_files <- list.files(file.path(counts_dir), "\\.h5$", full.names = TRUE, recursive = TRUE) - if (!length(h5_files)) { - return() - } - if (!force) { - meta_time <- as.numeric(file.mtime(meta_name)) - h5_mtime <- max(unlist(lapply(h5_files, file.mtime))) - dirs_mtime <- lapply(file.path(counts_dir, list_dirs[-1]), file.mtime) - if (length(dirs_mtime) > 0) { - dir_mtime <- max(unlist(dirs_mtime)) - } else { - dir_mtime <- -Inf - } - - if (file.exists(meta_name) && meta_time > h5_mtime && meta_time > dir_mtime) { - return() - } - } - if (file.exists(meta_name)) { - unlink(meta_name) - } - DT_counts_meta <- data.table(matrix(ncol = 3, nrow = 0, dimnames = list( NULL, c("accession", "file", "collection_type")))) - for (cur_dir in list_dirs) { - dir_path <- file.path(counts_dir, cur_dir) - dir_path <- sub(pattern = "/./?$", replacement = "", x = dir_path) - cur_files <- list.files(path = dir_path, pattern = "\\.h5", recursive = FALSE ) - if (length(cur_files) == 0) { - next - } - if (!file.exists(file.path(dir_path, "meta.txt"))) { - if (startsWith(x = tolower(basename(dir_path)), prefix = "archs4")) { - updateARCHS4meta(archDir = dir_path) - } else if (startsWith(x = tolower(basename(dir_path)), prefix = "dee2")) { - updateDEE2meta(destDir = dir_path) - } - } - message(paste0('Populating ', cur_dir , ' counts meta' )) - if (!validateCountsCollection(collectionDir = dir_path, verbose = verbose)) { - message(paste0("!! files in ", cur_dir , " are ignored because there is not correct meta file in this directory.")) - next - } - DT_part <- getCountsMetaPart(counts_dir = counts_dir, collection_name = cur_dir, verbose = verbose) - if (length(DT_part)) { - DT_counts_meta <- rbindlist(l = list(DT_counts_meta, DT_part)) - } - rm(DT_part) - - } - DT_counts_meta$chunk <- gsmToChunk(DT_counts_meta$accession) - DT_counts_meta_split <- split(DT_counts_meta, DT_counts_meta$chunk) - createIndexH5(DT_counts_meta_split, 'index.h5') - save(DT_counts_meta, file = meta_name, eval.promises = TRUE) - rm(DT_counts_meta) - return(invisible(NULL)) -} - - -#' Gets list with metadata -#' @param counts_dir, contains counts directory -#' @param collection_name contains name of the collection -#' @param verbose logical value which determines a content of the output. -#' @return list with metadata -getCountsMetaPart <- function(counts_dir, collection_name, verbose){ - destdir <- file.path(counts_dir, collection_name) - if (!dir.exists(destdir)) { - return() - } - DT_h5_meta <- data.table() - h5_files <- list.files(destdir, "\\.h5$", full.names = FALSE) - if (!length(h5_files)) { - return() - } - h5_meta <- fread(file.path(destdir, "meta.txt"), index = "file_name") - for (input_file in h5_files) { - if (input_file %in% h5_meta$file_name) { - full_name <- file.path(destdir, input_file) - relative_path <- file.path(collection_name, input_file) - h5f <- H5Fopen(full_name, flags = "H5F_ACC_RDONLY") - accession <- h5read(h5f, h5_meta[file_name == input_file, ]$sample_id) - h5_part <- data.table(accession = accession, - file = relative_path, - collection_type = collection_name, indexes = seq_along(accession)) - H5Fclose(h5f) - DT_h5_meta <- rbindlist(l = list(DT_h5_meta, h5_part)) - } else { - if (verbose) { - message(paste0("!! 
", file.path(destdir, input_file), " is ignored")) - } - } - - } - return(DT_h5_meta) -} - -#' Writes indexes to the file -#' @param data, contains metadata -#' @param file contains the file name -#' @return Returns NULL -createIndexH5 <- function(data, file) { - stopifnot(requireNamespace("rhdf5")) - h5createFile(file) - names <- names(data) - for (i in seq_along(names)) { - h5write(data[[i]], file, paste0("/",names[i])) - } - h5closeAll() - return(invisible(NULL)) -} - -#' Creates \code{meta.txt} file, which describes typical dee2 files. -#' @param destDir path to directory with DEE2 .h5 files. -#' @import data.table -#' @return Returns NULL -updateDEE2meta <- function(destDir = file.path(getOption("phantasusCacheDir"), "counts/dee2")){ - dee2files <- list.files(destDir, pattern = '\\.h5$') - DT_meta <- data.frame(matrix(ncol = 4, nrow = length(dee2files), dimnames = list(NULL, c("file_name", "sample_id", "gene_id", "gene_id_type")))) - DT_meta$file_name <- dee2files - DT_meta$sample_id <- "/meta/samples/geo_accession" - DT_meta$gene_id <- "/meta/genes/ensembl_gene_id" - genus <- sapply(strsplit(x = dee2files, split = "_"), function(x) x[1]) - for (i_file in 1:length(dee2files)) { - if (genus[i_file] %in% c("hsapiens", "mmusculus", "drerio", "rattus")) { - DT_meta$gene_id_type[i_file] <- "ENSEMBLID" - next - } else if (genus[i_file] %in% c("athaliana", "ecoli")) { - DT_meta$gene_id_type[i_file] <- "Locus tag" - - next - } else if (genus[i_file] %in% c("scerevisiae")) { - DT_meta$gene_id_type[i_file] <- "Yeast id" - next - } else if (genus[i_file] %in% c("dmelanogaster")){ - DT_meta$gene_id_type[i_file] <- "FlyBase id" - next - } else if (genus[i_file] %in% c("celegans")) { - DT_meta$gene_id_type[i_file] <- "WormBase id" - next - } else{ - DT_meta$gene_id_type[i_file] <- "gene" - next - } - } - write.table(x = DT_meta, file = file.path(destDir, "meta.txt"), sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) - return(invisible(NULL)) -} - -#' Creates \code{meta.txt} file, which describes typical archs4 and archs4Zoo files. -#' @param archDir path to directory with arch4 .h5 files. 
-#' @import data.table -#' @return Returns NULL -updateARCHS4meta <- function(archDir = file.path(getOption("phantasusCacheDir"), "counts/archs4")){ - stopifnot(requireNamespace("rhdf5")) - archs4files <- list.files(archDir, pattern = '\\.h5$') - DT_meta <- data.frame(matrix(ncol = 4, nrow = length(archs4files), dimnames = list(NULL, c("file_name", "sample_id", "gene_id", "gene_id_type")))) - DT_meta$file_name <- archs4files - DT_meta$sample_id <- "/meta/Sample_geo_accession" - DT_meta$gene_id <- "/meta/genes" - genus <- tolower(sapply(strsplit(x = archs4files, split = "_"), function(x) x[1])) - for (i_file in seq_along(archs4files)) { - cur_file <- file.path(archDir, archs4files[i_file]) - h5f <- H5Fopen(cur_file, flags = "H5F_ACC_RDONLY") - arch_version <- if (H5Lexists(h5f, "info/version")) { - h5read(h5f, "info/version") - } else { - h5read(h5f, "meta/info/version") - } - arch_version <- as.integer(arch_version) - if (is.na(arch_version)) { - arch_version <- 8 - } - if (arch_version >= 9) { - DT_meta$sample_id[i_file] <- "/meta/samples/geo_accession" - DT_meta$gene_id[i_file] <- "/meta/genes/genes" - } - if (genus[i_file] %in% c("human", "mouse")) { - DT_meta$gene_id_type[i_file] <- "Gene Symbol" - next - } else if (genus[i_file] %in% c("rattus", "bos", "gallus", "danio")) { - DT_meta$gene_id_type[i_file] <- "ENSEMBLID" - next - } else if (genus[i_file] %in% c("arabidopsis")) { - DT_meta$gene_id_type[i_file] <- "TAIR id" - next - } else if (genus[i_file] %in% c("saccharomyces")) { - DT_meta$gene_id_type[i_file] <- "ORF id" - next - } else if (genus[i_file] %in% c("caenorhabditis")) { - DT_meta$gene_id_type[i_file] <- "WormBase id" - next - } else if (genus[i_file] %in% c("drosophila")) { - DT_meta$gene_id_type[i_file] <- "FlyBase id" - next - } else{ - DT_meta$gene_id_type[i_file] <- "gene" - } - H5Fclose(h5f) - } - return(invisible(NULL)) -} - -#' Validates counts collection -#' @param collectionDir contains directory name -#' @param verbose logical value which determines a content of the output. 
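Collections that fail validation are skipped during indexing, so it can be handy to check a folder by hand before rebuilding the index. A hedged sketch; the path is illustrative:

    ok <- validateCountsCollection("~/phantasus-cache/counts/dee2", verbose = TRUE)
    if (!ok) message("this collection would be ignored by updateIndexH5()")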
-#' @return false if collection is not valid -#' -validateCountsCollection <- function(collectionDir, verbose=FALSE){ - stopifnot(requireNamespace("rhdf5")) - if (!file.exists(file.path(collectionDir, "meta.txt"))) { - if (verbose) { - message(paste0("metafile does not exist in ", file.path(collectionDir))) - } - return(FALSE) - } - - h5_meta <- fread(file.path(collectionDir, "meta.txt"), index = "file_name") - for (input_file in h5_meta$file_name) { - full_path <- file.path(collectionDir, input_file) - cur_meta <- h5_meta[file_name == input_file, ] - if (nrow(cur_meta) > 1) { - if (verbose) { - message(paste0("two or more rows in meta file for ", full_path )) - } - return(FALSE) - } - h5f <- H5Fopen(full_path, flags = "H5F_ACC_RDONLY") - - tryCatch({ - is_sample_valid <- H5Lexists(h5f, name = cur_meta$sample_id) - - if(!is_sample_valid){ - if (verbose) { - message(paste0("can't read sample_id in ", full_path )) - } - return(FALSE) - } - - gene_ids <- if (H5Lexists(h5f, name = cur_meta$gene_id)) { - h5read(h5f, name = cur_meta$gene_id) - } else NULL - - if (length(gene_ids) == 0) { - if (verbose) { - message(paste("can't read gene_id in ", full_path)) - } - return(FALSE) - } - if (length(gene_ids) != length(unique(gene_ids))) { - if (verbose) { - message(paste("Non-unique gene ids in file", full_path)) - } - return(FALSE) - } - }, finally = { - H5Fclose(h5f) - }) - } - return(TRUE) -} - +#' Creates metafiles for HDF5-files +#' @param data, contains metadata +#' @param file, contains file name +#' @param dataset_name, contains dataset name +#' @return NULL +#' @import rhdf5 +createH5 <- function(data, file, dataset_name) { + stopifnot(requireNamespace("rhdf5")) + if (file.exists(file)) { + stop(sprintf("File %s already exist", file)) + # unlink(file, recursive = FALSE) + } + h5createFile(file) + h5write(data, file, dataset_name) + H5close() + return(invisible(NULL)) +} + +#' Converts collection meta.txt files to meta.h5, putting them to the respective +#' collection folders +#' @param counts_dir, contains directory name +#' @return Returns NULL +createMetaH5 <- function(counts_dir){ + stopifnot(requireNamespace("rhdf5")) + collections <- list.dirs(counts_dir, full.names = FALSE) + collections <- collections[-1] + for (collection in collections) { + destdir <- paste0(counts_dir, '/', collection) + meta <- data.table() + filename <- paste0("meta.txt") + h5_meta <- fread(file.path(destdir, filename), index = "file_name") + h5filename <- file.path(destdir, 'meta.h5') + if (file.exists(h5filename)) { + message("Skipping ", h5filename, " as it's already exists") + next + } + createH5(h5_meta, h5filename, 'meta') + } + return(invisible(NULL)) +} + +#' Creates HDF5-File with priority +#' @param counts_dir, contains counts directory +#' @param force logical value which lets function replace existing priority file +#' @param verbose logical value which determines a content of the output. 
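The reworked createMetaH5() above now skips collections that already have a meta.h5, so the local preparation can be re-run without clobbering existing files. A usage sketch, assuming a counts cache with one sub-directory per collection; the path is illustrative:

    counts_dir <- "~/phantasus-cache/counts"                 # illustrative location
    createMetaH5(counts_dir)                                 # meta.txt -> meta.h5 per collection
    createPriorityH5(counts_dir)                             # writes counts_priority.txt and priority.h5
    updateIndexH5(counts_dir, force = TRUE, verbose = TRUE)  # rebuilds meta.rda and index.h5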
+#' @return Returns NULL
+createPriorityH5 <- function(counts_dir, force = FALSE, verbose = FALSE){
+    stopifnot(requireNamespace("rhdf5"))
+    if (!dir.exists(counts_dir)) {
+        message(paste0('Counts directory ', counts_dir, " does not exist" ))
+        return()
+    }
+    h5_files <- list.files(file.path(counts_dir), "\\.h5$", full.names = TRUE, recursive = TRUE)
+    list_dirs <- list.dirs(counts_dir, full.names = FALSE, recursive = TRUE)
+    list_dirs <- c(".", list_dirs)
+    priority_file <- file.path(counts_dir, "counts_priority.txt")
+    need_create <- TRUE
+    if (file.exists(priority_file)) {
+        priority <- fread(priority_file)
+        if (!(setequal(priority$directory,list_dirs) && length(unique(priority$priority)) == length(priority$priority))) {
+            message(paste0("!!! Priority file ", priority_file , " is invalid and will be replaced"))
+        } else {
+            need_create <- FALSE
+        }
+    }
+    if (need_create) {
+        priority <- data.table(directory = list_dirs, priority = seq_along(list_dirs))
+        write.table(x = priority, file = priority_file, sep = "\t", eol = "\n", row.names = FALSE, col.names = TRUE, quote = FALSE)
+    }
+    createH5(priority, 'priority.h5', 'priority')
+    return(invisible(NULL))
+}
+
+#' Updates indexes from HDF5-files
+#' @param counts_dir, contains counts directory
+#' @param force logical value which lets function replace existing index file
+#' @param verbose logical value which determines a content of the output.
+#' @return Returns NULL
+updateIndexH5 <- function(counts_dir, force = FALSE, verbose = FALSE){
+    stopifnot(requireNamespace("rhdf5"))
+    if (!dir.exists(counts_dir)) {
+        message(paste0('Counts directory ', counts_dir, " does not exist" ))
+        return()
+    }
+    meta_name <- file.path(counts_dir, "meta.rda")
+    h5_files <- list.files(file.path(counts_dir), "\\.h5$", full.names = TRUE, recursive = TRUE)
+    # collection sub-directories (same construction as in createPriorityH5());
+    # used by the mtime check and the loop below
+    list_dirs <- list.dirs(counts_dir, full.names = FALSE, recursive = TRUE)
+    list_dirs <- c(".", list_dirs)
+    if (!length(h5_files)) {
+        return()
+    }
+    if (!force) {
+        meta_time <- as.numeric(file.mtime(meta_name))
+        h5_mtime <- max(unlist(lapply(h5_files, file.mtime)))
+        dirs_mtime <- lapply(file.path(counts_dir, list_dirs[-1]), file.mtime)
+        if (length(dirs_mtime) > 0) {
+            dir_mtime <- max(unlist(dirs_mtime))
+        } else {
+            dir_mtime <- -Inf
+        }
+
+        if (file.exists(meta_name) && meta_time > h5_mtime && meta_time > dir_mtime) {
+            return()
+        }
+    }
+    if (file.exists(meta_name)) {
+        unlink(meta_name)
+    }
+    DT_counts_meta <- data.table(matrix(ncol = 3, nrow = 0, dimnames = list( NULL, c("accession", "file", "collection_type"))))
+    for (cur_dir in list_dirs) {
+        dir_path <- file.path(counts_dir, cur_dir)
+        dir_path <- sub(pattern = "/./?$", replacement = "", x = dir_path)
+        cur_files <- list.files(path = dir_path, pattern = "\\.h5", recursive = FALSE )
+        if (length(cur_files) == 0) {
+            next
+        }
+        if (!file.exists(file.path(dir_path, "meta.txt"))) {
+            if (startsWith(x = tolower(basename(dir_path)), prefix = "archs4")) {
+                updateARCHS4meta(archDir = dir_path)
+            } else if (startsWith(x = tolower(basename(dir_path)), prefix = "dee2")) {
+                updateDEE2meta(destDir = dir_path)
+            }
+        }
+        message(paste0('Populating ', cur_dir , ' counts meta' ))
+        if (!validateCountsCollection(collectionDir = dir_path, verbose = verbose)) {
+            message(paste0("!! files in ", cur_dir , " are ignored because there is no valid meta file in this directory."))
+            next
+        }
+        DT_part <- getCountsMetaPart(counts_dir = counts_dir, collection_name = cur_dir, verbose = verbose)
+        if (length(DT_part)) {
+            DT_counts_meta <- rbindlist(l = list(DT_counts_meta, DT_part))
+        }
+        rm(DT_part)
+
+    }
+    DT_counts_meta$chunk <- gsmToChunk(DT_counts_meta$accession)
+    DT_counts_meta_split <- split(DT_counts_meta, DT_counts_meta$chunk)
+    createIndexH5(DT_counts_meta_split, 'index.h5')
+    save(DT_counts_meta, file = meta_name, eval.promises = TRUE)
+    rm(DT_counts_meta)
+    return(invisible(NULL))
+}
+
+
+#' Gets list with metadata
+#' @param counts_dir, contains counts directory
+#' @param collection_name contains name of the collection
+#' @param verbose logical value which determines a content of the output.
+#' @return list with metadata
+getCountsMetaPart <- function(counts_dir, collection_name, verbose){
+    destdir <- file.path(counts_dir, collection_name)
+    if (!dir.exists(destdir)) {
+        return()
+    }
+    DT_h5_meta <- data.table()
+    h5_files <- list.files(destdir, "\\.h5$", full.names = FALSE)
+    if (!length(h5_files)) {
+        return()
+    }
+    h5_meta <- fread(file.path(destdir, "meta.txt"), index = "file_name")
+    for (input_file in h5_files) {
+        if (input_file %in% h5_meta$file_name) {
+            full_name <- file.path(destdir, input_file)
+            relative_path <- file.path(collection_name, input_file)
+            h5f <- H5Fopen(full_name, flags = "H5F_ACC_RDONLY")
+            accession <- h5read(h5f, h5_meta[file_name == input_file, ]$sample_id)
+            h5_part <- data.table(accession = accession,
+                                  file = relative_path,
+                                  collection_type = collection_name, indexes = seq_along(accession))
+            H5Fclose(h5f)
+            DT_h5_meta <- rbindlist(l = list(DT_h5_meta, h5_part))
+        } else {
+            if (verbose) {
+                message(paste0("!! ", file.path(destdir, input_file), " is ignored"))
+            }
+        }
+
+    }
+    return(DT_h5_meta)
+}
+
+#' Writes indexes to the file
+#' @param data, contains metadata
+#' @param file contains the file name
+#' @return Returns NULL
+createIndexH5 <- function(data, file) {
+    stopifnot(requireNamespace("rhdf5"))
+    h5createFile(file)
+    names <- names(data)
+    for (i in seq_along(names)) {
+        h5write(data[[i]], file, paste0("/",names[i]))
+    }
+    h5closeAll()
+    return(invisible(NULL))
+}
+
+#' Creates \code{meta.txt} file, which describes typical dee2 files.
+#' @param destDir path to directory with DEE2 .h5 files.
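As a quick illustration of the layout produced by createIndexH5() above: one HDF5 dataset per accession chunk, each holding the accession, file, collection_type and indexes columns. A sketch with toy data; the chunk name and file path are illustrative (real chunk labels come from gsmToChunk(), defined elsewhere in the package):

    library(data.table)
    library(rhdf5)
    toy_index <- list(
        GSM10 = data.table(accession       = c("GSM1000001", "GSM1000002"),  # hypothetical samples
                           file            = "archs4/human_gene_v2.2.h5",    # hypothetical path
                           collection_type = "archs4",
                           indexes         = 1:2)
    )
    createIndexH5(toy_index, "toy_index.h5")
    h5read("toy_index.h5", "/GSM10")  # comes back as a data.frame with the four columns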
+#' @import data.table +#' @return Returns NULL +updateDEE2meta <- function(destDir = file.path(getOption("phantasusCacheDir"), "counts/dee2")){ + dee2files <- list.files(destDir, pattern = '\\.h5$') + DT_meta <- data.frame(matrix(ncol = 4, nrow = length(dee2files), dimnames = list(NULL, c("file_name", "sample_id", "gene_id", "gene_id_type")))) + DT_meta$file_name <- dee2files + DT_meta$sample_id <- "/meta/samples/geo_accession" + DT_meta$gene_id <- "/meta/genes/ensembl_gene_id" + genus <- sapply(strsplit(x = dee2files, split = "_"), function(x) x[1]) + for (i_file in 1:length(dee2files)) { + if (genus[i_file] %in% c("hsapiens", "mmusculus", "drerio", "rattus")) { + DT_meta$gene_id_type[i_file] <- "ENSEMBLID" + next + } else if (genus[i_file] %in% c("athaliana", "ecoli")) { + DT_meta$gene_id_type[i_file] <- "Locus tag" + + next + } else if (genus[i_file] %in% c("scerevisiae")) { + DT_meta$gene_id_type[i_file] <- "Yeast id" + next + } else if (genus[i_file] %in% c("dmelanogaster")){ + DT_meta$gene_id_type[i_file] <- "FlyBase id" + next + } else if (genus[i_file] %in% c("celegans")) { + DT_meta$gene_id_type[i_file] <- "WormBase id" + next + } else{ + DT_meta$gene_id_type[i_file] <- "gene" + next + } + } + write.table(x = DT_meta, file = file.path(destDir, "meta.txt"), sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) + return(invisible(NULL)) +} + +#' Creates \code{meta.txt} file, which describes typical archs4 and archs4Zoo files. +#' @param archDir path to directory with arch4 .h5 files. +#' @import data.table +#' @return Returns NULL +updateARCHS4meta <- function(archDir = file.path(getOption("phantasusCacheDir"), "counts/archs4")){ + stopifnot(requireNamespace("rhdf5")) + archs4files <- list.files(archDir, pattern = '\\.h5$') + DT_meta <- data.frame(matrix(ncol = 4, nrow = length(archs4files), dimnames = list(NULL, c("file_name", "sample_id", "gene_id", "gene_id_type")))) + DT_meta$file_name <- archs4files + DT_meta$sample_id <- "/meta/Sample_geo_accession" + DT_meta$gene_id <- "/meta/genes" + genus <- tolower(sapply(strsplit(x = archs4files, split = "_"), function(x) x[1])) + for (i_file in seq_along(archs4files)) { + cur_file <- file.path(archDir, archs4files[i_file]) + h5f <- H5Fopen(cur_file, flags = "H5F_ACC_RDONLY") + arch_version <- if (H5Lexists(h5f, "info/version")) { + h5read(h5f, "info/version") + } else { + h5read(h5f, "meta/info/version") + } + arch_version <- as.integer(arch_version) + if (is.na(arch_version)) { + arch_version <- 8 + } + if (arch_version >= 9) { + DT_meta$sample_id[i_file] <- "/meta/samples/geo_accession" + DT_meta$gene_id[i_file] <- "/meta/genes/genes" + } + if (genus[i_file] %in% c("human", "mouse")) { + DT_meta$gene_id_type[i_file] <- "Gene Symbol" + next + } else if (genus[i_file] %in% c("rattus", "bos", "gallus", "danio")) { + DT_meta$gene_id_type[i_file] <- "ENSEMBLID" + next + } else if (genus[i_file] %in% c("arabidopsis")) { + DT_meta$gene_id_type[i_file] <- "TAIR id" + next + } else if (genus[i_file] %in% c("saccharomyces")) { + DT_meta$gene_id_type[i_file] <- "ORF id" + next + } else if (genus[i_file] %in% c("caenorhabditis")) { + DT_meta$gene_id_type[i_file] <- "WormBase id" + next + } else if (genus[i_file] %in% c("drosophila")) { + DT_meta$gene_id_type[i_file] <- "FlyBase id" + next + } else{ + DT_meta$gene_id_type[i_file] <- "gene" + } + H5Fclose(h5f) + } + return(invisible(NULL)) +} + +#' Validates counts collection +#' @param collectionDir contains directory name +#' @param verbose logical value which determines a content of 
the output. +#' @return false if collection is not valid +#' +validateCountsCollection <- function(collectionDir, verbose=FALSE){ + stopifnot(requireNamespace("rhdf5")) + if (!file.exists(file.path(collectionDir, "meta.txt"))) { + if (verbose) { + message(paste0("metafile does not exist in ", file.path(collectionDir))) + } + return(FALSE) + } + + h5_meta <- fread(file.path(collectionDir, "meta.txt"), index = "file_name") + for (input_file in h5_meta$file_name) { + full_path <- file.path(collectionDir, input_file) + cur_meta <- h5_meta[file_name == input_file, ] + if (nrow(cur_meta) > 1) { + if (verbose) { + message(paste0("two or more rows in meta file for ", full_path )) + } + return(FALSE) + } + h5f <- H5Fopen(full_path, flags = "H5F_ACC_RDONLY") + + tryCatch({ + is_sample_valid <- H5Lexists(h5f, name = cur_meta$sample_id) + + if(!is_sample_valid){ + if (verbose) { + message(paste0("can't read sample_id in ", full_path )) + } + return(FALSE) + } + + gene_ids <- if (H5Lexists(h5f, name = cur_meta$gene_id)) { + h5read(h5f, name = cur_meta$gene_id) + } else NULL + + if (length(gene_ids) == 0) { + if (verbose) { + message(paste("can't read gene_id in ", full_path)) + } + return(FALSE) + } + if (length(gene_ids) != length(unique(gene_ids))) { + if (verbose) { + message(paste("Non-unique gene ids in file", full_path)) + } + return(FALSE) + } + }, finally = { + H5Fclose(h5f) + }) + } + return(TRUE) +} + diff --git a/R/updateAndCreateMetaRemote.R b/R/updateAndCreateMetaRemote.R index a2e63f6..5c0b5d4 100644 --- a/R/updateAndCreateMetaRemote.R +++ b/R/updateAndCreateMetaRemote.R @@ -1,48 +1,66 @@ -#' Creates a data table with indexes and chunks of samples in remote HDF5-files -#' @param url, contains url to the root of counts files -#' @param collections, contains names of the collections -#' @return table with samples, indexes and chunks in all HDF5-files -getIndexRemote <- function(url, collections) { - src <- httr::parse_url(url) - dir <- src$query$domain - if (is.null(dir)) { - dir <- "/" - } - src <- paste0(src$scheme,'://',src$hostname,'/',src$path) - src <- HSDSSource(src) - - DT_h5_meta <- data.table() - src <- HSDSSource(url) - for (collection in collections) { - message("Processing collection ", collection) - collection_path <- file.path(dir, collection, fsep="/") - metaf <- HSDSFile(src, file.path(collection_path, 'meta.h5', fsep="/")) - metads <- HSDSDataset(metaf, '/meta') - h5_meta <- metads[1:metads@shape] - for (input_file in h5_meta$file_name) { - message("Processing file ", input_file) - full_name <- file.path(collection_path, input_file, fsep="/") - relative_path <- file.path(collection, input_file, fsep="/") - h5f <- HSDSFile(src, full_name) - accession <- getSamples(h5f, h5_meta[h5_meta$file_name == input_file, ]$sample_id) - h5_part <- data.table(accession = accession, - file = relative_path, - collection_type = collection, indexes = seq_along(accession)) - DT_h5_meta <- rbindlist(l = list(DT_h5_meta, h5_part)) - } - - } - DT_h5_meta$chunk <- gsmToChunk(DT_h5_meta$accession) - return(DT_h5_meta) -} - -#' Creates HDF5-file with indexes -#' @param url, contains url to the root of counts files -#' @return Returns NULL -createIndexH5Remote <- function(url) { - collections <- c('archs4', 'dee2') - DT_h5_meta <- getIndexRemote(url, collections) - DT_h5_meta_split <- split(DT_h5_meta, DT_h5_meta$chunk) - createIndexH5(DT_h5_meta_split, 'index.h5') - return(invisible(NULL)) -} +#' Creates a data table with indexes and chunks of samples in remote HDF5-files +#' @param url, contains 
url to the root of counts files +#' @param collections, contains names of the collections +#' @return table with samples, indexes and chunks in all HDF5-files +getIndexRemote <- function(url, collections) { + src <- httr::parse_url(url) + dir <- src$query$domain + if (is.null(dir)) { + dir <- "/" + } + src <- paste0(src$scheme,'://',src$hostname,'/',src$path) + src <- HSDSSource(src) + + DT_h5_meta <- data.table() + for (collection in collections) { + message("Processing collection ", collection) + collection_path <- file.path(dir, collection, fsep="/") + metaf <- HSDSFile(src, file.path(collection_path, 'meta.h5', fsep="/")) + metads <- HSDSDataset(metaf, '/meta') + h5_meta <- metads[1:metads@shape] + for (input_file in h5_meta$file_name) { + message("Processing file ", input_file) + full_name <- file.path(collection_path, input_file, fsep="/") + relative_path <- file.path(collection, input_file, fsep="/") + + h5f <- NULL + tryCatch({ + h5f <- HSDSFile(src, full_name) + }, error=function(e) { + message("Skipping due to error:") + message(e, "\n") + }) + + if (is.null(h5f)) { + next + } + + accession <- getSamples(h5f, h5_meta[h5_meta$file_name == input_file, ]$sample_id) + h5_part <- data.table(accession = accession, + file = relative_path, + collection_type = collection, indexes = seq_along(accession)) + DT_h5_meta <- rbindlist(l = list(DT_h5_meta, h5_part)) + } + + } + DT_h5_meta$chunk <- gsmToChunk(DT_h5_meta$accession) + return(DT_h5_meta) +} + +#' Creates HDF5-file with indexes +#' @param url, contains URL to the root of counts files +#' @param collections vector of collection names to process +#' @param destfile where to put resulting index file +#' @return Returns NULL +createIndexH5Remote <- function(url, + collections=c('archs4', 'dee2'), + destfile="index.h5") { + if (file.exists(destfile)) { + stop("File ", destfile, " alsready exists") + } + + DT_h5_meta <- getIndexRemote(url, collections) + DT_h5_meta_split <- split(DT_h5_meta, DT_h5_meta$chunk) + createIndexH5(DT_h5_meta_split, destfile) + return(invisible(NULL)) +} diff --git a/tests/testthat/test-loadCountsFromH5file.R b/tests/testthat/test-loadCountsFromH5file.R index a5e4435..58f6514 100644 --- a/tests/testthat/test-loadCountsFromH5file.R +++ b/tests/testthat/test-loadCountsFromH5file.R @@ -2,7 +2,7 @@ library(GEOquery) test_that("loadCountsFromHSDS works correctly", { url <- "https://ctlab.itmo.ru/hsds/?domain=/counts" - ess <- getGEO("GSE85653") + ess <- getGEO("GSE85653", AnnotGPL = TRUE) es <- ess[[1]] es <- loadCountsFromHSDS(es, url) expect_true(nrow(exprs(es)) > 0) @@ -10,15 +10,13 @@ test_that("loadCountsFromHSDS works correctly", { es <- loadCountsFromHSDS(es, url) expect_true(all(samples %in% es$geo_accession)) - if (utils::packageVersion("rhdf5client") >= "1.23.5") { - ess <- getGEO("GSE53053") - es <- ess[[1]] - es <- loadCountsFromHSDS(es, url) - expect_true(nrow(exprs(es)) > 0) - samples <- es$geo_accession - es <- loadCountsFromHSDS(es, url) - expect_true(all(samples %in% es$geo_accession)) - } + ess <- getGEO("GSE53053", AnnotGPL = TRUE) + es <- ess[[1]] + es <- loadCountsFromHSDS(es, url) + expect_true(nrow(exprs(es)) > 0) + samples <- es$geo_accession + es <- loadCountsFromHSDS(es, url) + expect_true(all(samples %in% es$geo_accession)) }) @@ -35,16 +33,8 @@ test_that("loadCountsFromHSDS returns the same ExpressionSet, if it contains cou test_that("loadCountsFromH5FileHSDS works without metadata params", { url <- "https://ctlab.itmo.ru/hsds/?domain=/counts" file <- 
'archs4/Arabidopsis_thaliana_count_matrix.h5' - ess <- getGEO("GSE85653") + ess <- getGEO("GSE85653", AnnotGPL = TRUE) es <- ess[[1]] es <- loadCountsFromH5FileHSDS(es, url, file) expect_true(nrow(exprs(es)) > 0) - - if (utils::packageVersion("rhdf5client") >= "1.23.5") { - file <- 'archs4/mouse_gene_v2.2.h5' - ess <- getGEO("GSE53053") - es <- ess[[1]] - es <- loadCountsFromH5FileHSDS(es, url, file) - expect_true(nrow(exprs(es)) > 0) - } }) From b6c281e6bf65119748c4275bf1a747dd0e5cffc7 Mon Sep 17 00:00:00 2001 From: Alexey Sergushichev Date: Mon, 9 Oct 2023 14:21:46 -0500 Subject: [PATCH 2/2] Documentation fixes --- R/updateAndCreateMetaLocal.R | 2 +- man/createH5.Rd | 3 +++ man/createIndexH5Remote.Rd | 12 ++++++++++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/R/updateAndCreateMetaLocal.R b/R/updateAndCreateMetaLocal.R index caffe7a..79e5787 100644 --- a/R/updateAndCreateMetaLocal.R +++ b/R/updateAndCreateMetaLocal.R @@ -2,7 +2,7 @@ #' @param data, contains metadata #' @param file, contains file name #' @param dataset_name, contains dataset name -#' @return NULL +#' @return Returns NULL #' @import rhdf5 createH5 <- function(data, file, dataset_name) { stopifnot(requireNamespace("rhdf5")) diff --git a/man/createH5.Rd b/man/createH5.Rd index 48257dd..bd9d0f7 100644 --- a/man/createH5.Rd +++ b/man/createH5.Rd @@ -13,6 +13,9 @@ createH5(data, file, dataset_name) \item{dataset_name, }{contains dataset name} } +\value{ +Returns NULL +} \description{ Creates metafiles for HDF5-files } diff --git a/man/createIndexH5Remote.Rd b/man/createIndexH5Remote.Rd index 00fecb5..469d058 100644 --- a/man/createIndexH5Remote.Rd +++ b/man/createIndexH5Remote.Rd @@ -4,10 +4,18 @@ \alias{createIndexH5Remote} \title{Creates HDF5-file with indexes} \usage{ -createIndexH5Remote(url) +createIndexH5Remote( + url, + collections = c("archs4", "dee2"), + destfile = "index.h5" +) } \arguments{ -\item{url, }{contains url to the root of counts files} +\item{url, }{contains URL to the root of counts files} + +\item{collections}{vector of collection names to process} + +\item{destfile}{where to put resulting index file} } \value{ Returns NULL
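To tie the remote side together, a hedged usage sketch of the reworked createIndexH5Remote() from this patch; the HSDS URL matches the one used in the tests above, the destination path is illustrative:

    url <- "https://ctlab.itmo.ru/hsds/?domain=/counts"
    createIndexH5Remote(url,
                        collections = c("archs4", "dee2"),
                        destfile = "counts-index.h5")  # the call stops if this file already exists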