# utils.R
# Layout version bytes prepended to serialized protobuf caches
# (checked by checkBinValidity). Use `<-` for top-level assignment.
PROTOBUF_LAYOUT_VERSION <- c(0x00, 0x02)
# Convert 0-based indices to 1-based; an empty `current` means "all
# indices", i.e. seq_len(neededLength).
#
# Fix: the original used 0:(neededLength - 1), which yields c(0, -1)
# when neededLength == 0; seq_len() handles the empty case correctly.
getIndicesVector <- function(current, neededLength) {
    if (length(current) == 0) {
        current <- seq_len(neededLength) - 1
    }
    current + 1
}
#' Reads ExpressionSet from a GCT file.
#'
#' Only versions 1.2 and 1.3 are supported.
#'
#' @param gct Path to gct file
#'
#' @param ... additional options for read.csv
#'
#' @return ExpressionSet object
#'
#' @examples
#' read.gct(system.file("extdata", "centers.gct", package = "phantasus"))
#' @export
read.gct <- function(gct, ...) {
# Header: line 1 = version tag, line 2 = dimensions, line 3 = column names.
meta <- readLines(gct, n = 3)
version <- meta[1]
# size: c(nrow, ncol) for 1.2; c(nrow, ncol, n.row.annot, n.col.annot) for 1.3.
size <- as.numeric(unlist(strsplit(meta[2], "\t")))
if (grepl("^#1.3", version)) {
# number of column annotations = number of additional rows
ann.col <- size[4]
# number of row annotations = number of additional columns
ann.row <- size[3]
} else if (grepl("^#1.2", version)) {
# 1.2 has no column-annotation rows and a single row-annotation column.
ann.col <- 0
ann.row <- 1
} else {
stop("Unsupported version of gct: use 1.2 or 1.3")
}
colNames <- unlist(strsplit(meta[3], "\t"))
# A "rowId/colId" first header cell encodes both row and column id field names.
if (grepl("/", colNames[1])) {
rowIdField <- sub("(.*)/(.*)", "\\1", colNames[1])
colIdField <- sub("(.*)/(.*)", "\\2", colNames[1])
} else {
rowIdField <- "id"
colIdField <- "id"
}
colNames[1] <- rowIdField
# Skip the 2 header lines, the column-name line, and any column-annotation
# rows, then read exactly size[1] expression rows.
t <- read.tsv(gct, skip = 2 + 1 + ann.col, nrows = size[1],
col.names = colNames,
row.names = NULL, header = FALSE, ...)
rownames(t) <- t[,1]
# Columns after the id column + row-annotation columns are the matrix.
exp <- as.matrix(t[, (ann.row + 2):ncol(t)])
# Feature data = id column plus row-annotation columns.
fdata <- makeAnnotated(t[, seq_len(ann.row + 1), drop = FALSE])
if (ann.col > 0) {
# Column annotations are stored as extra rows; transpose so that each
# sample becomes a row of the phenoData table.
pdata.raw <- t(read.tsv(gct, skip = 2, nrows = ann.col + 1,
header = FALSE, row.names=NULL))
pdata <- data.frame(pdata.raw[seq_len(ncol(exp)) + 1 + ann.row, ,
drop = FALSE])
colnames(pdata) <- pdata.raw[1, ]
colnames(pdata)[1] <- colIdField
rownames(pdata) <- colnames(exp)
pdata <- makeAnnotated(pdata)
res <- ExpressionSet(exp, featureData = fdata, phenoData = pdata)
} else {
res <- ExpressionSet(exp, featureData = fdata)
}
res
}
# Thin wrapper around utils::read.table with TSV-friendly defaults:
# tab separator, no quoting, no comment character, names kept verbatim,
# and strings never converted to factors.
#
# If the caller did not supply `row.names` and the first column header is
# empty (a common TSV-with-rownames layout), that column is promoted to
# row names and dropped from the result.
read.tsv <- function(file, header = TRUE, sep = "\t", quote = "",
                     comment.char = "",
                     check.names = FALSE, ...) {
    extraArgs <- list(...)
    parsed <- utils::read.table(file, header = header, sep = sep,
                                quote = quote,
                                comment.char = comment.char,
                                check.names = check.names,
                                stringsAsFactors = FALSE,
                                ...)
    callerSetRowNames <- "row.names" %in% names(extraArgs)
    firstNameEmpty <- identical(colnames(parsed)[1], "")
    if (!callerSetRowNames && firstNameEmpty) {
        rownames(parsed) <- parsed[, 1]
        parsed[[1]] <- NULL
    }
    parsed
}
#' Saves ExpressionSet to a GCT file (version 1.3).
#'
#' @param es ExpressionSet object to save
#' @param file Path to output gct file
#' @param gzip Whether to apply gzip-compression to the output file
#' @return Result of closing the file (as in the `close()` function)
#' @examples
#' es <- read.gct(system.file("extdata", "centers.gct", package = "phantasus"))
#' out <- tempfile(fileext = ".gct.gz")
#' write.gct(es, out, gzip=TRUE)
#' @import Biobase
#' @export
write.gct <- function(es, file, gzip=FALSE) {
# Open a gzip-compressed or plain file connection depending on `gzip`.
if (gzip) {
con <- gzfile(file)
} else {
con <- file(file)
}
open(con, open="w")
# NOTE(review): if any write below fails, the connection is never closed;
# consider on.exit(close(con), add = TRUE) in a future change.
# GCT 1.3 header: version tag, then "nrow<TAB>ncol<TAB>n.row.annot<TAB>n.col.annot".
writeLines("#1.3", con)
ann.col <- ncol(pData(es))
ann.row <- ncol(fData(es))
writeLines(sprintf("%s\t%s\t%s\t%s", nrow(es), ncol(es), ann.row, ann.col), con)
# Column header: id column, feature-annotation names, then sample names.
writeLines(paste0(c("ID", colnames(fData(es)), colnames(es)), collapse="\t"), con)
# Sample annotations are written transposed (one row per annotation),
# left-padded with NA cells to skip over the feature-annotation columns.
ann.col.table <- t(as.matrix(pData(es)))
ann.col.table <- cbind(matrix(rep(NA, ann.row*ann.col), nrow=ann.col), ann.col.table)
write.table(ann.col.table, file=con, quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
# Data block: feature annotations followed by the expression matrix.
write.table(cbind(fData(es), exprs(es)), file=con, quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
close(con)
}
# Wrap a data.frame into a Biobase AnnotatedDataFrame, using each column
# name as its own labelDescription entry.
#
# Fix: the AnnotatedDataFrame slot is named `varMetadata`, not `varMeta`;
# the misspelled argument meant the metadata was never attached to the
# object as intended.
makeAnnotated <- function(data) {
    meta <- data.frame(labelDescription = colnames(data))
    rownames(meta) <- colnames(data)
    methods::new("AnnotatedDataFrame", data = data, varMetadata = meta)
}
# For each element of the list `x`, extract its n-th component (via `[[`)
# and simplify the results with sapply (vector when possible).
take <- function(x, n) {
    extractNth <- function(item) item[[n]]
    sapply(x, extractNth)
}
writeToList <- function(es) {
# Flatten an ExpressionSet into a plain list: a bare numeric matrix plus
# per-column lists of annotations — presumably for serialization to the
# phantasus client (confirm against callers).
data <- as.matrix(exprs(es))
# Strip dim names so the matrix serializes as pure values.
colnames(data) <- NULL
row.names(data) <- NULL
pdata <- pData(es)
row.names(pdata) <- NULL
pdata <- as.list(pdata)
rownames <- rownames(es)
fdata <- fData(es)
row.names(fdata) <- NULL
fdata <- as.list(fdata)
ed <- experimentData(es)
experimentList <- as.list(expinfo(ed))
# `other` is accessed via the slot directly; pubMedIds via its accessor.
experimentList$other <- as.list(ed@other)
experimentList$pubMedIds <- pubMedIds(ed)
res <- list(data = data, pdata = pdata, fdata = fdata,
rownames = rownames,
colMetaNames = varLabels(es),
rowMetaNames = fvarLabels(es),
experimentData = experimentList)
res
}
#' Update archs4 files.
#'
#' Download archs4 or archs4zoo counts in \code{cacheDir}. If the directory does not exist, the function does nothing and produces corresponding warnings.
#' @importFrom utils download.file
#' @param cacheDir file path to \bold{archs4} cache directory
#' @param organism vector which determines organisms to download: human, mouse, zoo or all as default.
#' Also can be a genus. Possible genus: \enumerate{ \item drosophila \item gallus \item bos \item caenorhabditis
#' \item danio \item rattus \item saccharomyces \item arabidopsis}
#' @param force logical value which let function replace current files
updateARCHS4 <- function (cacheDir = file.path(getOption("phantasusCacheDir"), "counts/archs4"), organism = c("all"), force = FALSE){
    # Genus keyword -> species file prefix used by the archs4 zoo bucket.
    zoo_dict <- c("drosophila" = "Drosophila_melanogaster",
                  "bos" = "Bos_taurus",
                  "caenorhabditis" = "Caenorhabditis_elegans",
                  "danio" = "Danio_rerio",
                  "gallus" = "Gallus_gallus",
                  "rattus" = "Rattus_norvegicus",
                  "saccharomyces" = "Saccharomyces_cerevisiae",
                  "arabidopsis" = "Arabidopsis_thaliana"
    )
    organism <- tolower(organism)
    # Download `url` to `destfile` unless it already exists (or force = TRUE).
    fetchFile <- function(url, destfile) {
        if (force || !file.exists(destfile)) {
            download.file(url = url, destfile = destfile, mode = "wb")
        }
    }
    if (any(organism %in% c("all", "human"))) {
        fetchFile("https://s3.amazonaws.com/mssm-seq-matrix/human_matrix.h5",
                  file.path(cacheDir, "human_matrix.h5"))
    }
    if (any(organism %in% c("all", "mouse"))) {
        fetchFile("https://s3.amazonaws.com/mssm-seq-matrix/mouse_matrix.h5",
                  file.path(cacheDir, "mouse_matrix.h5"))
    }
    for (sp in names(zoo_dict)) {
        if (any(organism %in% c("all", "zoo", sp))) {
            sp_file_name <- paste0(zoo_dict[sp], "_genecount_v1.h5")
            fetchFile(paste0("https://s3.amazonaws.com/mssm-archs4-zoo/", sp_file_name),
                      file.path(cacheDir, sp_file_name))
        }
    }
}
#' Update ARCHS4 meta files
#'
#' Creates \code{meta.txt} file, which describes typical archs4 and archs4Zoo files.
#' @param archDir path to directory with arch4 .h5 files.
#' @details This function produces a very specific "hardcoded" \code{meta.txt} file for archs4 and archs4ZOO counts collections.
#' See \code{\link{validateCountsCollection}} for more common information and \code{meta.txt} file structure
#' @seealso \code{\link{validateCountsCollection}}
#' @import data.table
updateARCHS4meta <- function(archDir = file.path(getOption("phantasusCacheDir"), "counts/archs4")){
    h5files <- list.files(archDir, pattern = '\\.h5$')
    # One meta row per .h5 file in the directory.
    DT_meta <- data.frame(matrix(ncol = 4, nrow = length(h5files), dimnames = list(NULL, c("file_name", "sample_id", "gene_id", "gene_id_type"))))
    DT_meta$file_name <- h5files
    # Defaults for the pre-v9 archs4 layout; overridden per file below.
    DT_meta$sample_id <- "/meta/Sample_geo_accession"
    DT_meta$gene_id <- "/meta/genes"
    # Genus -> annotation-field label; anything unknown falls back to "gene".
    genusToType <- c(human = "Gene Symbol", mouse = "Gene Symbol",
                     rattus = "ENSEMBLID", bos = "ENSEMBLID",
                     gallus = "ENSEMBLID", danio = "ENSEMBLID",
                     arabidopsis = "TAIR id",
                     saccharomyces = "ORF id",
                     caenorhabditis = "WormBase id",
                     drosophila = "FlyBase id")
    genus <- tolower(vapply(strsplit(x = h5files, split = "_"),
                            function(parts) parts[1], character(1)))
    for (i in seq_along(h5files)) {
        h5path <- file.path(archDir, h5files[i])
        file_info <- h5ls(file = h5path)
        # The version dataset moved between archs4 releases; probe both spots.
        ver_field <- NA
        if ("/info" %in% file_info$group) {
            ver_field <- "/info/version"
        } else if ("/meta/info" %in% file_info$group) {
            ver_field <- "/meta/info/version"
        }
        arch_ver <- as.numeric(h5read(file = h5path, name = ver_field))
        if (arch_ver >= 9) {
            # v9+ layout keeps sample/gene ids under nested groups.
            DT_meta$sample_id[i] <- "/meta/samples/geo_accession"
            DT_meta$gene_id[i] <- "/meta/genes/genes"
        }
        if (genus[i] %in% names(genusToType)) {
            DT_meta$gene_id_type[i] <- genusToType[[genus[i]]]
        } else {
            DT_meta$gene_id_type[i] <- "gene"
        }
    }
    h5closeAll()
    write.table(x = DT_meta, file = file.path(archDir, "meta.txt"), sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
}
#' Update DEE2 meta files
#'
#' Creates \code{meta.txt} file, which describes typical dee2 files.
#' @param destDir path to directory with DEE2 .h5 files.
#' @details This function produces a very specific "hardcoded" \code{meta.txt} file for the dee2 counts collection.
#' See \code{\link{validateCountsCollection}} for more common information and \code{meta.txt} file structure
#' @seealso \code{\link{validateCountsCollection}}
#' @import data.table
updateDEE2meta <- function(destDir = file.path(getOption("phantasusCacheDir"), "counts/dee2")){
    dee2files <- list.files(destDir, pattern = '\\.h5$')
    # One meta row per .h5 file found in destDir.
    DT_meta <- data.frame(matrix(ncol = 4, nrow = length(dee2files), dimnames = list(NULL, c("file_name", "sample_id", "gene_id", "gene_id_type"))))
    DT_meta$file_name <- dee2files
    # All dee2 files share the same dataset layout; rep() keeps the
    # assignment valid even when there are zero files.
    DT_meta$sample_id <- rep("/meta/samples/geo_accession", length(dee2files))
    DT_meta$gene_id <- rep("/meta/genes/ensembl_gene_id", length(dee2files))
    # Genus prefix (text before the first "_") -> annotation-field label;
    # unknown genera fall back to "gene".
    genusToType <- c(hsapiens = "ENSEMBLID", mmusculus = "ENSEMBLID",
                     drerio = "ENSEMBLID", rattus = "ENSEMBLID",
                     athaliana = "Locus tag", ecoli = "Locus tag",
                     scerevisiae = "Yeast id",
                     dmelanogaster = "FlyBase id",
                     celegans = "WormBase id")
    genus <- vapply(strsplit(x = dee2files, split = "_"),
                    function(parts) parts[1], character(1))
    # seq_along (not 1:length) — the original crashed on an empty directory
    # because 1:0 iterates over c(1, 0).
    for (i_file in seq_along(dee2files)) {
        if (genus[i_file] %in% names(genusToType)) {
            DT_meta$gene_id_type[i_file] <- genusToType[[genus[i_file]]]
        } else {
            DT_meta$gene_id_type[i_file] <- "gene"
        }
    }
    write.table(x = DT_meta, file = file.path(destDir, "meta.txt"), sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
}
# Report (via message()) which preloaded datasets, counts files, annotation
# databases and fgsea tables are available in the configured cache
# directories; with verbose = TRUE the individual file names are listed.
selfCheck <- function(cacheDir = getOption("phantasusCacheDir"),
                      preloadedDir = getOption("phantasusPreloadedDir"),
                      verbose = FALSE) {
    # Shared verbose listing: file names on one line, then a blank line.
    listVerbose <- function(files) {
        if (verbose) {
            message(paste0(files, collapse = " "))
            message(" ")
        }
    }
    if (!is.null(preloadedDir) && dir.exists(preloadedDir)) {
        preloadedFiles <- list.files(preloadedDir, pattern = "\\.(gct|rda)$")
        message(paste(length(preloadedFiles), 'preloaded datasets are available'))
        listVerbose(preloadedFiles)
    } else {
        message('!!! Preloaded dir is not set')
    }
    h5counts <- list.files(file.path(cacheDir, "counts"), recursive = TRUE,
                           pattern = '\\.h5$')
    archs4Files <- list.files(file.path(cacheDir, "archs4"),
                              pattern = '\\.h5$')
    if (length(h5counts)) {
        message(paste(length(h5counts), 'counts files are available'))
        listVerbose(h5counts)
        if (length(archs4Files)) {
            message("!!! deprecated archs4-based directory architecture exists. It will be ignored and RNA-seq will load only from counts directory.")
        }
    } else if (length(archs4Files)) {
        message("!!! archs4-based directory architecture is obsolete and will dissapear in future release")
        message(paste(length(archs4Files), 'archs4 files are available'))
        listVerbose(archs4Files)
    } else {
        message('!!! No counts provided RNA-seq will load without matrices')
    }
    dbFiles <- list.files(file.path(cacheDir, "annotationdb"),
                          pattern = '\\.sqlite$')
    if (length(dbFiles)) {
        message(paste(length(dbFiles), 'annotationDb are available'))
        listVerbose(dbFiles)
    } else {
        message('!!! No annotationDb provided')
    }
    fgseaFiles <- list.files(file.path(cacheDir, 'fgsea'), '\\.rds$',
                             full.names = FALSE)
    if (length(fgseaFiles)) {
        message(paste(length(fgseaFiles), 'fgsea tables are available'))
        listVerbose(fgseaFiles)
    } else {
        message('!!! No fgsea tables provided')
    }
}
# Download `url` into `dir`/`filename` unless the file is already there.
# The download goes to a temporary file first and is then atomically
# renamed, so a partially-downloaded file never appears under the final
# name. `...` is forwarded to utils::download.file.
#
# Fix: the original passed the undefined name `tempDestFile` to
# download.file (should have been `tempDest`), so any actual download
# raised an "object not found" error.
safeDownload <- function(url, dir, filename, ...) {
    dest <- file.path(dir, filename)
    if (file.exists(dest)) {
        return()
    }
    tempDest <- tempfile(paste0(filename, ".load"), tmpdir = dir)
    # Remove a partial download on failure; no-op after a successful rename.
    on.exit(unlink(tempDest), add = TRUE)
    utils::download.file(url, destfile = tempDest, ...)
    file.rename(tempDest, dest)
}
# TRUE when `name` looks like a GEO Series/Dataset accession
# (GSE/GDS followed by digits), optionally with a "-GPLnnn" platform
# suffix; matching is case-insensitive.
isValidExperimentID <- function(name) {
    accessionPattern <- "^(GSE|GDS)[0-9]+(-GPL[0-9]+)?$"
    grepl(accessionPattern, name, ignore.case = TRUE)
}
# Build (and create, if needed) the local cache directory for a GEO
# accession under `destdir`, mirroring GEO's "nnn"-stub layout:
# series go to .../geo/series/<stub>/<GEO>/matrix, datasets to
# .../geo/datasets/<stub>/<GEO>/soft. Returns the directory path.
getGEODir <- function(name, destdir = tempdir()) {
    if (!isValidExperimentID(name)) {
        stop(name, " does not look like a valid GEO Series ID")
    }
    accType <- substr(name, 1, 3)
    # Drop any "-GPLnnn" suffix; only the bare accession names the directory.
    GEO <- unlist(strsplit(name, "-"))[1]
    # GEO replaces the last 1-3 digits with "nnn" for the stub level.
    stub <- gsub("\\d{1,3}$", "nnn", GEO, perl = TRUE)
    pathTemplate <- if (accType == 'GSE') {
        "%s/geo/series/%s/%s/matrix"
    } else {
        "%s/geo/datasets/%s/%s/soft"
    }
    fullGEODirPath <- file.path(sprintf(pathTemplate, destdir, stub, GEO))
    dir.create(fullGEODirPath, showWarnings = FALSE, recursive = TRUE)
    fullGEODirPath
}
getBriefData <- function(name, destdir = tempdir()) {
# Fetch (or reuse a cached copy of) the GEO "brief" metadata for an
# accession and return it parsed into a named list.
GEO <- unlist(strsplit(name, "-"))[1]
# The brief file is cached next to the accession's GEO cache directory.
GEOdir <- dirname(getGEODir(GEO, destdir))
briefFile <- file.path(GEOdir, 'brief')
if (file.exists(briefFile)) {
message('Using cached brief file: ', briefFile)
} else {
url <- sprintf("www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=%s&targ=self&form=text&view=brief", GEO)
message('Trying ', url)
resp <- httr::GET(url)
text <- httr::content(resp, "text", "UTF-8")
# Failure is detected both by a non-200 status and by "Could not"
# appearing in the body text.
check <- grep('Could not', text)
if (length(check) || httr::status_code(resp) != 200) {
message('No such dataset: ', name)
# Remove the cache directory so a bad accession is not cached.
unlink(GEOdir, recursive = TRUE, force = TRUE)
stop('Failed to download brief data on: ', GEO, '. No such dataset')
} else {
writeLines(text, briefFile)
message('Stored brief data of ', GEO, ' at ', briefFile)
}
}
parsedList <- parseBriefData(readLines(briefFile))
if (length(parsedList) == 0) {
# Unparseable content: drop the cached file so the next call retries.
file.remove(briefFile)
stop('Failed to parse brief data on: ', GEO, '. Empty list')
}
return (parsedList)
}
# Parse GEO "brief" text lines of the form "!Series_key = value" into a
# named list: names are the keys (prefix stripped), values are character
# vectors of all values seen for that key.
#
# Fix: added `drop = FALSE` to the matrix subset — with exactly one
# metadata line the original subset collapsed the matrix to a vector and
# the following `tmp[, 2]` failed with "incorrect number of dimensions".
parseBriefData <- function(txt) {
    # Keep only metadata lines like "!Series_title = ..." and strip the prefix.
    tmp <- txt[grep("!\\w*?_", txt)]
    tmp <- gsub("!\\w*?_", '', tmp)
    # Split each line at its first " = " into a (key, value) pair.
    first.eq <- regexpr(' = ', tmp)
    tmp <- cbind(substring(tmp, first = 1, last = first.eq - 1),
                 substring(tmp, first = first.eq + 3))
    # Discard rows without a key; keep the matrix shape even for one row.
    tmp <- tmp[tmp[, 1] != "", , drop = FALSE]
    header <- split(tmp[, 2], tmp[, 1])
    return(header)
}
#' Create meta-data for single counts collection
#'
#' @aliases getCountsMetaPart
#' Creates a part of counts collections meta-data
#' @param counts_dir path to directory with count collections
#' @param collection_name name of collection and collection's directory
#' @param verbose logical value which determines a content of the output.
#' @details Function assumes that \code{collection_name} contains \code{meta.txt} which is valid (in the sense of \code{\link{validateCountsCollection}}).
#' For each row in \code{meta.txt} function reads specified \code{sample_id} dataset and writes every sample id to the resulting \code{data.table} with source file name and collection name.
#'
#' @return \code{data.table} with meta-data or nothing if the collection directory does not exist or contains no \code{.h5} files.
#' @seealso {\code{\link{validateCountsCollection}},\code{\link{getCountsMetaPart}}}
#' @examples
#' \dontrun{
#' collDir <- "/path/to/my/collection"
#' valid_collection <- validateCountsCollection(collectionDir = collDir, verbose = TRUE)
#' if (valid_collection){
#' metaPart <- getCountsMetaPart(counts_dir = dirname(collDir), collection_name = basename(collDir), verbose = TRUE)
#' }
#' }
#'
#'
#' @import data.table
getCountsMetaPart <- function(counts_dir, collection_name, verbose){
# Build a data.table mapping every sample accession found in the
# collection's .h5 files to its (relative) source file and collection name.
destdir <- file.path(counts_dir, collection_name)
if (!dir.exists(destdir)) {
return()
}
DT_h5_meta <- data.table()
h5_files <- list.files(destdir, "\\.h5$", full.names = FALSE)
if (!length(h5_files)) {
return()
}
# meta.txt says, per file, which h5 dataset holds the sample ids.
h5_meta <- fread(file.path(destdir,"meta.txt"), index = "file_name")
for (input_file in h5_files) {
if (input_file %in% h5_meta$file_name) {
full_name <- file.path(destdir, input_file)
relative_path <- file.path(collection_name, input_file )
# One row per sample accession read from the file's sample_id dataset.
h5_part <- data.table(accession = h5read(full_name, h5_meta[file_name == input_file, ]$sample_id),
file = relative_path,
collection_type = collection_name)
DT_h5_meta <- rbindlist(l = list(DT_h5_meta, h5_part))
}
else{
# Files present on disk but not listed in meta.txt are skipped.
if (verbose) {
message(paste0("!! ", file.path(destdir, input_file), " is ignored"))
}
}
}
return(DT_h5_meta)
}
#' Update meta-data for counts collections
#'
#' @aliases updateCountsMeta
#' Creates \code{meta.rda} file which contain information about all samples in all collections.
#' Also function checks \code{priority.txt} file. This file is used to manage collections with the same samples.
#' @param counts_dir path to counts cache directory
#' @param verbose logical value which determines a content of the output.
#' @param force logical value which lets function replace existing \code{meta.rda} file
#' @details First of all function checks validity of \code{priority.txt} file. \bold{Every} Collection should have \bold{unique} priority.
#' If \code{priority.txt} is not valid function creates new one, setting priorities for each subdirectory(=collection) equal to order in \code{list.dir} output.
#'
#'Function updates \code{meta.rda} if this file is older than at least one \code{.h5} file in counts files.
#' \code{meta.rda} is \code{data.table} which is a result of union \code{data.table}s produced by \code{\link{getCountsMetaPart}} for each collection
#' @seealso {\code{\link{validateCountsCollection}},\code{\link{updateCountsMeta}}}
#' @import data.table
updateCountsMeta <- function(counts_dir = file.path(getOption("phantasusCacheDir"), "counts"), force = FALSE, verbose = FALSE){
    if (!dir.exists(counts_dir)) {
        # fixed message typo: "extist" -> "exist"
        message(paste0('Counts directory ', counts_dir, " does not exist"))
        return()
    }
    meta_name <- file.path(counts_dir, "meta.rda")
    h5_files <- list.files(file.path(counts_dir), "\\.h5$", full.names = TRUE, recursive = TRUE)
    # "." represents the top-level counts directory itself as a collection.
    list_dirs <- list.dirs(counts_dir, full.names = FALSE, recursive = TRUE)
    list_dirs <- c(".", list_dirs[-1])
    priority_file <- file.path(counts_dir, "counts_priority.txt")
    # The priority file is valid iff it lists exactly the current collection
    # directories and every collection has a unique priority; otherwise it
    # is recreated with priorities in list.dirs order.
    need_create <- TRUE
    if (file.exists(priority_file)) {
        priority <- fread(priority_file)
        if (!(setequal(priority$directory, list_dirs) && length(unique(priority$priority)) == length(priority$priority))) {
            message(paste0("!!! Priority file ", priority_file, " is invalid and will be replaced"))
        } else {
            need_create <- FALSE
        }
    }
    if (need_create) {
        priority <- data.table(directory = list_dirs, priority = seq_along(list_dirs))
        write.table(x = priority, file = priority_file, sep = "\t", eol = "\n", row.names = FALSE, col.names = TRUE, quote = FALSE)
    }
    if (!length(h5_files)) {
        return()
    }
    if (!force) {
        # Skip the rebuild when meta.rda is newer than every .h5 file and
        # every collection directory.
        meta_time <- as.numeric(file.mtime(meta_name))
        h5_mtime <- max(unlist(lapply(h5_files, file.mtime)))
        dirs_mtime <- lapply(file.path(counts_dir, list_dirs[-1]), file.mtime)
        if (length(dirs_mtime) > 0) {
            dir_mtime <- max(unlist(dirs_mtime))
        } else {
            dir_mtime <- -Inf
        }
        if (file.exists(meta_name) && meta_time > h5_mtime && meta_time > dir_mtime) {
            return()
        }
    }
    if (file.exists(meta_name)) {
        unlink(meta_name)
    }
    DT_counts_meta <- data.table(matrix(ncol = 3, nrow = 0, dimnames = list( NULL, c("accession", "file", "collection_type"))))
    for (cur_dir in list_dirs) {
        dir_path <- file.path(counts_dir, cur_dir)
        dir_path <- sub(pattern = "/./?$", replacement = "", x = dir_path)
        # anchored "\\.h5$" (was "\\.h5") for consistency with the other
        # list.files() calls in this file.
        cur_files <- list.files(path = dir_path, pattern = "\\.h5$", recursive = FALSE )
        if (length(cur_files) == 0) {
            next
        }
        # Auto-generate meta.txt for known collection layouts.
        if (!file.exists(file.path(dir_path, "meta.txt"))) {
            if (startsWith(x = tolower(basename(dir_path)), prefix = "archs4")) {
                updateARCHS4meta(archDir = dir_path)
            } else if (startsWith(x = tolower(basename(dir_path)), prefix = "dee2")) {
                updateDEE2meta(destDir = dir_path)
            }
        }
        message(paste0('Populating ', cur_dir , ' counts meta' ))
        if (!validateCountsCollection(collectionDir = dir_path, verbose = verbose)) {
            # fixed message typos: "inored" -> "ignored", "is not" -> "is no"
            message(paste0("!! files in ", cur_dir , " are ignored because there is no correct meta file in this directory."))
            next
        }
        DT_part <- getCountsMetaPart(counts_dir = counts_dir, collection_name = cur_dir, verbose = verbose)
        if (length(DT_part)) {
            DT_counts_meta <- rbindlist(l = list(DT_counts_meta, DT_part))
        }
        rm(DT_part)
    }
    save(DT_counts_meta, file = meta_name, eval.promises = TRUE)
    rm(DT_counts_meta)
}
#' Check a counts collection
#'
#' @aliases validateCountsCollection
#' Function checks existence and structure of \code{meta.txt} file in the specified counts folder. Also it checks accessibility of specified datasets in corresponding \code{.h5} files.
#' @param collectionDir path to directory with collection
#' @param verbose logical value which determines a content of the output.
#' @details \code{collectionDir} should contain a bunch of \code{.h5} files
#' and a single \code{meta.txt}. \code{meta.txt} is a \code{.tsv}-like file where for each \code{.h5} exists a row with columns:
#' \describe{ \item{file_name}{ name of \code{.h5} file in \code{collectionDir}.}
#' \item{sample_id}{name of dataset in \code{file_name} which contains sample IDs (sample_geo_accession for example).}
#' \item{gene_id}{name of dataset in \code{file_name} which contains ids for genes. For correct work this dataset should contain unique values.}
#' \item{gene_id_type}{meaning of gene_id. This value determines name of gene annotation field in phantasus.}
#' }
#'
#' @import data.table
#' @import rhdf5
validateCountsCollection <- function(collectionDir, verbose=FALSE){
# Returns TRUE only when meta.txt exists and every file it lists exposes
# the declared sample_id/gene_id datasets with unique gene ids.
if (file.exists(file.path(collectionDir, "meta.txt"))) {
h5_meta <- fread(file.path(collectionDir, "meta.txt"), index = "file_name")
for (input_file in h5_meta$file_name) {
full_path <- file.path(collectionDir, input_file)
# Enumerate every dataset path ("/group/name") present in the .h5 file.
h5_datasets <- h5ls(file = file.path(collectionDir, input_file), datasetinfo = FALSE)[c("group","name")]
h5_datasets <- paste(h5_datasets$group, h5_datasets$name, sep = "/")
if (!(h5_meta[file_name == input_file, ]$sample_id %in% h5_datasets && h5_meta[file_name == input_file, ]$gene_id %in% h5_datasets)) {
h5closeAll()
if (verbose) {
message(paste0("ids in meta file don't match ids in ", full_path ))
}
return(FALSE)
}
# Gene ids must be unique (see the roxygen contract above).
gene_ids <- h5read(file = full_path, name = h5_meta[file_name == input_file, ]$gene_id)
if (length(gene_ids) != length(unique(gene_ids))) {
if (verbose) {
message(paste("Non-unique gene ids in file", full_path))
}
h5closeAll()
return(FALSE)
}
}
h5closeAll()
return(TRUE)
}
if (verbose) {
message(paste0("metafile does not exist in ", file.path(collectionDir)))
}
h5closeAll()
return(FALSE)
}
# A cached binary is considered valid only when it exists, its creation
# time is not older than `valid_from`, and it starts with the serialized
# current protobuf layout version bytes.
checkBinValidity <- function(filePath, valid_from) {
    if (!file.exists(filePath) || file.info(filePath)$ctime < valid_from) {
        return(FALSE)
    }
    rawVersion <- as.raw(PROTOBUF_LAYOUT_VERSION)
    expectedHead <- protolite::serialize_pb(list(rawVersion))
    actualHead <- readBin(con = filePath, what = raw(), n = length(expectedHead))
    all(expectedHead == actualHead)
}
# Installed phantasus package version, encoded as a JSON string.
phantasusVersion <- function() {
    installedVersion <- utils::packageVersion("phantasus")
    jsonlite::toJSON(as.character(installedVersion))
}