Revision 66357a4654bfe45c5fe4a9565bdf67d7b5a6f05c authored by clevermx on 18 January 2022, 11:09:39 UTC, committed by Space on 18 January 2022, 11:09:39 UTC
Add meta-files generation for new counts processing.
Add support for 8+ versions of ARCHs4.

Co-authored-by: Alexey Sergushichev <alserg@itmo.ru>
Merge-request: PHANTASUS-MR-21
Merged-by: Maksim Kleverov <klevermx@gmail.com>
1 parent d230978
Raw File
utils.R
PROTOBUF_LAYOUT_VERSION = c(0x00, 0x02)

getIndicesVector <- function(current, neededLength) {
    if (length(current) == 0) {
        current <- 0:(neededLength - 1)
    }
    current + 1
}


#' Reads ExpressionSet from a GCT file.
#'
#' Only versions 1.2 and 1.3 are supported.
#'
#' @param gct Path to gct file
#'
#' @param ... additional options for read.csv
#'
#' @return ExpressionSet object
#'
#' @examples
#' read.gct(system.file("extdata", "centers.gct", package = "phantasus"))
#' @export
read.gct <- function(gct, ...) {
    meta <- readLines(gct, n = 3)
    version <- meta[1]
    size <- as.numeric(unlist(strsplit(meta[2], "\t")))

    if (grepl("^#1.3", version)) {
        # number of column annotations = number of additional rows
        ann.col <- size[4]

        # number of row annotations = number of additional columns
        ann.row <- size[3]
    } else if (grepl("^#1.2", version)) {
        ann.col <- 0
        ann.row <- 1
    } else {
        stop("Unsupported version of gct: use 1.2 or 1.3")
    }

    colNames <- unlist(strsplit(meta[3], "\t"))
    if (grepl("/", colNames[1])) {
        rowIdField <- sub("(.*)/(.*)", "\\1", colNames[1])
        colIdField <- sub("(.*)/(.*)", "\\2", colNames[1])
    } else {
        rowIdField <- "id"
        colIdField <- "id"
    }

    colNames[1] <- rowIdField

    t <- read.tsv(gct, skip = 2 + 1 + ann.col, nrows = size[1],
                col.names = colNames,
                row.names = NULL, header = FALSE,  ...)

    rownames(t) <- t[,1]

    exp <- as.matrix(t[, (ann.row + 2):ncol(t)])

    fdata <- makeAnnotated(t[, seq_len(ann.row + 1), drop = FALSE])


    if (ann.col > 0) {
        pdata.raw <- t(read.tsv(gct, skip = 2, nrows = ann.col + 1,
                                header = FALSE, row.names=NULL))
        pdata <- data.frame(pdata.raw[seq_len(ncol(exp)) + 1 + ann.row, ,
                                drop = FALSE])
        colnames(pdata) <- pdata.raw[1, ]
        colnames(pdata)[1] <- colIdField
        rownames(pdata) <- colnames(exp)
        pdata <- makeAnnotated(pdata)

        res <- ExpressionSet(exp, featureData = fdata, phenoData = pdata)
    } else {
        res <- ExpressionSet(exp, featureData = fdata)
    }

    res
}

read.tsv <- function(file, header = TRUE, sep = "\t", quote = "",
                        comment.char = "",
                        check.names = FALSE, ...) {
    args <- list(...)
    res <- utils::read.table(file, header = header, sep = sep, quote = quote,
                    comment.char = comment.char, check.names = check.names,
                    stringsAsFactors = FALSE,
                    ...)
    if ( (!"row.names" %in% names(args)) && (colnames(res)[1] == "") ) {
        rownames(res) <- res[, 1]
        res[[1]] <- NULL
    }
    res
}

#' Saves ExpressionSet to a GCT file (version 1.3).
#'
#' @param es ExpresionSet obeject to save
#' @param file Path to output gct file
#' @param gzip Whether to gzip apply gzip-compression for the output file#'
#' @return Result of the closing file (as in `close()` function`)
#' @examples
#' es <- read.gct(system.file("extdata", "centers.gct", package = "phantasus"))
#' out <- tempfile(fileext = ".gct.gz")
#' write.gct(es, out, gzip=TRUE)
#' @import Biobase
#' @export
write.gct <- function(es, file, gzip=FALSE) {
    if (gzip) {
        con <- gzfile(file)
    } else {
        con <- file(file)
    }
    open(con, open="w")
    writeLines("#1.3", con)
    ann.col <- ncol(pData(es))
    ann.row <- ncol(fData(es))
    writeLines(sprintf("%s\t%s\t%s\t%s", nrow(es), ncol(es), ann.row, ann.col), con)
    writeLines(paste0(c("ID", colnames(fData(es)), colnames(es)), collapse="\t"), con)

    ann.col.table <- t(as.matrix(pData(es)))
    ann.col.table <- cbind(matrix(rep(NA, ann.row*ann.col), nrow=ann.col), ann.col.table)
    write.table(ann.col.table, file=con, quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
    write.table(cbind(fData(es), exprs(es)), file=con, quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE)
    close(con)
}


makeAnnotated <- function(data) {
    meta <- data.frame(labelDescription = colnames(data))
    rownames(meta) <- colnames(data)

    methods::new("AnnotatedDataFrame", data = data, varMeta = meta)
}

take <- function(x, n) {
    sapply(x, function(x) {
        x[[n]]
    })
}

writeToList <- function(es) {
    data <- as.matrix(exprs(es))
    colnames(data) <- NULL
    row.names(data) <- NULL

    pdata <- pData(es)
    row.names(pdata) <- NULL
    pdata <- as.list(pdata)

    rownames <- rownames(es)

    fdata <- fData(es)
    row.names(fdata) <- NULL
    fdata <- as.list(fdata)


    ed <- experimentData(es)
    experimentList <- as.list(expinfo(ed))
    experimentList$other <- as.list(ed@other)
    experimentList$pubMedIds <- pubMedIds(ed)

    res <- list(data = data, pdata = pdata, fdata = fdata,
                rownames = rownames,
                colMetaNames = varLabels(es),
                rowMetaNames = fvarLabels(es),
                experimentData = experimentList)
    res
}

#' Update archs4 files.
#'
#' Download archs4 or archs4zoo counts in \code{cacheDir}. If directory does not exists function makes nothing and produce corresponding warnings.
#' @importFrom utils download.file
#' @param cacheDir file path to \bold{archs4} cache directory
#' @param organism vector which determines organisms to download: human, mouse, zoo or all as default.
#' Also can be a genus. Possible genus: \enumerate{ \item drosophila \item gallus \item bos \item caenorhabditis
#' \item danio \item rattus \item saccharomyces \item arabidopsis}
#' @param force logical value which let function replace current files
updateARCHS4 <- function (cacheDir = file.path(getOption("phantasusCacheDir"), "counts/archs4"), organism = c("all"), force = FALSE){
    zoo_dict <- c("drosophila" = "Drosophila_melanogaster",
                 "bos" = "Bos_taurus",
                 "caenorhabditis" = "Caenorhabditis_elegans",
                 "danio" = "Danio_rerio",
                 "gallus" = "Gallus_gallus",
                 "rattus" = "Rattus_norvegicus",
                 "saccharomyces" = "Saccharomyces_cerevisiae",
                 "arabidopsis" = "Arabidopsis_thaliana"
               )
    organism <- tolower(organism)
    if (any(organism %in% c("all","human")) && (force || !file.exists(file.path(cacheDir, "human_matrix.h5")))) {
        download.file(url = "https://s3.amazonaws.com/mssm-seq-matrix/human_matrix.h5",
                      destfile = file.path(cacheDir, "human_matrix.h5"),
                      mode = "wb")
    }
    if (any(organism  %in% c("all","mouse")) && (force || !file.exists(file.path(cacheDir, "mouse_matrix.h5")))) {
        download.file(url = "https://s3.amazonaws.com/mssm-seq-matrix/mouse_matrix.h5",
                      destfile = file.path(cacheDir, "mouse_matrix.h5"),
                      mode = "wb")
    }
    for (sp in names(zoo_dict)){
        if (any(organism %in% c("all","zoo", sp))) {
            sp_file_name <- paste0(zoo_dict[sp], "_genecount_v1.h5")
            sp_file_path <-  file.path(cacheDir, sp_file_name)
            if (force || !file.exists(sp_file_path)) {
                download.file(url = paste0("https://s3.amazonaws.com/mssm-archs4-zoo/", sp_file_name),
                              destfile = sp_file_path,
                              mode = "wb")
            }
        }
    }
}

#' Update ARCHS4 meta files
#'
#' Creates \code{meta.txt} file, which describes typical archs4 and archs4Zoo files.
#' @param archDir path to directory with arch4 .h5 files.
#' @details This function produces very specific "hardcoded" \code{meta.txt} file for arch4 and archs4ZOO counts colletctions.
#' See \code{\link{validateCountsCollection}} for more common information and \code{meta.txt}  file structure
#' @seealso \code{\link{validateCountsCollection}}
#' @import data.table
updateARCHS4meta <- function(archDir = file.path(getOption("phantasusCacheDir"), "counts/archs4")){
    archs4files <- list.files(archDir, pattern = '\\.h5$')
    DT_meta <- data.frame(matrix(ncol = 4, nrow = length(archs4files), dimnames = list(NULL, c("file_name", "sample_id", 	"gene_id", "gene_id_type"))))
    DT_meta$file_name <- archs4files
    DT_meta$sample_id <- "/meta/Sample_geo_accession"
    DT_meta$gene_id <- "/meta/genes"
    genus <- tolower(sapply(strsplit(x = archs4files, split = "_"), function(x) x[1]))
    for (i_file in seq_along(archs4files)) {
        file_info <- h5ls(file = file.path(archDir, archs4files[i_file]))
        ver_field <- NA
        if ("/info" %in% file_info$group) {
          ver_field <- "/info/version"
        } else if ("/meta/info" %in% file_info$group) {
          ver_field <- "/meta/info/version"
        }
        arch_ver <- as.numeric(h5read(file = file.path(archDir, archs4files[i_file]), name = ver_field))
        if (arch_ver >= 9) {
          DT_meta$sample_id[i_file] <- "/meta/samples/geo_accession"
          DT_meta$gene_id[i_file] <-  "/meta/genes/genes"
        }
        if (genus[i_file] %in% c("human", "mouse")) {
            DT_meta$gene_id_type[i_file] <- "Gene Symbol"
            next
        } else if (genus[i_file] %in% c("rattus", "bos", "gallus", "danio")) {
            DT_meta$gene_id_type[i_file] <- "ENSEMBLID"
            next
        } else if (genus[i_file] %in% c("arabidopsis", "saccharomyces")) {
            DT_meta$gene_id_type[i_file] <- "Gene Symbol"
            next
        } else if (genus[i_file] %in% c("caenorhabditis")) {
            DT_meta$gene_id_type[i_file] <- "WormBase id"
            next
        } else if (genus[i_file] %in% c("drosophila")) {
            DT_meta$gene_id_type[i_file] <- "FlyBase id"
            next
        } else{
          DT_meta$gene_id_type[i_file] <- "gene"
        }
    }
    h5closeAll()
    write.table(x = DT_meta, file = file.path(archDir, "meta.txt"), sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
}
#' Update DEE2 meta files
#'
#' Creates \code{meta.txt} file, which describes typical dee2 files.
#' @param destDir path to directory with DEE2 .h5 files.
#' @details This function produces very specific "hardcoded" \code{meta.txt} file for dee2 counts colletction.
#' See \code{\link{validateCountsCollection}} for more common information and \code{meta.txt}  file structure
#' @seealso \code{\link{validateCountsCollection}}
#' @import data.table
updateDEE2meta <- function(destDir = file.path(getOption("phantasusCacheDir"), "counts/dee2")){
    dee2files <- list.files(destDir, pattern = '\\.h5$')
    DT_meta <- data.frame(matrix(ncol = 4, nrow = length(dee2files), dimnames = list(NULL, c("file_name", "sample_id", "gene_id", "gene_id_type"))))
    DT_meta$file_name <- dee2files
    DT_meta$sample_id <- "/meta/samples/geo_accession"
    DT_meta$gene_id <- "/meta/genes/ensembl_gene_id"
    genus <- sapply(strsplit(x = dee2files, split = "_"), function(x) x[1])
    for (i_file in 1:length(dee2files)) {
        if (genus[i_file] %in% c("hsapiens", "mmusculus", "drerio", "rattus")) {
            DT_meta$gene_id_type[i_file] <- "ENSEMBLID"
            next
        } else if (genus[i_file] %in% c("athaliana", "ecoli")) {
            DT_meta$gene_id_type[i_file] <- "Locus tag"

            next
        } else if (genus[i_file] %in% c("scerevisiae")) {
            DT_meta$gene_id_type[i_file] <- "Yeast id"
            next
        } else if (genus[i_file] %in% c("dmelanogaster")){
            DT_meta$gene_id_type[i_file] <- "FlyBase id"
            next
        } else if (genus[i_file] %in% c("celegans")) {
            DT_meta$gene_id_type[i_file] <- "WormBase id"
            next
        } else{
            DT_meta$gene_id_type[i_file] <- "gene"
            next
        }
    }
    write.table(x = DT_meta, file = file.path(destDir, "meta.txt"), sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE)
}

selfCheck <- function(cacheDir = getOption("phantasusCacheDir"),
                      preloadedDir = getOption("phantasusPreloadedDir"),
                      verbose = FALSE) {
    if (!is.null(preloadedDir) && dir.exists(preloadedDir)) {
        preloadedFiles <- list.files(preloadedDir, pattern = "\\.(gct|rda)$")
        message(paste(length(preloadedFiles), 'preloaded datasets are available'))
        if (verbose) {
            message(paste0(preloadedFiles,  collapse = " "))
            message(" ")
        }
    } else {
        message('!!! Preloaded dir is not set')
    }

    h5counts <- list.files(file.path(cacheDir, "counts"), recursive = TRUE,
                           pattern = '\\.h5$')
    archs4Files <- list.files(file.path(cacheDir, "archs4"),
                              pattern = '\\.h5$')
    if (length(h5counts)) {
        message(paste(length(h5counts), 'counts files are available'))
        if (verbose) {
            message(paste0(h5counts, collapse = " "))
            message(" ")
        }
        if (length((archs4Files))) {
            message("!!! deprecated archs4-based directory architecture exists. It will be ignored and RNA-seq will load only from counts directory.")
        }
    } else {
        if (length(archs4Files)) {
            message("!!! archs4-based directory architecture is obsolete and will dissapear in future release")
            message(paste(length(archs4Files), 'archs4 files are available'))
            if (verbose) {
                message(paste0(archs4Files, collapse = " "))
                message(" ")
            }
        } else {
            message('!!! No counts provided RNA-seq will load without matrices')
        }
    }
    annotDir <- file.path(cacheDir, "annotationdb")
    dbFiles <- list.files(annotDir, pattern = '\\.sqlite$')
    if (length(dbFiles)) {
        message(paste(length(dbFiles), 'annotationDb are available'))
        if (verbose) {
            message(paste0(dbFiles, collapse = " "))
            message(" ")
        }
    } else {
        message('!!! No annotationDb provided')
    }

    fgseaDir <- file.path(cacheDir, 'fgsea')
    fgseaFiles <- list.files(fgseaDir, '\\.rds$', full.names = FALSE)
    if (length(fgseaFiles)) {
        message(paste(length(fgseaFiles), 'fgsea tables are available'))
        if (verbose) {
            message(paste0(fgseaFiles, collapse = " "))
            message(" ")
        }
    } else {
        message('!!! No fgsea tables provided')
    }
}

safeDownload <- function(url, dir, filename, ...) {
    dest <- file.path(dir, filename)
    if (file.exists(dest)) {
        return()
    }

    tempDest <- tempfile(paste0(filename, ".load"), tmpdir = dir)
    utils::download.file(url, destfile = tempDestFile, ...)
    file.rename(tempDest, dest)
}

isValidExperimentID <- function(name) {
    grepl("^(GSE|GDS)[0-9]+(-GPL[0-9]+)?$", name, ignore.case = TRUE)
}


getGEODir <- function(name, destdir = tempdir()) {
    if (!isValidExperimentID(name)) {
        stop(name, " does not look like a valid GEO Series ID")
    }
    type <- substr(name, 1, 3)
    GEO <- unlist(strsplit(name, "-"))[1]
    stub <- gsub("\\d{1,3}$", "nnn", GEO, perl = TRUE)
    gdsDirPath <- "%s/geo/datasets/%s/%s/soft"
    gseDirPath <- "%s/geo/series/%s/%s/matrix"
    if (type == 'GSE') {
        fullGEODirPath <- file.path(sprintf(gseDirPath, destdir, stub, GEO))
    } else {
        fullGEODirPath <- file.path(sprintf(gdsDirPath, destdir, stub, GEO))
    }
    dir.create(fullGEODirPath, showWarnings = FALSE, recursive = TRUE)

    fullGEODirPath
}

getBriefData <- function(name, destdir = tempdir()) {
    GEO <- unlist(strsplit(name, "-"))[1]
    GEOdir <- dirname(getGEODir(GEO, destdir))
    briefFile <- file.path(GEOdir, 'brief')

    if (file.exists(briefFile)) {
        message('Using cached brief file: ', briefFile)
    } else {
        url <- sprintf("www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=%s&targ=self&form=text&view=brief", GEO)
        message('Trying ', url)
        resp <- httr::GET(url)
        text <- httr::content(resp, "text", "UTF-8")
        check <- grep('Could not', text)
        if (length(check) || httr::status_code(resp) != 200) {
            message('No such dataset: ', name)
            unlink(GEOdir, recursive = TRUE, force = TRUE)
            stop('Failed to download brief data on: ', GEO, '. No such dataset')
        } else {
            writeLines(text, briefFile)
            message('Stored brief data of ', GEO, ' at ', briefFile)
        }
    }
    parsedList <- parseBriefData(readLines(briefFile))
    if (length(parsedList) == 0) {
        file.remove(briefFile)
        stop('Failed to parse brief data on: ', GEO, '. Empty list')
    }

    return (parsedList)
}

parseBriefData <- function(txt) {
    tmp <- txt[grep("!\\w*?_", txt)]
    tmp <- gsub("!\\w*?_",'', tmp)
    first.eq <- regexpr(' = ', tmp)
    tmp <- cbind(substring(tmp, first = 1, last = first.eq - 1),
                 substring(tmp, first = first.eq + 3))
    tmp <- tmp[tmp[,1] != "",]
    header <- split(tmp[,2],tmp[,1])
    return(header)
}

#' Create meta-data for single counts collection
#'
#' @aliases \alias{getCountsMetaPart}
#' Creates a part of counts collections meta-data
#' @param counts_dir path to directory with count collections
#' @param collection_name name of collection and collection's directory
#' @param verbose logical value which determines a content of  the output.
#' @details Function assumes that \code{collection_name} contains \code{meta.txt} which is valid (in sence of \code{\link{validateCountsCollection}}).
#' For each row in \code{meta.txt} function reads specified \code{sample_id} dataset and writes every sample id to the resulting \code{data.table} with source file name and collection name.
#'
#' @return \code{data.table} with meta-data or nothing if \code{destdir} does not exist or does not contain files.
#' @seealso {\code{\link{validateCountsCollection}},\code{\link{getCountsMetaPart}}}
#' @usage
#' \dontrun{
#'     collDir= "/path/to/my/collection")
#'     valid_collection = validateCountsCollection(collectionDir = collDir, verbose = TRUE)
#'     if (valid_collection){
#'         metaPart = getCountsMetaPart(destdir = collDir, verbose = TRUE)
#'      }
#'  }
#'
#'
#' @import data.table
getCountsMetaPart <- function(counts_dir, collection_name, verbose){
    destdir <- file.path(counts_dir, collection_name)
    if (!dir.exists(destdir)) {
        return()
    }
    DT_h5_meta <- data.table()
    h5_files <- list.files(file.path(destdir), "\\.h5$", full.names = FALSE)
    if (!length(h5_files)) {
        return()
    }
    h5_meta <- fread(file.path(destdir,"meta.txt"), index = "file_name")
    for (input_file in h5_files) {
        if (input_file %in% h5_meta$file_name) {
            full_name <- file.path(destdir, input_file)
            h5_part <- data.table(accession = h5read(full_name, h5_meta[file_name == input_file, ]$sample_id),
                                 file = full_name,
                                 collection_type = collection_name)
            DT_h5_meta <- rbindlist(l = list(DT_h5_meta, h5_part))
        }
        else{
            if (verbose) {
                message(paste0("!! ", file.path(destdir, input_file), " is ignored"))
            }
        }
    }
    return(DT_h5_meta)
}

#' Update meta-data for counts collections
#'
#' @aliases \alias{updateCountsMeta}
#' Creates \code{meta.rda} file which contain information about all samples in all collections.
#' Also function checks \code{priority.txt} file. This file is used to manage collections with the same samples.
#' @param counts_dir path to counts cache directory
#' @param verbose logical value which determines a content of  the output.
#' @param force logical value wich lets function replace existing \code{meta.rda} file
#' @details  First of all function checks validity of \code{priority.txt} file. \bold{Every} Collection should have \bold{unique} priority.
#' If \code{priority.txt} is not valid function creates new one, setting priorities for each subdirectory(=collection) equal to order in \code{list.dir} output.
#'
#'Function updates \code{meta.rda} if this file is older than at least one \code{.h5} file in counts files.
#' \code{meta.rda} is \code{data.table} which is a result of union \code{data.table}s produced by \code{\link{getCountsMetaPart}} for each collection
#' @seealso {\code{\link{validateCountsCollection}},\code{\link{updateCountsMeta}}}
#'  @import data.table
updateCountsMeta <- function(counts_dir =  file.path(getOption("phantasusCacheDir"), "counts"), force = FALSE, verbose = FALSE){
    if (!dir.exists(counts_dir)) {
        message(paste0('Counts directory ', counts_dir, " does not extist" ))
        return()
    }
    meta_name <- file.path(counts_dir, "meta.rda")
    h5_files <- list.files(file.path(counts_dir), "\\.h5$", full.names = TRUE, recursive = TRUE)
    list_dirs <-  list.dirs(counts_dir, full.names = FALSE, recursive = TRUE)
    list_dirs <- c(".", list_dirs[-1])
    priority_file <- file.path(counts_dir, "counts_priority.txt")
    need_create <- TRUE
    if (file.exists(priority_file)) {
        priority <- fread(priority_file)
        if (!(setequal(priority$directory,list_dirs) && length(unique(priority$priority)) == length(priority$priority))) {
            message(paste0("!!! Priority file ", priority_file , " is invalid and will be replaced"))
        } else {
            need_create <- FALSE
        }
    }
    if (need_create) {
        priority <- data.table(directory = list_dirs, priority = seq_along(list_dirs))
        write.table(x = priority, file = priority_file, sep = "\t", eol = "\n", row.names = FALSE, col.names = TRUE, quote = FALSE)
    }
    if (!length(h5_files)) {
      return()
    }
    if (!force) {
        meta_time <- as.numeric(file.mtime(meta_name))
        h5_mtime <- max(unlist(lapply(h5_files, file.mtime)))
        dirs_mtime <- lapply(file.path(counts_dir, list_dirs[-1]), file.mtime)
        if (length(dirs_mtime) > 0) {
            dir_mtime <- max(unlist(dirs_mtime))
        } else {
            dir_mtime <- -Inf
        }

        if (file.exists(meta_name) && meta_time > h5_mtime && meta_time > dir_mtime) {
            return()
        }
    }
    if (file.exists(meta_name)) {
      unlink(meta_name)
    }
    DT_counts_meta <- data.table(matrix(ncol = 3, nrow = 0, dimnames = list( NULL, c("accession", "file", "collection_type"))))
    for (cur_dir in list_dirs) {
        dir_path <- file.path(counts_dir, cur_dir)
        dir_path <- sub(pattern = "/./?$", replacement = "", x =  dir_path)
        cur_files <- list.files(path = dir_path, pattern = "\\.h5", recursive = FALSE )
        if (length(cur_files) == 0) {
          next
        }
        if (!file.exists(file.path(dir_path, "meta.txt"))) {
            if (startsWith(x = tolower(basename(dir_path)), prefix =  "archs4")) {
              updateARCHS4meta(archDir = dir_path)
            } else if (startsWith(x = tolower(basename(dir_path)), prefix = "dee2")) {
              updateDEE2meta(destDir = dir_path)
            }
        }
        message(paste0('Populating ', cur_dir , ' counts meta' ))
        if (!validateCountsCollection(collectionDir = dir_path, verbose = verbose)) {
            message(paste0("!! files in ", cur_dir , " are inored because there is not correct meta file in this directory."))
            next
        }
        DT_part <- getCountsMetaPart(counts_dir = counts_dir, collection_name = cur_dir, verbose = verbose)
        if (length(DT_part)) {
            DT_counts_meta <- rbindlist(l = list(DT_counts_meta, DT_part))
        }
        rm(DT_part)
    }
    save(DT_counts_meta, file = meta_name, eval.promises = TRUE)
    rm(DT_counts_meta)
}

#' Check a counts collection
#'
#' @aliases \alias{validateCountsCollection}
#' Function checks existing  and  structure of \code{meta.txt} file in specified counts folder.Also it checks accessibility of specified datasets in  corresponding \code{.h5} files.
#' @param collectionDir path to directory with collection
#' @param verbose logical value which determines a content of  the output.
#' @details  \code{collectionDir} should contain a bunch of \code{.h5} files
#'  and a single \code{meta.txt}.  \code{meta.txt} is \code{.tsv}-like file where for each \code{.h5} exists a row wit columns:
#' \describe{ \item{file_name}{ name of \code{.h5} file in \code{collectionDir}.}
#'            \item{sample_id}{name of dataset in \code{file_name} which contains sample IDs (sample_geo_accession for example).}
#'            \item{gene_id}{name of dataset in \code{file_name} which contains ids for genes. For correct work this dataset should contain unique values.}
#'            \item{gene_id_type}{meaning of gene_id. This value determines name of gene annotation field in phantasus.}
#'            }
#'
#' @import data.table
#' @import rhdf5
validateCountsCollection <- function(collectionDir, verbose=FALSE){
    if (file.exists(file.path(collectionDir, "meta.txt"))) {
        h5_meta <- fread(file.path(collectionDir, "meta.txt"), index = "file_name")
        for (input_file  in h5_meta$file_name) {
            full_path <- file.path(collectionDir, input_file)
            h5_datasets <- h5ls(file = file.path(collectionDir, input_file), datasetinfo = FALSE)[c("group","name")]
            h5_datasets <- paste(h5_datasets$group, h5_datasets$name, sep = "/")
            if (!(h5_meta[file_name == input_file, ]$sample_id %in% h5_datasets && h5_meta[file_name == input_file, ]$gene_id %in% h5_datasets)) {
                h5closeAll()
                if (verbose) {
                    message(paste0("ids in meta file don't match ids in ", full_path ))
                }
                return(FALSE)
            }
            gene_ids <- h5read(file = full_path, name = h5_meta[file_name == input_file, ]$gene_id)
            if (length(gene_ids)  != length(unique(gene_ids))) {
                if (verbose) {
                    message(paste("Non-unique gene ids in file", full_path))
                }
                h5closeAll()
                return(FALSE)
            }
        }
        h5closeAll()
        return(TRUE)
    }
    if (verbose) {
        message(paste0("metafile does not exist in ",  file.path(collectionDir)))
    }
    h5closeAll()
    return(FALSE)
}


checkBinValidity <- function(filePath, valid_from) {
  if (!file.exists(filePath)) {
    return(FALSE)
  }

  if (file.info(filePath)$ctime < valid_from) {
    return(FALSE)
  }

  raw_proto_version <- as.raw(PROTOBUF_LAYOUT_VERSION)
  bin_ref <- protolite::serialize_pb(list(raw_proto_version))
  file_head <- readBin(con = filePath,what = raw(), n = length(bin_ref))

  if (!all(bin_ref == file_head)) {
    return(FALSE)
  }

  return(TRUE)
}

phantasusVersion <- function() {
  jsonlite::toJSON(as.character(utils::packageVersion("phantasus")))
}
back to top