# mocafunctions.R


moca.pipeline <- function(mat, groups=NULL, gene.whitelist=NULL, gene.screening.function=potentially.significant,
                          max.size=5, max.merges=1e3, max.dist=1, max.p=0.05, alpha.bonferroni=max.p, max.interesting=10,
                          consider=c("subclusters","original","maximalonly"),
                          merge.types=FALSE,merge.duplicates=FALSE,test.by.prefix=FALSE,
                          randomised=TRUE,
                          save.results=NULL,na.action=NA.as.FALSE) {
  # This function runs the whole moca pipeline starting from a binary matrix of mutations.
  # 
  # INPUT:
  # mat: Either 1) A binary matrix with genes/alterations in rows and samples in columns
  #             2) The filepath of a file containing such a matrix as a plain text table
  #             3) A character vector of filepaths from which to load multiple such matrices.
  #      Note that some features assume that the row names of mat are of the form prefix_suffix,
  #      where prefix is the gene name / gene ID, and suffix is the alteration type (e.g. SNV/AMP/DEL)
  # groups: A factor with the same length as ncol(mat) which groups samples by cancer type. 
  #         Statistical analysis is performed on each cancer type separately and then combined using Stouffer's method.
  #         groups will be generated automatically if multiple mutation matrices are loaded.
  # gene.whitelist: Either 1) A vector of the rownames in mat which should be used (length>>1)
  #                        2) The filepath of a file containing such a vector in plain text (length==1)
  # gene.screening.function: A function to select genes based on their mutation counts (input=mat, output=mat with some rows removed)
  #                          By default only genes with a number of mutations greater than log2(number of genes kept) are kept.
  #                          Other options would be for example function(x){x[rowSums(x)>=10,,drop=FALSE]} or function(x){x[rowSums(x)>=0.1*ncol(x),,drop=FALSE]}.
  #                          For no screening, set gene.screening.function=identity.
  # max.size: The maximum allowed size for a gene set
  # max.merges: The maximum number of candidate gene sets to be generated greedily
  # max.dist: Maximum allowed pvalue for merges (irrelevant if >=1 as in default setting).
  #           A sensible lower choice for this value might be 1/(number of genes used), since no multiple hypothesis testing
  #           correction is performed at this stage.
  # max.p: The threshold for significance to report a gene set. (Note that a weighted Bonferroni correction is performed)
  # alpha.bonferroni: Used to compute weights for the weighted Bonferroni correction.
  #                   (An upper bound on) the threshold for significance to include an additional alteration in a gene set.
  #                   Equal to max.p by default.
  # max.interesting: The maximum number of interesting examples to report.
  # consider: If "original", then all candidate gene sets are considered.
  #           If "maximalonly", only maximal candidate gene sets are considered. (not recommended)
  #           If "subclusters" (default) then subsets of the original candidate gene sets are also considered. This might be slow if max.size is large.
  # merge.types: If TRUE, suffixes in the row names of mat are ignored and rows with matching prefixes are merged. (can be used to ignore the distinction between SNV, AMP and DEL alterations)
  # merge.duplicates: If TRUE, rows of mat that have the same mutation pattern are merged. The suffix _oro ("or other") is used in naming any such merged rows.
  # test.by.prefix: If TRUE, when computing p-values the amount of cooccurrence between genes having the same prefix is ignored.
  #                 Although the default is FALSE, this should be set to TRUE if both amplifications and deletions of a gene are considered,
  #                 and the prefix_suffix naming convention is followed.
  # randomised: Whether to use randomised p-values to mitigate the conservative nature of discrete p-values. This does not apply to the generation of candidate gene sets.
  # save.results: (Optional) A base name to use when saving results; the output list is saved to <save.results>.Rdata and summary tables to <save.results>-*.txt files.
  # na.action: A function to handle missing values. By default NA.as.FALSE: missing values in mat will be treated as FALSE.
  #            Set this to na.omit to omit genes with missing values, and to na.pass to handle missing values in each later calculation.
  # 
  # OUTPUT:
  # A list containing the call, elapsed time, the significant gene sets, their connected
  # components, the interesting gene sets, the pre-processed data (mat, matd, groups),
  # the clustering output and candidate gene sets, the raw and corrected p-values,
  # min.samples and maxoverlap.
  # 
  elapsed <- proc.time()
  # Read and pre-process the data
  consider <- match.arg(consider)
  call <- match.call()
  cat("\nReading and pre-processing inputs...")
  if(!is.null(gene.whitelist)&&length(gene.whitelist)<2) {
    gene.whitelist <- as.character(read.table(gene.whitelist)[[1]])
  }
  if(is.character(mat)) {
    matdir <- mat
    matlist <- list()
    for(k in 1:length(matdir)) {
      mat <- read.table(matdir[k],header=TRUE,row.names=1)
      if(!is.null(gene.whitelist)) {
        mat <- mat[rownames(mat) %in% gene.whitelist,,drop=FALSE]
      }
      matlist <- c(matlist,list(mat))
    }
    if(length(matdir)>1) {
      matlist <- matlist2mat(matlist)
      mat <- matlist$mat
      if(!is.null(groups)) {warning("Argument 'groups' overwritten because multiple matrices were loaded.")}
      groups <- matlist$groups
    }
    rm(matlist)
  }
  mat <- as.matrix(mat)
  if(!is.logical(mat)) {
    if(any(range(mat,na.rm=TRUE)!=c(0,1))) {
      warning("mat converted to logical matrix from ",mode(mat),". Formatting of input data may not be supported.")
    }
    mat <- mat==1
  }
  mat <- na.action(mat)
  if(merge.types) { # Merge e.g. SNV, AMP and DEL versions of each gene
    rownames(mat) <- suffixrm(rownames(mat))
    mat <- duplicatemerge(mat)
  }
  if(merge.duplicates) {
    # Merge genes that have exactly the same pattern of mutations
    matd<-duplicaterm(mat)
    #matorig <- mat
    mat <- matd$mat
    matd$mat <- NULL
  } else {
    matd <- list(dlistseed=list(),dlist=list())
  }
  mat <- gene.screening.function(mat)
  if(any(rowMeans(mat,na.rm=TRUE)%in%c(0,1))) { # Remove any genes which are never/always mutated.
    warning(paste(rownames(mat)[which(rowMeans(mat,na.rm=TRUE) %in% c(0,1))],collapse=", "),": mutated in all or in no samples, so removed.")
    mat <- mat[!(rowMeans(mat,na.rm=TRUE)%in%c(0,1)),,drop=FALSE]
  }
  min.samples <- floor(log2(nrow(mat)))+1
  cat("DONE\n")
  cat("There are ",nrow(mat)," genes and ",ncol(mat),"samples after pre-processing\n")
  if(nrow(mat)<2||ncol(mat)<2) {stop("Dataset mat is too small (only ",nrow(mat)," by ",ncol(mat)," )")}
  
  # Generate candidate clusters
  clustout <- cooccurrence.clustering(mat,groups=groups,max.merges=max.merges,max.size=max.size,max.dist=max.dist)
  
  if(consider=="subclusters") {
    clusters <- maximal.clusters(clustout$merge,onlymaximal=TRUE)$clusters
    clusters <- all.subclusters(clusters)
  } else if(consider=="original") {
    clusters <- maximal.clusters(clustout$merge,onlymaximal=FALSE)$clusters
  } else if(consider=="maximalonly") {
    clusters <- maximal.clusters(clustout$merge,onlymaximal=TRUE)$clusters
  }
  cat(length(clusters)," candidate gene sets were generated.\n")
  
  cat("Computing p-values...")
  lengths<-sapply(clusters,length)
  if(test.by.prefix) {
    coverageall <- sapply(clusters,function(x)meta.coverage.test(duplicatemerge(mat[x,],names=suffixrm(rownames(mat)[x])),groups=groups,randomised=randomised))
  } else {
    coverageall<-sapply(clusters,function(x)meta.coverage.test(mat[x,],groups=groups,randomised=randomised))
  }
  # Apply weighted Bonferroni Correction
  coverageallc<-coverageall*sapply(lengths,subset.bonferroni,n=nrow(mat),order="size",kmax=max.size,alpha=alpha.bonferroni)
  cat("DONE\n")
  
  # Select Significant clusters
  significantsets<-representative.clusters(clusters,mat,n=Inf,max.p=max.p,pvals=coverageallc)
  # Select interesting clusters
  interestingsets <- significantsets
  maxoverlap <- max.size+1
  while(length(interestingsets$clusters)>max.interesting && maxoverlap>0) {
    maxoverlap <- maxoverlap-1
    interestingsets <- representative.clusters(clusters,mat,n=Inf,max.p=max.p,pvals=coverageallc,maxoverlap=maxoverlap)
  }
  if(length(interestingsets$clusters)==0) {
    interestingsets <- representative.clusters(clusters,mat,n=1,max.p=Inf,pvals=coverageallc,maxoverlap=maxoverlap)
  } else if(length(interestingsets$clusters)>max.interesting) {
    interestingsets <- representative.clusters(clusters,mat,n=max.interesting,max.p=Inf,pvals=coverageallc,maxoverlap=maxoverlap)
  }
  interestingsets$clusters <- relist(rownames(mat)[unlist(interestingsets$clusters)],skeleton = interestingsets$clusters)
  significantsets$clusters <- relist(rownames(mat)[unlist(significantsets$clusters)],skeleton = significantsets$clusters)
  connectedcomponents <- get.connected.components(significantsets$clusters)
  elapsed <- proc.time()-elapsed
  cat(length(significantsets$clusters)," significant gene sets were found, and combined they form a graph with ",length(connectedcomponents)," connected components.\n")
  output <- list(call=call,elapsed=elapsed,significantsets=significantsets,connectedcomponents=connectedcomponents,interestingsets=interestingsets,mat=mat,matd=matd,groups=groups,clustout=clustout,clusters=clusters,raw.p.values=coverageall,corrected.p.values=coverageallc,min.samples=min.samples,maxoverlap=maxoverlap)
  if(!is.null(save.results)) {
    cat("Writing output files ... ")
    #save(call,clustout,elapsed,clusters,significantsets,connectedcomponents,mat,matd,groups,coverageall,coverageallc,maxoverlap,interestingsets,min.samples, file=paste0(save.results,".Rdata"))
    save(output, file=paste0(save.results,".Rdata"))
    description <- data.frame(p.value=coverageall[interestingsets$selection],p.value.corrected=interestingsets$pvals,gene.set=sapply(interestingsets$clusters,paste,collapse=", "))
    write.table(description,paste0(save.results,"-interesting.txt"),row.names=FALSE)
    description <- data.frame(p.value=coverageall[significantsets$selection],p.value.corrected=significantsets$pvals,gene.set=sapply(significantsets$clusters,paste,collapse=", "))
    write.table(description,paste0(save.results,"-significant.txt"),row.names=FALSE)
    description <- data.frame(gene.set=sapply(connectedcomponents,paste,collapse=", "))
    write.table(description,paste0(save.results,"-connectedcomponents.txt"),row.names=FALSE)
    cat("DONE\n")
  }
  #return(list(call=call,elapsed=elapsed,significantsets=significantsets,connectedcomponents=connectedcomponents,interestingsets=interestingsets,mat=mat,matd=matd,groups=groups,clustout=clustout,clusters=clusters,raw.p.values=coverageall,corrected.p.values=coverageallc,min.samples=min.samples,maxoverlap=maxoverlap))
  return(output)
}
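
# Example (not run): a minimal usage sketch for moca.pipeline, assuming this file has
# been sourced. The toy matrix, gene names, and parameter values below are invented
# purely for illustration and are not part of the original pipeline.
if (FALSE) {
  set.seed(1)
  toy <- matrix(runif(20 * 40) < 0.15, nrow = 20, ncol = 40,
                dimnames = list(paste0("GENE", 1:20, "_SNV"), paste0("sample", 1:40)))
  res <- moca.pipeline(toy, max.size = 3, max.merges = 100)
  res$significantsets$clusters
}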

get.connected.components <- function(clusters) {
  # Gives the connected components of the graph obtained by adding an edge between two genes
  # whenever they share a cluster.
  # INPUT:
  # clusters: A list of gene sets.
  # OUTPUT:
  # components: A list of gene sets, where any gene sets with genes in common are merged.
  components<-list()
  while(length(clusters)>=1){
    current <- clusters[[1]]
    matches <- sapply(clusters,function(x)any(x %in% current))
    if(sum(matches)>1) {
      clusters[[1]] <- Reduce(union,clusters[matches])
      matches[1] <- FALSE
      clusters[matches] <- NULL
    } else {
      components <- c(components,clusters[1])
      clusters[[1]] <- NULL
    }
  }
  return(components)
}
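
# Example (not run): two gene sets sharing a gene end up in the same component.
# The gene names are placeholders.
if (FALSE) {
  get.connected.components(list(c("A", "B"), c("B", "C"), c("D", "E")))
  # should give the components c("A", "B", "C") and c("D", "E")
}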

NA.as.FALSE <- function(x){
  # Convert NA entries to FALSE
  x[is.na(x)]<-FALSE
  return(x)
}
potentially.significant <- function(mat,OtherGenesCoverage=0.5,min.genes=0,round.up=TRUE) {
  # Function to filter out genes which do not have a reasonable chance of being
  # significantly anti-co-occurring after Bonferroni correction.
  # 
  # INPUT:
  # mat: A binary matrix of mutations, where each row corresponds to a gene
  # OtherGenesCoverage: (default 50%) The hypothetical (maximum) coverage of the other gene(s) against which
  #                     each gene might be tested. See DETAILS.
  # min.genes: A minimum number of genes to keep in the output (overrides everything else)
  # round.up: It is possible that including some but not all of the genes with a certain minimum mutation
  #           count makes significance possible. If round.up=TRUE (default) none of them are included.
  #           If round.up=FALSE then all of those genes are included (even though they will not be significant).
  #
  # OUTPUT:
  # mat: same as input mat, but without the rows which do not have a reasonable chance of being significant.
  #
  # DETAILS:
  # Only genes with a mutation count greater than or equal to min.samples will be kept.
  # To determine min.samples we use a heuristic based on the binomial approximation,
  # and the idea that it should be possible for the remaining genes to be significantly 
  # anti-co-occurring with respect to a gene that is mutated in a proportion
  # OtherGenesCoverage of the samples (half of the samples by default).
  # A Bonferroni correction of 1/(number of genes kept) is applied.
  samplespergene <- rowSums(mat,na.rm=TRUE)
  #samplespergene <- sort(samplespergene2,decreasing=TRUE)
  #plot(samplespergene,log="y"); lines(1:length(samplespergene),log2(1:length(samplespergene))); #log(1:length(samplespergene),1/(1-samplespergene/ncol(mat)))
  min.samples <- min(samplespergene[samplespergene>log(rank(-samplespergene,ties.method=ifelse(round.up,"max","first"))-1,1/(1-OtherGenesCoverage))])
  # Remove rare mutations
  if(sum(samplespergene>=min.samples)>=min.genes) {
    mat <- mat[samplespergene>=min.samples,,drop=FALSE]
  } else {
    mat <- mat[order(samplespergene,decreasing=TRUE)[1:min.genes],,drop=FALSE]
    warning("min.samples was not enforced due to too few genes remaining. Top ",min.genes," genes used instead.")
  }
  return(mat)
}
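
# Example (not run): a toy screening run with made-up mutation counts (10, 8, 3 and 1
# mutations across 20 samples); the gene mutated in only one sample should be dropped.
if (FALSE) {
  toy <- matrix(FALSE, nrow = 4, ncol = 20, dimnames = list(paste0("g", 1:4), NULL))
  toy[1, 1:10] <- TRUE; toy[2, 1:8] <- TRUE; toy[3, 1:3] <- TRUE; toy[4, 1] <- TRUE
  rownames(potentially.significant(toy))
}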


duplicaterm<- function(mat) {
  # Remove duplicate rows from mat, but keep track of their names.
  # The purpose of duplicaterm is to only keep one example 
  # when multiple genes have the same mutation pattern.
  # Genes that are only different in samples that have missing values are
  # considered as having the same mutation pattern.
  # The OUTPUT is a list containing
  # dlistseed: a list of the rownames in the new mat matrix which had duplicates in the old mat matrix.
  # dlist: a list of vectors of the rownames in the old mat matrix which had the same mutation pattern.
  # The genes in dlist[[i]] have been replaced by the single gene dlistseed[[i]].
  # mat: a new possibly smaller version of the input matrix mat, without duplicate rows.
  # Note that the name used for rows which were duplicated will end with _oro instead of _AMP/_DEL/_SNV
  # oro stands for "or other".
  
  matred <- mat[,!apply(mat,2,function(x)any(is.na(x)))]
  checkforcopies <- function(y) apply(matred,1,function(x)all(x==y,na.rm=TRUE))
  ncopies <- function(y) sum(checkforcopies(y))
  duplicates <- which(duplicated(matred) | duplicated(matred, fromLast = TRUE))
  dlist<-list()
  dlistseed<-list()
  if(length(duplicates)==0) {return(list(dlistseed=dlistseed,dlist=dlist,mat=mat))}
  while(length(duplicates)>0) {
    #print(length(duplicates))
    sel<-duplicates[1]
    aka<-setdiff(which(checkforcopies(mat[sel,])),sel)
    dlist<-c(dlist,list(rownames(mat)[aka]))
    dlistseed<-c(dlistseed,list(rownames(mat)[sel]))
    duplicates<-setdiff(duplicates,c(sel,aka))
  }
  mat<-mat[!(rownames(mat) %in% unlist(dlist)),]
  mn<-match(unlist(dlistseed),rownames(mat))
  namesplus<-strsplit(rownames(mat),split="_")
  namesplus[mn]<-lapply(namesplus[mn],function(x) return(c(x[1],"oro")))
  rownames(mat)<-sapply(namesplus,paste,collapse="_")
  dlist<-mapply(c,dlistseed,dlist)
  names(dlist) <- rownames(mat)[mn]
  dlistseed<-as.list(rownames(mat)[mn])
  return(list(dlistseed=dlistseed,dlist=dlist,mat=mat))
}
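
# Example (not run): two alterations of the same (made-up) gene with identical mutation
# patterns are collapsed into a single row whose name ends in "_oro"; dlist records the
# original row names that were merged.
if (FALSE) {
  toy <- matrix(FALSE, nrow = 3, ncol = 6, dimnames = list(c("g1_SNV", "g1_AMP", "g2_SNV"), NULL))
  toy[1, 1:2] <- TRUE; toy[2, 1:2] <- TRUE; toy[3, 4:6] <- TRUE
  duplicaterm(toy)
}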

duplicatemerge <- function(mat,names=rownames(mat)) {
  # Merges rows of the matrix mat that have the same names
  # Can be used to merge SNP AMP and DEL versions of each gene,
  # so that the gene is considered mutated if any of those three types of alteration is present.
  if(anyDuplicated(names)!=0) {
    for(name in unique(names)) {
      mat[match(name,names),] <- apply(mat[names==name,,drop=FALSE],2,any)#,na.rm=TRUE)
    }
    mat <- mat[match(unique(names),names),,drop=FALSE]
  }
  return(mat)
}
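
# Example (not run): merging the SNV and AMP rows of the same (made-up) gene, so that
# the gene counts as altered if either alteration is present. Note that the merged row
# keeps the name of the first occurrence.
if (FALSE) {
  toy <- matrix(c(TRUE, FALSE, FALSE,
                  FALSE, TRUE, FALSE,
                  FALSE, FALSE, TRUE),
                nrow = 3, byrow = TRUE,
                dimnames = list(c("TP53_SNV", "TP53_AMP", "KRAS_SNV"), NULL))
  duplicatemerge(toy, names = suffixrm(rownames(toy)))
}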


matlist2mat <- function(matlist) {
  # convert a list of matrices/data.frames matlist to
  # a single matrix mat and factor groups indicating which columns
  # came from which matrix in matlist
  groupcounts <- sapply(matlist,ncol)
  groupnames <- names(matlist)
  if(is.null(groupnames)) {groupnames <- paste0("group",1:length(matlist))}
  groups <- factor(rep(groupnames,times=groupcounts))
  
  matlist <- lapply(matlist,function(x)cbind(data.frame(NAMEformerging=rownames(x)),x))
  mat <- matlist[[1]]
  if(length(matlist)>1) {
    for(k in 2:length(matlist)) {
      mat <- merge(mat,matlist[[k]],by=c("NAMEformerging"),all=TRUE)
    }
  }
  rownames(mat) <- mat$NAMEformerging
  mat$NAMEformerging <- NULL
  mat <- as.matrix(mat)
  return(list(mat=mat,groups=groups))
}
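
# Example (not run): combining two made-up per-study matrices into a single matrix plus
# a grouping factor; a gene absent from one study gets NA in that study's columns.
if (FALSE) {
  m1 <- matrix(c(1, 0, 0, 1), nrow = 2, dimnames = list(c("g1", "g2"), c("s1", "s2")))
  m2 <- matrix(c(1, 1, 1, 0), nrow = 2, dimnames = list(c("g1", "g3"), c("s3", "s4")))
  matlist2mat(list(studyA = m1, studyB = m2))
}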

cooccurrence.clustering<-function(matlist, groups=NULL, max.merges=1e3, max.dist=1, max.size=Inf) {
  # Simplified / stripped down version.
  # Generates candidate sets of mutually exclusive mutations through a greedy algorithm similar to hierarchical clustering.
  # The output is a series of merges with heights (similar to hierarchical clustering) 
  # and a matrix containing the mutation pattern for each set considered.
  # INPUT
  # matlist: Either a binary matrix with genes in rows and samples in columns, or a list of such binary matrices having the same number of rows
  # groups: a factor indicating the source of the columns of matlist, which if provided is used to split the single matrix matlist into a list of matrices
  # Only anti-cooccurrence within each group will contribute to significance, not between groups.
  # max.merges: the maximum number of clusters to generate
  # max.dist: the maximum pvalue or height at which to merge two clusters. This should be set as a significance level.
  # max.size: the maximum allowed number of genes in each cluster.
  # OUTPUT
  # A list containing
  # merge: an n by 2 matrix, where n is the number of clusters generated. Row i of merge describes the merging of clusters at step i of the clustering. If an element j in the row is negative, then observation -j was merged at this stage. If j is positive then the merge was with the cluster formed at the (earlier) stage j of the algorithm. Thus negative entries in merge indicate agglomerations of singletons, and positive entries indicate agglomerations of non-singletons.
  # height: a set of n pvalues corresponding to the n merges
  # extended.mat: an extended version of matlist with one additional row for each cluster
  
  if(is.matrix(matlist)) {
    if(is.null(groups)) {
      matlist <- list(onlyone=matlist)
    } else {
      matlist <- by(t(matlist),INDICES=groups,FUN=t,simplify=FALSE)
    }
  }
  Nmats <- length(matlist)
  Nrows <- nrow(matlist[[1]])
  norig <- Nrows
  
  analysing.fun <- function(ij) {
    # Function which returns the p-value for a Fisher exact test
    # between the ij[1] and ij[2] rows. If Nmats>1 then the multiple tests are combined
    
    # Note that if any of the individual pvalues is 1 then the combined pvalue is 1.
    # This means that including information from
    # small samples has the potential of "breaking" the meta-analysis.
    # To avoid this, we use the mid-p-value instead of standard p-values.
    # Using the mid p-value also means lack of evidence against the null leads to lower
    # p-values than weak evidence in favour of the null.
    # Note however that this choice leads to type I error control being approximate.
    
    # Earlier solution: (implemented by commented-out code marked with "AVOIDONES")
    # To avoid this, a group is ignored entirely when
    # a rough estimate of the probability of getting a pvalue of 1 is larger than the 
    # group's normalised (squared) weight in the Stouffer method.
    # Also note that the weights are based on asymptotic theory,
    # so small samples will probably not be given an optimal weight and might create avoidable problems.
    
    #w2sum<-0 # Normalisation for weights
    #meta.p<-0
    at.least.one.computable <- FALSE
    #xytable <- matrix(0,nrow=2,ncol=2)
    
    # initialise quantities
    selectionscore <- numeric(0)
    weights <- numeric(0)
    pvec <- numeric(0)
    
    for(k in 1:Nmats) {
      #xytable <- table(matlist[[k]][ij[1],],matlist[[k]][ij[2],]) #table() is slow so I wrote a simplified version from scratch
      v1 <- matlist[[k]][ij[1],]
      v2 <- matlist[[k]][ij[2],]
      v12nas <- is.na(v1)|is.na(v2)
      v1[v12nas] <- NA
      v2[v12nas] <- NA
      v1s <- sum(v1,na.rm=TRUE)
      v2s <- sum(v2,na.rm=TRUE)
      v12s <- sum(v1&v2,na.rm=TRUE)
      #xytable[2,2] <- v12s
      #xytable[1,2] <- v2s - v12s
      #xytable[2,1] <- v1s - v12s
      #xytable[1,1] <- length(v1) - sum(v12nas) - sum(xytable)
      Ntot <- length(v1)-sum(v12nas)
      
      #if(length(dim(xytable))==2 && all(dim(xytable)==c(2,2)) && v1s>0 && v2s>0)
      if(v1s>0 && v2s>0) {
        at.least.one.computable <- TRUE
        if(Nmats==1) {
          return(phyper(v12s,v1s,Ntot-v1s,v2s))
        } else {
          ##w <- meta.weight(sum(xytable),c(xytable[2,1]+xytable[2,2],xytable[1,2]+xytable[2,2])) # weight
          w <- meta.weight(Ntot,c(v1s,v2s))
          weights <- c(weights,w)
          
          ##AVOIDONES
          #selectionscore <-c(selectionscore,w*(max(v1s,v2s)/Ntot)^-min(v1s,v2s))
          
        }
      } else {
        w <- 0
        #p <- 1
      }
      if(w>0) {
        #p <- fisher.test(xytable,alternative="less",conf.int=FALSE)$p.value # This was unnecessarily slow so replaced with direct p-value calculation
        #p <- phyper(xytable[2,2],xytable[2,1]+xytable[2,2],xytable[1,1]+xytable[1,2],xytable[1,2]+xytable[2,2])
        p <- phyper(v12s,v1s,Ntot-v1s,v2s)
        # Add mid p-value correction
        p <- 0.5*p + 0.5*phyper(v12s-1,v1s,Ntot-v1s,v2s)
        pvec <- c(pvec,p)
        #meta.p <- meta.p+qnorm(p,lower=FALSE)*w
        #w2sum <- w2sum + w^2
      }
    }
    if(!at.least.one.computable) {return(1)}
    
    ##AVOIDONES
    #ord <- order(selectionscore,decreasing=TRUE)
    #cumweight <- cumsum(weights[ord]^2)
    #cumweight[ord] <- cumweight
    #selection <- selectionscore>cumweight
    #return(pnorm(sum(qnorm(pvec[selection],lower=FALSE)*weights[selection]/sqrt(sum(weights[selection]^2))),lower=FALSE))
    
    return(pnorm(sum(qnorm(pvec,lower=FALSE)*weights/sqrt(sum(weights^2))),lower=FALSE))
    #return(pnorm(meta.p/sqrt(w2sum),lower=FALSE))
  }
  
  
  # Initialise quantities
  merge <- matrix(0,ncol=2,nrow=0)
  height <- vector("numeric",0)
  clusters <- as.list(1:norig)
  clustersizes <- rep(1,norig)
  isfull <- rep(FALSE,Nmats)
  
  # Do not consider genes which are always/never mutated.
  permanent.blacklist <- apply(sapply(matlist,rowMeans,na.rm=TRUE),1,function(x)all(x==0)|all(x==1))
  pair.sub <- all.pairs(Nrows)
  pair.sub <- pair.sub[,!((pair.sub[1,] %in% which(permanent.blacklist))|(pair.sub[2,] %in% which(permanent.blacklist))),drop=FALSE]
  # Compute initial distances
  cat("Computing pairwise pvalues ... ")
  pair.dist <- apply(pair.sub,2,analysing.fun)
  cat("DONE\n")
  pair.clustersizes <- rep(2L,length(pair.dist))
  
  #   if(size.downweighting){
  #     sizerank <- rank(rowSums(mat,na.rm=TRUE),ties.method="max")
  #     pair.dist <- pair.dist*pmax(sizerank[pair.sub[1,]],sizerank[pair.sub[2,]])
  #   }
  
  # Main Loop
  #clustersizescount <- rep(0L,max.size)
  #clustersizescount[1] <- norig
  counter <- 0
  temp.max.size <- 2
  cat("Generating clusters of size ",temp.max.size," ... ")
  while(counter<max.merges && temp.max.size<=max.size) {
    if(counter >= max.merges*(temp.max.size-1)/(max.size-1)) {
      temp.max.size <- temp.max.size+1
      cat("DONE\nGenerating clusters of (maximum) size ",temp.max.size," ... ")
    }
    select <- which.min(pair.dist+(pair.clustersizes>temp.max.size))
    new.height <- pair.dist[select]
    if(new.height >= max.dist) {
      temp.max.size <- temp.max.size+1
      cat("DONE\nGenerating clusters of (maximum) size ",temp.max.size," ... ")
      next
    }
    pair <- pair.sub[,select]
    # Make sure the selected pair will never be selected again
    pair.dist[select] <- Inf
    new.cluster <- c(clusters[[pair[1]]],clusters[[pair[2]]])
    # Avoid creating duplicate entries.
    if(any(sapply(clusters,setequal,y=new.cluster))) next
    # Merging of the new cluster with blacklisted clusters will not be attempted.
    #blacklist <- sapply(clusters,function(x,y)length(intersect(x,y)),y=new.cluster)>0
    blacklist <- sapply(clusters,function(x,y)any(y %in% x),y=new.cluster)
    blacklist <- blacklist|permanent.blacklist
    counter <- counter+1
    merge <- rbind(merge,pair.sub[,select])    
    clusters <- c(clusters,list(new.cluster))
    height <- c(height,new.height)
    clustersizes <- c(clustersizes,length(new.cluster))
    #clustersizescount[length(new.cluster)] <- clustersizescount[length(new.cluster)]+1
    # Extend inputs
    #mat<-rbind(mat,ifelse(is.na(mat[pair[1],])|is.na(mat[pair[2],]),NA,mat[pair[1],]|mat[pair[2],])) # old version
    Nrows <- Nrows+1
    for(k in 1:Nmats) {
      matlist[[k]] <- rbind(matlist[[k]],matlist[[k]][pair[1],]|matlist[[k]][pair[2],])
      # Note that | produces the right behaviour with missing data.
      isfull[k] <- all(matlist[[k]][Nrows,],na.rm=TRUE)
    }
    # List new combinations to try
    new.pair.sub <- rbind(Nrows,(1:(Nrows-1))[!blacklist])
    #if(max.size!=Inf) {
    # Avoid merges that would lead to excessively large clusters
    new.pair.clustersizes <- clustersizes[new.pair.sub[1,]]+clustersizes[new.pair.sub[2,]]
    new.pair.sub <- new.pair.sub[,new.pair.clustersizes<=max.size,drop=FALSE]
    new.pair.clustersizes <- new.pair.clustersizes[new.pair.clustersizes<=max.size]
    #}
    if(all(isfull)|clustersizes[Nrows]>=max.size) {
      # If all samples are already mutated for a cluster, it doesn't make sense to try further merging.
      permanent.blacklist<-c(permanent.blacklist,TRUE)
      next
    } else {
      permanent.blacklist<-c(permanent.blacklist,FALSE)
    }
    # Compute additional distances
    new.pair.dist <- apply(new.pair.sub,2,analysing.fun)
    pair.dist<-c(pair.dist,new.pair.dist)
    pair.sub<-cbind(pair.sub,new.pair.sub)
    pair.clustersizes <- c(pair.clustersizes,new.pair.clustersizes)
  }
  cat("DONE\n")
  merge[merge<=norig] <- -merge[merge<=norig]
  merge[merge>norig] <- merge[merge>norig]-norig
  return(list(merge=merge,height=height,extended.mat=matlist))
}
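
# Example (not run): candidate sets from a small made-up matrix in which three genes
# are perfectly mutually exclusive and a fourth is mutated at random.
if (FALSE) {
  set.seed(2)
  toy <- matrix(FALSE, nrow = 4, ncol = 30, dimnames = list(paste0("g", 1:4), NULL))
  toy[1, 1:10] <- TRUE; toy[2, 11:20] <- TRUE; toy[3, 21:30] <- TRUE
  toy[4, sample(30, 8)] <- TRUE
  out <- cooccurrence.clustering(toy, max.size = 3)
  maximal.clusters(out$merge)$clusters
}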

all.pairs <- function(nvar){
  # For nvar variables, output a matrix of all pairs of variables. 
  # Each column is of the form c(i,j) corresponding to the pair (i,j).
  ind2sub <- function(ind,nrow)  c(r = ((ind-1) %% nrow) + 1, floor((ind-1) / nrow) + 1) # Turn vector index into pair of (i,j) matrix indices
  type <- matrix(FALSE,nrow=nvar,ncol=nvar)
  pair.ind <- which(lower.tri(type))
  #rm(type)
  pair.sub <- apply(matrix(pair.ind),1,ind2sub,nrow=nvar) # each column corresponds to a pair of mutations
  dimnames(pair.sub) <- NULL
  #rownames(pair.sub) <- c("i","j");
  return(pair.sub)
}
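
# Example (not run): for 4 variables, all.pairs returns the 6 unordered pairs as columns:
# (2,1), (3,1), (4,1), (3,2), (4,2), (4,3).
if (FALSE) {
  all.pairs(4)
}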

pinterhyper <- function(q,m,N,ntries=1,tolerance=1e-10) {
  # Outputs the probability that the intersection of random sets of size m[1],m[2],etc 
  # in a universe of size N is less than or equal to q.
  # ntries is the number of genes for multiple hypothesis correction (work in progress)
  # tolerance is the error tolerance for warning about probabilities >1.
  
  if(length(m)==1) return(as.numeric(q>=m))
  if(length(m)==2) return(phyper(q,m[2],N-m[2],m[1]))
  #if(length(m)==2) return(-expm1(phyper(q,m[2],N-m[2],m[1],lower.tail=FALSE,log.p=TRUE)*ntries))
  k<-m[1]
  pk<-1
  for(iter in 2:(length(m)-1)){
    # Safety checks have been commented out
    #if(any(is.na(pk))) {print("x\nx");print(which(is.na(pk)));print("iter");print(iter);print("m");print(m[iter]);print("pk");print(rbind(k,pk))}
    k<-k[pk>0&!is.na(pk)]
    pk<-pk[pk>0&!is.na(pk)]
    #if(sum(pk)<0.999|sum(pk)>1.001) {print("pksum");print(sum(pk))}
    #if(any(is.na(k))|length(k)==0) {print("kkk");print(k);}
    #if(is.na(m[iter])) {print("mmm");print(m);print(iter)}
    x<-max(min(k)-(N-m[iter]),0):min(m[iter],max(k))
    if(ntries==1) {
      p<-dhyper(rep(x,each=length(k)),m[iter],N-m[iter],rep(k,times=length(x)))
    } else {
      p<-dhyper.corrected(rep(x,each=length(k)),m[iter],N-m[iter],rep(k,times=length(x)),ntries=ntries)
    }
    pk<-colSums(matrix(p,ncol=length(x))*pk)
    k<-x
  }
  if(ntries==1){
    p<-phyper(q,m[length(m)],N-m[length(m)],k)
  } else {
    p<- -expm1(phyper(q,m[length(m)],N-m[length(m)],k,lower.tail=FALSE,log.p=TRUE)*ntries)
  }
  pk<-sum(p*pk)
  if(pk>1) {
    if(pk>1+tolerance) {warning("Produced a probability of ",pk," this will be rounded to 1")}
    return(1)
  }
  if(pk<0) {
    warning("Produced a probability of ",pk," this will be rounded to 0")
    return(0)
  }
  return(pk)
}
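
# Example (not run): with two sets, pinterhyper reduces to the hypergeometric CDF, which
# gives a quick sanity check; with three or more sets the recursion above is used.
# The set sizes below are arbitrary.
if (FALSE) {
  pinterhyper(0, m = c(5, 5), N = 20)    # P(two random 5-subsets of 20 samples are disjoint)
  phyper(0, 5, 15, 5)                    # same value by construction
  pinterhyper(1, m = c(8, 7, 6), N = 25) # three sets
}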

coverage.test <- function(mat,ntries=1,mid.pvalue=FALSE,randomised=FALSE) {
  # Statistical test for whether the coverage of a gene set is significantly large
  # given mutation counts for each gene and total sample size
  # INPUT
  # mat: a binary matrix where each row corresponds to a gene from the gene set and each column to a sample
  # ntries: (not fully implemented / obsolete) the multiple hypothesis correction (e.g. number of total genes).
  # mid.pvalue: If TRUE compute the mid p-value for the test instead of the p-value
  # randomised: If TRUE randomise the p-value so that it follows a 
  # continuous U(0,1) null distribution instead of the default conservative 
  # discrete distribution.
  # Setting randomised=TRUE overrides mid.pvalue.
  # OUTPUT
  # The pvalue of the test
  if(any(is.na(mat))) {mat<-mat[,!apply(mat,2,function(x)any(is.na(x))),drop=FALSE]}
  q<-sum(apply(!mat,2,all))
  m<-rowSums(!mat)
  names(m)<-NULL
  p.value <- pinterhyper(q,m,ncol(mat),ntries)
  if(randomised) {
    U <- runif(1)
  } else if(mid.pvalue) {
    U <- 0.5
  } else {
    # Standard p-value
    return(p.value)
  }
  # Weighted p-value (either mid or randomised): mix the CDF at q and at q-1
  return(U*p.value+(1-U)*pinterhyper(q-1,m,ncol(mat),ntries))
}
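
# Example (not run): three made-up, perfectly mutually exclusive genes covering all
# 12 samples should give a small p-value; randomised=TRUE returns a slightly different
# value on each call.
if (FALSE) {
  toy <- matrix(FALSE, nrow = 3, ncol = 12)
  toy[1, 1:4] <- TRUE; toy[2, 5:8] <- TRUE; toy[3, 9:12] <- TRUE
  coverage.test(toy)
  coverage.test(toy, randomised = TRUE)
}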

# Now let's generalise coverage.test to allow for a meta-analysis of multiple cancer types.
meta.weight <- function(N,n) {
  # Compute the unnormalised weight for Stouffer's meta-analysis given to a certain group or cancer type Z-score
  # N is the total number of samples (ncol(mat))
  # n is a vector containing the number of mutations for each gene in the set. (rowSums(mat))
  # (Both N and n are based only on data from a specific cancer type / group)
  # The weight is based on a heuristic and asymptotic approximation of the power
  
  m <- N-n
  # For each pair of genes compute the asymptotic standard deviation of the sample log-odds-ratio.
  if(length(n)==2) {
    #z*sqrt(n) is the standard deviation
    z <- sqrt(1/(n[1]*n[2])+1/(n[1]*m[2])+1/(m[1]*n[2])+1/(m[1]*m[2]))
    w <- 1/z
  } else {
    #z <- sqrt(1/outer(n,n)+1/outer(n,m)+1/outer(m,n)+1/outer(m,m))
    #w <- sum(1/z[upper.tri(z)])
    z <- 1/outer(n,n)+1/outer(n,m)+1/outer(m,n)+1/outer(m,m)
    w <- sqrt(sum(1/z[upper.tri(z)]))
  }
  # Note that 1/0=Inf so 1/(1/0+...)=0. Luckily this is what we want.
  # (NO LONGER APPLIES TO THIS VERSION: 1/w is (proportional to) the harmonic mean of standard deviations.)
  # It would be (proportional to) a good approximation for the standard deviation of our test if the overlaps between pairs were independent.
  w <- w/sqrt(N) #*sqrt(2/length(n))
  return(w)
}
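
# Example (not run): with the made-up counts below, the larger group receives the
# larger (unnormalised) Stouffer weight.
if (FALSE) {
  meta.weight(N = 200, n = c(20, 30))
  meta.weight(N = 40, n = c(4, 6))
}
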
meta.coverage.test <- function(mat,groups=NULL,ntries=1,avoidones=FALSE,randomised=FALSE) {
  # Statistical test for whether the coverage of a gene set is significantly large
  # given mutation counts for each gene and total sample size
  # INPUT
  # mat: a binary matrix where each row corresponds to a gene from the gene set and each column to a sample
  # groups: a factor for column grouping (e.g. type of cancer for each sample).
  # We condition on the mutation counts in each group so that between-group patterns are not counted.
  # pvalues from different groups are combined using Stouffer's method.
  # ntries: (not fully implemented) the multiple hypothesis correction (e.g. number of total genes).
  # avoidones: if TRUE, a heuristic will be used to avoid pvalues of 1 from small groups. (additional comments in code)
  # randomised: if TRUE, randomised p-values are used within each group (passed on to coverage.test).
  # OUTPUT
  # The pvalue of the test
  if(is.null(groups)) {return(coverage.test(mat,ntries=ntries,randomised=randomised))}
  if(any(is.na(mat))) { #Handle missing values
    uselesssamples <- apply(mat,2,function(x)any(is.na(x)))
    # Note: for future work, also group by by is.na on each row of mat, to use some of the partially-missing data better.
    # uselesssamples <- apply(mat,2,function(x)sum(!is.na(x)))<2
    # etc
    mat <- mat[,!uselesssamples,drop=FALSE]
    groups <- groups[!uselesssamples,drop=TRUE]
  }
  
  Nsamplist <- tapply(rep(1,length(groups)),groups,sum)
  nmutlist <- by(t(mat),INDICES=groups,FUN=colSums,na.rm=TRUE)
  #print(nmutlist)
  weights <- mapply(meta.weight,N=Nsamplist,n=nmutlist)
  # Compute individual pvalues
  if(any(weights>0)) {
    if(avoidones) {
      # Avoid using groups where the probability of getting a pvalue of 1
      # is so large that it "outweighs" the contribution from that group.
      # In practice we check that the normalised square weight is larger then the probability
      # of getting a pvalue of 1 under the null, which is computed using a binomial approximation.
      # Note that this is just a heuristic.
      
      selection <- weights^2*(sapply(nmutlist,max)/Nsamplist)^-sapply(nmutlist,function(x)sum(x)-max(x))
      ord <- order(selection,decreasing=TRUE)
      cumweight <- cumsum(weights[ord]^2)
      cumweight[ord] <- cumweight
      selection <- selection>cumweight
    } else {
      selection <- weights>0
    }
    # Note that all the fancy weighting happens before accessing any information about the amount of overlap.
    pvec <- by(t(mat),INDICES=groups,FUN=function(x)coverage.test(t(x),randomised=randomised))
    # Do weighted Stouffer scoring. We ignore weights of 0 since they produce NaNs.
    p <- pnorm(sum(qnorm(pvec[selection],lower=FALSE)*weights[selection]/sqrt(sum(weights[selection]^2))),lower=FALSE)
    return(p)
  } else {
    warning("No useable data.")
    return(1)
  }
}
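
# Example (not run): the same made-up matrix analysed with and without a grouping
# factor; with groups, only within-group exclusivity contributes to the combined p-value.
if (FALSE) {
  toy <- matrix(FALSE, nrow = 2, ncol = 20)
  toy[1, c(1:5, 11:14)] <- TRUE; toy[2, c(6:10, 15:18)] <- TRUE
  grp <- factor(rep(c("typeA", "typeB"), each = 10))
  meta.coverage.test(toy, groups = grp)
  meta.coverage.test(toy)  # no groups: a single coverage.test on the whole matrix
}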

representative.clusters <- function(clusters,mat=NULL,n=Inf,max.p.value=0.05,pvals=NULL,groups=NULL,kmax=10,maxoverlap=Inf) {
  # This function selects a sublist of clusters from the input clusters.
  # Choose only the clusters with p-values (pvals) less than max.p.value.
  # If input pvals is missing, pvalues will be computed from mat (and groups)
  # If more than n clusters are obtained limit the output to n clusters.
  # Clusters outputted will be sorted from most to least significant.
  # maxoverlap controls the maximum number of genes in a new cluster that are allowed to overlap with a previously selected cluster
  # Setting maxoverlap=0 will output only disjoint clusters.
  # If a cluster already has a superset/subset in the output (with a smaller p-value), then it will not be added to the output.
  # OUTPUT
  # A list containing
  # clusters: the selected clusters (ordered from smallest to largest pvalue)
  # selection: the indices of the selected clusters in the input list clusters
  # pvals: the pvalues of the selected clusters (taken from input pvals)
  lengths<-sapply(clusters,length)
  if(is.null(pvals)) {
    coverageall<-sapply(clusters,function(x)meta.coverage.test(mat[x,],groups=groups))
    pvals<-coverageall*sapply(lengths,subset.bonferroni,n=nrow(mat),order="size",kmax=kmax)
  }
  if(length(clusters) != length(pvals)) {stop("pvals and clusters should have the same length")}
  if(any(is.na(pvals))) {
    warning("pvals ",paste(which(pvals),collapse=", "),"are NA.")
    pvals[is.na(pvals)] <- Inf
  }
  ind <- seq(along.with=pvals)
  ind <- ind[pvals<=max.p.value]
  clusters <- clusters[pvals<=max.p.value]
  lengths <- lengths[pvals<=max.p.value]
  pvals <- pvals[pvals<=max.p.value]
  
  if(length(clusters)<1) {
    selection <- integer(0)
    return(list(clusters=clusters[selection],selection=selection,pvals=pvals[selection])) 
  }
  ord <- order(pvals)
  clusters <- clusters[ord]
  pvals <- pvals[ord]
  lengths <- lengths[ord]
  ind <- ind[ord]
  
  clustersmat<-clusters2mat(clusters,genes=unique(unlist(clusters)))
  overlap<-t(clustersmat) %*% clustersmat
  
  # Find whether any subset of a gene set was already included as a more significant gene set.
  # Also remove a set if it is a subset of a previously included gene set.
  overlap1 <- overlap
  overlap1[lower.tri(overlap,diag=TRUE)] <- 0
  overlap2 <- t(overlap1)
  overlap1 <- overlap1/lengths
  overlap1 <- overlap1 == 1
  overlap2 <- overlap2/lengths
  overlap2 <- overlap2 == 1
  selection <- sort(which(!(apply(overlap1,2,any)|apply(overlap2,1,any))))
  if(maxoverlap<max(lengths)) { # No point in checking for overlaps if this is false.
    k <- 2
    while(k<=length(selection)) {
      if(any(overlap[selection[k],selection[1:(k-1)]]>maxoverlap)) {
        selection <- selection[-k]
      } else {
        k <- k+1
      }
    }
  }
  if(length(selection)>n) {selection <- selection[1:n]}
  return(list(clusters=clusters[selection],selection=ind[selection],pvals=pvals[selection]))  
}
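
# Example (not run): selecting non-redundant gene sets when p-values are supplied
# directly (the sets and p-values below are made up). The subset c("A","B") should be
# dropped because its superset c("A","B","C") is more significant.
if (FALSE) {
  cl <- list(c("A", "B", "C"), c("A", "B"), c("D", "E"))
  representative.clusters(cl, pvals = c(0.001, 0.01, 0.02), max.p.value = 0.05)
}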


subset.bonferroni <- function(n,k,nmax=n,kmin=2,kmax=k,order=c("size","sizeapprox","stratifiedsize","genesize","simple"),alpha=0.05) {
  # Computes the weighted Bonferroni correction factor for a gene set of size k.
  # We will assume in this explanation that order="size" (the default).
  # n|nmax is the number of genes to choose from
  # kmin, kmax are the smallest (resp. largest) number of genes allowed in a gene set
  # the weight is chosen such that the probability of a passenger mutation being added
  # to an alteration set is less than alpha
  #(explanation of the order types:
  # sizeapprox: same as size but uses an approximate formula valid for small alpha, large n and large kmax.
  #   This can be used when "size" returns NaN due to overflow.
  # simple: unweighted bonferroni correction
  # genesize: order the hypotheses by the coverage of the least mutated gene
  #   stratifiedsize: perform a Bonferroni correction separately for each set size, and then for the allowed range of gene set sizes.)
  order <- match.arg(order)
  if(order=="size") {
    #\frac{\sum_{\ell=2}^{k_{\max}}{m \choose \ell}\prod_{k=2}^{\ell}\left(1-(1-\alpha)^{\frac{1}{m-k+1}}\right)}{\prod_{k=2}^{|M|}\left(1-(1-\alpha)^{\frac{1}{m-k+1}}\right)}
    return(sum(choose(nmax,kmin:kmax)*cumprod(-expm1(log1p(-alpha)/(nmax+1-kmin:kmax))))/prod(-expm1(log1p(-alpha)/(nmax+1-kmin:k))))
  }
  if(order=="sizeapprox") {
    return((expm1(alpha)-alpha)*prod((n-k+1):n)/alpha^k)
  }
#   if(order=="size") {
#     return(choose(nmax,k)*factorial(k)/sum(((2*alpha)^(kmin:kmax)/factorial(kmin:kmax)))*(2*alpha)^(-k))
#   }
  if(order=="stratifiedsize") {
    return(choose(nmax,k)*(kmax-kmin+1))
    #return(sum(choose(nmax,kmin:k)*sum(choose(nmax,kmin:kmax)/cumsum(choose(nmax,kmin:kmax)))))
  }
  if(order=="genesize") {
    return(sum(choose(n,kmin:k))*(log(sum(choose(nmax,kmin:kmax)))+1))
  }
  if(order=="simple") {
    return(sum(choose(nmax,kmin:kmax)))
  }
}
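
# Example (not run): correction factors for gene sets of size 2 to 5 when choosing from
# 100 genes and allowing sets of up to 5 genes, compared with a flat Bonferroni correction.
if (FALSE) {
  sapply(2:5, subset.bonferroni, n = 100, order = "size", kmax = 5)
  sapply(2:5, subset.bonferroni, n = 100, order = "simple", kmax = 5)
}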

suffixrm <- function(x,part=1,split="_") {
  # By default, assume x is a vector or (recursive) list containing character strings of the form
  # "prefix_suffix"
  # and return an object of the same length/shape, where only
  # "prefix"
  # remains.
  # Setting part=2 would keep the suffix instead of the prefix.
  y <- unlist(x)
  y <- strsplit(y,split=split,fixed=TRUE)
  y <- sapply(y,function(x)x[part])
  return(relist(y,skeleton=x))
}
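
# Example (not run): stripping the alteration-type suffix from prefix_suffix names.
if (FALSE) {
  suffixrm(c("TP53_SNV", "KRAS_AMP"))            # "TP53" "KRAS"
  suffixrm(c("TP53_SNV", "KRAS_AMP"), part = 2)  # "SNV"  "AMP"
}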

clusters2mat <- function(clusters,genes=NULL,genenames=NULL) {
  # Convert a list of clusters to a binary matrix (genes in rows, clusters in columns)
  if(is.null(genes)) genes<-unique(unlist(clusters))
  if(is.null(genenames)) genenames<-genes
  genesvsclusters<-sapply(clusters,function(x,genes) genes %in% x,genes=genes)
  rownames(genesvsclusters)<-genenames
  colnames(genesvsclusters)<-names(clusters)
  return(genesvsclusters)
}
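
# Example (not run): the genes-by-clusters indicator matrix for two overlapping sets.
if (FALSE) {
  clusters2mat(list(set1 = c("A", "B"), set2 = c("B", "C")))
}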

maximal.clusters <- function(merge,onlymaximal=TRUE) {
  # Generate the clusters based on the merges described in merge (see hclust)
  # OUTPUT:
  # A list containing:
  # clusters: a list of clusters (only the maximal clusters are retained if onlymaximal=TRUE (default))
  # ismaximal: a logical vector indicating which of the merges generate maximal clusters
  names(merge)<-NULL
  clusters<-list()
  for(k in 1:nrow(merge)){
    tmp<-sort(merge[k,])
    if(all(tmp<0)) {
      new.cluster <- -tmp
    } else {
      if(tmp[1]<0) {
        new.cluster <- c(-tmp[1],clusters[[tmp[2]]])
      } else {
        new.cluster <- c(clusters[[tmp[1]]],clusters[[tmp[2]]])
      }      
    }
    clusters<-c(clusters,list(new.cluster))
  }
  ismaximal1 <- !((1:nrow(merge)) %in% merge[merge>0])
  if(!onlymaximal) allclusters<-clusters
  clusters <- clusters[ismaximal1]
  ismaximal2 <- !sapply(clusters,function(x,clusters) sum(sapply(clusters,function(cc,x)all(x %in% cc),x=x))>1,clusters=clusters)
  clusters <- clusters[ismaximal2]
  ismaximal1[ismaximal1]<-ismaximal2
  if(!onlymaximal) clusters<-allclusters
  return(list(clusters=clusters,ismaximal=ismaximal1))
}
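
# Example (not run): a merge matrix in the format produced by cooccurrence.clustering;
# step 1 joins genes 1 and 2, step 2 adds gene 3 to the result of step 1.
if (FALSE) {
  mg <- rbind(c(-1, -2), c(1, -3))
  maximal.clusters(mg)                      # only the maximal set {1, 2, 3}
  maximal.clusters(mg, onlymaximal = FALSE) # both {1, 2} and {1, 2, 3}
}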

subsets <- function(set,min.size=1,max.size=Inf) {
  # List all subsets of a set (vector)
  # subsets of sizes between min.size and max.size only are considered.
  max.size <- min(max.size,length(set))
  if(min.size>max.size) {return(list())}
  out <- lapply(min.size:max.size,function(x)combn(set,x,simplify=FALSE))
  out <- unlist(out,recursive=FALSE)
  return(out)
}

all.subclusters <- function(clusters,min.size=2,max.size=Inf) {
  # Given a list of sets clusters,
  # produce a list of all subsets of the sets.
  # The size of subsets must be between min.size and max.size
  out <- lapply(clusters,subsets,min.size=min.size,max.size=max.size)
  out <- unlist(out,recursive=FALSE)
  out <- out[!duplicated(out)]
  return(out)
}
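
# Example (not run): all subsets of size >= 2 of a three-gene set, and the de-duplicated
# subsets of two overlapping candidate sets.
if (FALSE) {
  subsets(c("A", "B", "C"), min.size = 2)
  all.subclusters(list(c("A", "B", "C"), c("B", "C", "D")))
}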

min.possible.coverage.test <- function(nsample,k,correct=FALSE,nmax=k,alpha=0.05) {
  # Returns the minimum possible p-value from coverage.test given
  # nsample: the number of samples
  # k: the number of genes in the gene set
  # If correct=TRUE, the weighted Bonferroni correction (for nmax candidate genes at level alpha) is applied.
  n <- nsample
  mat <- matrix(FALSE,nrow=k,ncol=n)
  g <- rep(floor(n/k),k)+c(rep(1,n%%k),rep(0,k-(n%%k)))
  cumg <- c(0L,cumsum(g))
  for(i in 1:k)  {
    mat[i,(cumg[i]+1):cumg[i+1]] <- TRUE
  }
  out <- coverage.test(mat)
  if(correct) {out <- out*subset.bonferroni(n=nmax,kmax=k,k=k,alpha=alpha)}
  return(out)
}
get.kmax.upperbound <- function(nsample,nmax,alpha=0.05,correct=TRUE) {
  # Returns the largest gene set size (up to nmax) for which the minimum possible
  # (optionally Bonferroni-corrected) p-value from coverage.test is still at most alpha.
  for(k in 2:nmax) {
    if(min.possible.coverage.test(nsample=nsample,k=k,correct=correct,alpha=alpha,nmax=nmax)>alpha) {
      return(k-1)
    }  
  }
  return(nmax)
}
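
# Example (not run): the largest gene set size for which significance at alpha = 0.05 is
# achievable at all, given 50 samples and 200 candidate genes (values chosen arbitrarily).
if (FALSE) {
  get.kmax.upperbound(nsample = 50, nmax = 200)
}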