https://github.com/taylor-lab/hotspots
Tip revision: 733c727bd4b9f7c1a7f4508b9a467b2f31cacf33 authored by changmt on 01 May 2017, 15:06:09 UTC
Updated Google link to MAF
Updated Google link to MAF
Tip revision: 733c727
funcs.R
# functions for hotspot_algo.R
# returns baseline expected probability given gene length, gene mutation burden, and total samples
get.probability=function(gene,aa,total.muts,total.samples,aa.length) {
top=sum(aa$count[which(aa$toohot)])
aa.length=aa.length-length(which(aa$toohot))
total.muts=total.muts-top
return((1/aa.length)*(total.muts/total.samples))
}
# returns the mutability of the codon given the gene
get.alpha=function(k,aa,mu_prot){
if(is.na(aa$mu_position[k])) return(1)
else return(aa$mu_position[k]/mu_prot)
}
# returns log10 p-value of the binomial distribution
get.pvalues=function(k,total.muts,aa.length,mu_prot,aa,gene,min_prob,total.samples=TOTAL_SAMPLES) {
prob=max(get.probability(gene,aa,total.muts,total.samples,aa.length),min_prob)
alpha=get.alpha(k,aa,mu_prot)
prob=prob*alpha
pbinom(as.numeric(as.character(aa$count[k]))-1,size=total.samples,prob=prob,
lower.tail=FALSE,log.p=TRUE)/log(10,exp(1))
}
# returns the table and column names of table into a single string
combine=function(tb,sep=":") {
out=paste(names(tb)[1],tb[1],sep=sep)
if(length(tb)>1) for(i in 2:length(tb)) out=paste(out,paste(names(tb)[i],tb[i],sep=sep),sep="|")
return(out)
}
# returns the number and types of mutant residues of the codon
get.variant.aa=function(position,mf) {
mf=mf[ which(mf$Amino_Acid_Position==position), ]
variant=rev(sort(table(mf$Variant_Amino_Acid)))
variant=combine(variant)
codon=rev(sort(table(mf$Codons)))
codon=combine(codon)
genomic_pos=rev(sort(table(paste(mf$Chromosome,mf$Start_Position,sep=":"))))
genomic_pos=combine(genomic_pos,sep="_")
return(c(variant,codon,genomic_pos))
}
# returns the number of cancer types with the mutation
get.cancer.type=function(position,mf,d=d) {
mf=mf[ mf$Amino_Acid_Position==position, ]
cnt=c()
for(i in unique(mf$TUMORTYPE)) {
tm=mf[ mf$TUMORTYPE==i, ]
cnt=rbind(cnt,c(i,length(unique(d$Master_ID[ d$TUMORTYPE==i ])),nrow(mf[ mf$TUMORTYPE==i,])))
}
cnt=as.data.frame(cnt)
colnames(cnt)=c('tumortype','samples_muted','mutations')
cnt$mutations=as.numeric(as.character(cnt$mutations))
cnt=cnt[ order(-cnt$mutations), ]
output=paste(cnt$tumortype[1],cnt$samples_muted[1],cnt$mutations[1],sep=':')
if(nrow(cnt)>1) for(i in 2:nrow(cnt)) output=paste(output,paste(cnt$tumortype[i],cnt$samples_muted[i],cnt$mutations[i],sep=':'),sep="|")
return(output)
}
# returns the count of tri-nucleotide contexts of the codon
get.reference.tri=function(pos,maf) {
table(maf$Ref_Tri[ which(maf$Amino_Acid_Position==pos) ])
}
# return the weighted average tri-nucleotide mutability of the codon
get.mu.score=function(pos,maf,mu) {
# given amino acid position, get count of all the reference tri-nucleotide context
t=get.reference.tri(pos,maf)
# get the mutability scores of those tri-nucleotides
ind=match(names(t),mu$tri)
# return the weighted average
return(sum(mu$mu[ind]*t)/sum(t))
}
# return the counts and types of tri-nucleotide contexts
get.all.tri=function(pos,maf){
t=rev(sort(table(maf$Ref_Tri[ which(maf$Amino_Acid_Position==pos) ])))
return(paste(names(t),collapse="|"))
}
# returns the reference amino acid
get.reference.aa=function(pos,maf) {
tb=table(maf$Reference_Amino_Acid[ which(maf$Amino_Acid_Position==pos) ])
tb=tb[ order(tb,decreasing=T) ]
return(combine(tb))
}
# returns dbSNP/1000G/NHLBI ids, if any
get.rsid=function(pos,maf) {
tb=table(maf$dbSNP_RS[ which(maf$Amino_Acid_Position==pos) ])
tb=tb[ order(tb,decreasing=T) ]
if(length(tb)==1) if(names(tb)=='') return('')
return(combine(tb))
}
# return perentage into decimal
convert.to.decimal=function(num){
if(is.na(num)) return(NA)
if(grepl('%',num)) num=gsub('%','',num)
num=as.numeric(as.character(num))
if(num > 1) num=num/100
return(num)
}
# return the variant allele frequency (VAF) ranks of the mutation in the sample
get.sample.rank=function(sample,gene,pos,maf) {
sample=d[ which(d$Master_ID==sample), ]
sample$allele_freq=as.numeric(as.character(unlist(lapply(sample$allele_freq,convert.to.decimal))))
af.rank=sort(unique(sample$allele_freq),decreasing=T)
return(which(af.rank==max(sample$allele_freq[ which(sample$Hugo_Symbol==gene & sample$Amino_Acid_Position==pos) ]))/length(af.rank))
}
# return all the VAF ranks of the mutation in all samples with the mutation
get.af.rank=function(pos,maf) {
samples=maf$Master_ID[ which(maf$Amino_Acid_Position==pos & !is.na(maf$allele_freq)) ]
if(length(samples)==0) return(c(NA,NA))
af.rank=unlist(lapply(samples,get.sample.rank,gene=unique(maf$Hugo_Symbol),pos=pos,maf=maf))
return(c(median(af.rank),paste(length(af.rank),paste(af.rank,collapse=':'),sep="|")))
}
# return the cancer cell fractions (CCF) of the mutation
get.ccf=function(pos,maf) {
ccfs=maf$ccf[ which(maf$Amino_Acid_Position==pos & !is.na(maf$ccf)) ]
if(length(ccfs)==0) return(NA)
return(paste(ccfs,collapse=":"))
}
# perform binomial test for all mutations observed in gene
binom.test_snp=function(gene) {
# Reduce maf down to just of gene G
maf=d[ which(d$Hugo_Symbol==gene), ]
# Get count of mutations as each amino acid position
aa=table(maf$Amino_Acid_Position)
# Get amino acid length of the gene
aa.length=max(maf$Protein_Length)
# Find the mutability of the gene (pre-calculated)
# If gene does not exist, use the average mutability of all genes
ii=which(p$gene==gene)
if(any(ii)) mu_protein=p$score[ p$gene==gene ] else mu_protein=mean(p$score)
# Find the mutability of the amino acid position
mu_position=unlist(lapply(as.numeric(names(aa)),get.mu.score,maf=maf,mu=mu))
aa=cbind(as.data.frame(aa),mu_position)
colnames(aa)=c('pos','count','mu_position')
aa$mu_position=as.numeric(as.character(aa$mu_position))
aa$pos=as.numeric(as.character(aa$pos))
aa$count=as.numeric(as.character(aa$count))
aa$tri=unlist(lapply(aa$pos,get.all.tri,maf=maf))
af.ranks=as.data.frame(do.call('rbind',lapply(aa$pos,get.af.rank,maf=maf)))
colnames(af.ranks)=c('Median_Allele_Freq_Rank','Allele_Freq_Rank')
if('ccf' %in% colnames(maf)) ccfs=unlist(lapply(aa$pos,get.ccf,maf=maf))
else ccfs=rep(NA,nrow(aa))
aa$toohot=FALSE
cutoff=max(as.numeric(quantile(aa$count,0.99)),20)
aa$toohot[ which(aa$count > cutoff) ]=TRUE
# cat(' Compiling output...\n')
info=as.data.frame(do.call('rbind',lapply(aa$pos,get.variant.aa,mf=maf)))
colnames(info)=c('Variant_Amino_Acid','Codon_Change','Genomic_Position')
cancer.types=as.data.frame(do.call('rbind',lapply(aa$pos,get.cancer.type,mf=maf,d=d)))
colnames(cancer.types)='Samples'
output=cbind('Hugo_Symbol'=rep(gene,nrow(aa)),'Amino_Acid_Position'=aa$pos,
'log10_pvalue'=unlist(lapply(1:nrow(aa),get.pvalues,total.muts=sum(aa$count),aa.length=aa.length,mu_prot=mu_protein,aa=aa,gene=gene,min_prob=min_prob)),
'Mutation_Count'=aa$count,'Reference_Amino_Acid'=unlist(lapply(aa$pos,get.reference.aa,maf=maf)),
'Total_Mutations_in_Gene'=rep(sum(aa$count),nrow(aa)),'Median_Allele_Freq_Rank'=af.ranks$Median_Allele_Freq_Rank,
'Allele_Freq_Rank'=af.ranks$Allele_Freq_Rank,'SNP_ID'=unlist(lapply(aa$pos,get.rsid,maf=maf)),info,'Tumortypes'=cancer.types)
output=cbind(output,'Tri-nucleotides'=aa$tri, 'Mutability'=aa$mu_position,'mu_protein'=rep(mu_protein,nrow(aa)))
output=cbind(output,'ccf'=ccfs)
output$Genomic_Position=as.character(output$Genomic_Position)
pval=10^output$log10_pvalue
ppval=c(pval,rep(1,aa.length-length(pval)))
q2=p.adjust(ppval,method='BY')[1:length(pval)]
output=cbind(output,'qvalue'=q2)
# sorting by q-value
output=output[ order(output$qvalue), ]
return(output)
}
# return annotation track of hotspots with alignability/uniqueness scores (UCSC)
annotate.bedGraph.tracks=function(df,bedgraph) {
# formating bedGraph file
setnames(bedgraph,old=colnames(bedgraph),new=c('chromosome','start','stop','score'))
bedgraph$chromosome=gsub('chr','',bedgraph$chromosome)
bedgraph$chromosome[ bedgraph$chromosome=='X' ]=23
bedgraph$chromosome[ bedgraph$chromosome=='Y' ]=24
bedgraph$chromosome=suppressWarnings(as.numeric(as.character(bedgraph$chromosome)))
bedgraph=bedgraph[ which(!is.na(bedgraph$chromosome)), ]
bedgraphgrange=RangedData(IRanges(bedgraph$start,bedgraph$stop),space=bedgraph$chromosome,values=bedgraph$score)
# collect all the genomic positions in hotspot list
pos=matrix(unlist(lapply(df$Genomic_Position,function(x) unlist(strsplit(x,'[|:_]')))),ncol=3,byrow=TRUE)[,1:2]
pos=as.data.frame(pos)
colnames(pos)=c('Chromosome','Start_Position')
pos$Chromosome=as.character(pos$Chromosome)
pos$Chromosome[ which(pos$Chromosome=='X') ]=23
pos$Chromosome[ which(pos$Chromosome=='Y') ]=24
pos$Chromosome=as.numeric(as.character(pos$Chromosome))
pos$Start_Position=as.numeric(as.character(pos$Start_Position))
dirange=split(IRanges(pos$Start_Position,pos$Start_Position),pos$Chromosome)
overlap=findOverlaps(dirange,bedgraphgrange)
overlap=as.data.frame(as.matrix(overlap))
overlap$queryHits=as.numeric(as.character(overlap$queryHits))
overlap$subjectHits=as.numeric(as.character(overlap$subjectHits))
dirange=as.data.frame(dirange)
colnames(dirange)=c('n','chromosome','start','stop','width')
dirange$width=NULL
dirange$n=NULL
dirange$stop=NULL
dirange$chromosome=as.numeric(as.character(dirange$chromosome))
dirange$start=as.numeric(as.character(dirange$start))
bedgraphgrange=as.data.frame(bedgraphgrange)
colnames(bedgraphgrange)=c('chromosome','start','end','width','score')
bedgraphgrange$width=NULL
# get alignability score from IRanges object
get.score=function(i) {
z=which(overlap$queryHits==i)
if(length(z)==0) return(NA)
s=max(bedgraphgrange$score[overlap$subjectHits[ z ]])
return(s)
}
scores=unlist(lapply(1:nrow(dirange),get.score))
dirange=cbind(dirange,scores)
# returns matrix of genomic positions (chromosome, start position)
get.genomic.pos=function(i,df) {
return( matrix(unlist(strsplit(df$Genomic_Position[i],'[|:_]')),ncol=3,byrow=TRUE))
}
# collect alignment scores
final_scores=c()
for(ii in 1:nrow(df)) {
mx=get.genomic.pos(ii,df)
mx[,1]=gsub('X',23,mx[,1])
mx[,1]=gsub('Y',24,mx[,1])
codon_score=c()
for(p in 1:nrow(mx)) {
yy=which(dirange$chromosome==mx[p,1] & dirange$start==mx[p,2])
codon_score=c(codon_score,rep(dirange$scores[yy],mx[p,3]))
}
final_scores=c(final_scores,mean(codon_score))
}
return(final_scores)
}
# return mutation calling centers (TCGA) of a given hotspot
mut.center.breakdown=function(i,sig,maf) {
get.genomic.position=function(x,hs) {
pos=do.call('rbind',strsplit(unlist(strsplit(hs$Genomic_Position[x],'\\|')),'_'))
pos=matrix(do.call('rbind',strsplit(pos[,1],':')),ncol=2)
out=c()
for(i in 1:nrow(pos)) out=c(out,paste(pos[i,],collapse=" "))
return(paste(out,collapse=":"))
}
get.sequencing.centers=function(pos,maf) {
b=pos
chr=unlist(strsplit(pos,' '))[1]
pos=as.numeric(unlist(strsplit(pos,' '))[2])
maf=maf[ which( as.character(maf$Chromosome)==chr & maf$Start_Position==pos& maf$End_Position==pos), ]
seqcenter=maf$Center[ which(!duplicated(maf$Tumor_Sample_Barcode)) ]
if(length(seqcenter)==0) return(c(NA,NA,NA,NA,NA,NA,NA))
return(seqcenter)
}
pos=get.genomic.position(i,hs=sig)
pos=unlist(strsplit(pos,':'))
out=unlist(lapply(pos,get.sequencing.centers,maf=maf))
out=out[ which(out!='') ]
tt=out
out=table(out)
return(c(sig$Hugo_Symbol[i],sig$Amino_Acid_Position[i],names(out)[ which(out==max(out))][1], max(out),sum(out),max(out)/sum(out),paste(tt,collapse=":")))
}
# return TRUE|FALSE of hotspots with putative mutation calling center bias
annotate.center.bias=function(sig,maf) {
# initialize variables
sig$Samples=as.character(sig$Samples)
sig$Hugo_Symbol=as.character(sig$Hugo_Symbol)
sig$Amino_Acid_Position=as.numeric(as.character(sig$Amino_Acid_Position))
sig$tm=paste(sig$Hugo_Symbol,sig$Amino_Acid_Position)
maf$tm=paste(maf$Hugo_Symbol,maf$Amino_Acid_Position)
major.tumortype=function(x,hs) {
samples=do.call('rbind',strsplit(unlist(strsplit(hs$Samples[x],'\\|')),':'))
count=as.numeric(as.vector(samples[,3]))
return(c(samples[1,1],count[1],sum(count)))
}
mjr=lapply(1:nrow(sig),major.tumortype,hs=sig)
mjr=as.data.frame(do.call('rbind',mjr))
colnames(mjr)=c('tt','count','tot')
mjr$tt=as.character(mjr$tt)
mjr$count=as.numeric(as.character(mjr$count))
mjr$tot=as.numeric(as.character(mjr$tot))
mjr$ratio=mjr$count/mjr$tot
ind=which( mjr$tt %in% 'coadread' & mjr$ratio >= 0.5)
tm=suppressWarnings(do.call('rbind',lapply(ind,mut.center.breakdown,sig=sig,maf=maf)))
tm=as.data.frame(tm)
colnames(tm)=c('gene','aa_position','major_calling_center','from_major','total','ratio','seq_centers')
for(i in 1:ncol(tm)) tm[,i]=as.character(tm[,i])
tm$ratio=as.numeric(as.character(tm$ratio))
tm$from_major=as.numeric(as.character(tm$from_major))
tm$total=as.numeric(as.character(tm$total))
tm$ratio=tm$from_major/tm$total
tm=tm[ order(tm$ratio,decreasing=T), ]
tm=tm[ which(!grepl(',',tm$major_calling_center)), ]
tm$idx=paste(tm$gene,tm$aa_position)
tm$num_tt=unlist(lapply(tm$idx,function(x) length(unique(maf$TUMORTYPE[ which(maf$tm==x) ]))))
centerbias=rep(FALSE,nrow(sig))
# return TRUE if there are at least 7 mutations with known mutation calling centers and
# more than 85% of mutations were called from a single mutation calling center or
# if mutaitons are localized to only two tumor types whose majority comes from greater than 75%
ii=which(((tm$ratio>0.85)|(tm$ratio==0.75&tm$num_tt<3)) &tm$from_major>5)
centerbias[ which(sig$tm%in%tm$idx[ii]) ]=TRUE
return(centerbias)
}
# return calculcated entropy around the site of each hotspot
annotate.entropy=function(sig) {
# find genomic position of hotspot
hspos=lapply(strsplit(sig$Genomic_Position,"[|]"),function(x) gsub("_.*$","",x))
hspos=do.call(rbind,lapply(hspos,function(x) c(gsub(":.*$","",x[1]),range(as.numeric(gsub("^.*:","",x))))))
hspos[,1]=paste("chr",hspos[,1],sep="")
pcN=rep(1/4,4)
names(pcN)=c("A","C","T","G")
n=nrow(hspos)
# make the intervals to get sequence
regions=c(rep("up12",n),rep("up24",n),rep("up36",n),rep("dn12",n),rep("dn24",n), rep("dn36",n))
starts=c(as.numeric(hspos[,2])-11,as.numeric(hspos[,2])-23,as.numeric(hspos[,2])-35,rep.int(as.numeric(hspos[,3]),3))
ends=c(rep.int(as.numeric(hspos[,2]),3),as.numeric(hspos[,3])+11,as.numeric(hspos[,3])+23,as.numeric(hspos[,3])+35)
contigs=rep.int(hspos[,1],6)
for(pad in c(11,23,35)) {
rcontigs=sample(seqnames(Hsapiens)[1:23],1000,replace=T)
rstarts=unlist(lapply(rcontigs,function(x) sample(seqlengths(Hsapiens)[x]-(pad+1),1)))
rends=rstarts+pad
starts=c(starts,rstarts)
ends=c(ends,rends)
contigs=c(contigs,rcontigs)
regions=c(regions,rep(paste("rand",pad+1,sep=""),1000))
}
rpts=GRanges(seqnames=contigs,ranges=IRanges(starts,ends),names=regions)
rpts_seq=getSeq(Hsapiens,rpts)
# calculate entropy of returned sequence
pN=lapply(strsplit(as.character(rpts_seq),""),function(x) table(x)+(1/4))
rpts_entropy=unlist(lapply(pN,function(x) -sum((x/sum(x))*log(x/sum(x)))))
for(pad in c(12,24,36)) {
ui=which(rpts$names==paste("up",pad,sep=""))
di=which(rpts$names==paste("dn",pad,sep=""))
sig[[paste("pad",pad,"entropy",sep="")]]=pmin(rpts_entropy[ui],rpts_entropy[di])
}
return(sig[,(ncol(sig)-2):ncol(sig)])
}
# return TRUE|FALSE of whether hotspot is a true positive
annotate.true.positives=function(sig,dmp) {
# return TRUE if hotspot mutation is in true-positive file
tp=rep(FALSE,nrow(sig))
sig$id=paste(sig$Hugo_Symbol,sig$Amino_Acid_Position,sep="-")
tp[sig$id%in%unique(dmp$id)]=TRUE
return(tp)
}
# return homopolyer region, if any, of a given hotspot
get.repeats=function(i,sig,rr) {
# initialize homopolymer file
gene_pos=as.data.frame(matrix(unlist(strsplit(sig$Genomic_Position[i],'[_:|]')),ncol=3,byrow=T))
colnames(gene_pos)=c('chromosome','pos','count')
gene_pos$chromosome=paste('chr',as.character(gene_pos$chromosome),sep="")
gene_pos$chromosome=as.character(gene_pos$chromosome)
gene_pos$pos=as.numeric(as.character(gene_pos$pos))
gene_pos$count=as.numeric(as.character(gene_pos$count))
# find overlap padded by 1 position
pos_range=RangedData(IRanges(gene_pos$pos-1,gene_pos$pos+1),space=gene_pos$chromosome,values=gene_pos$count)
hits=as.data.frame(as.matrix(findOverlaps(pos_range,rr)))
if(nrow(hits)==0) return(c(FALSE,NA,NA))
else {
results=as.data.frame(rr)[as.numeric(as.character(hits$subjectHits[1:nrow(hits)])),]
ret=unlist(strsplit(results$values[1],' '))
return(c(TRUE,ret[1],ret[2]))
}
}
# return homopolymer region metrics: TRUE|FALSE if it is a repeat region, the repeat nucleotide(s), and length of repeat region
annotate.homopolymers=function(sig,homopolymerbed) {
# find overlap
rep_range=RangedData(IRanges(homopolymerbed$start,homopolymerbed$end),
space=homopolymerbed$chromosome,values=paste(homopolymerbed$rep,nchar(homopolymerbed$seq)))
out=lapply(1:nrow(sig),get.repeats,sig=sig,rr=rep_range)
bb=do.call('rbind',out)
colnames(bb)=c('Is_repeat','seq','length')
return(bb)
}
# return filtering judgement based on annotated metrics
annotate.filtering.judgement=function(sig) {
reason=rep('',nrow(sig))
sig$Hugo_Symbol=as.character(sig$Hugo_Symbol)
# sub-clonality filter
ccfs=lapply(strsplit(as.character(sig$ccf),":"),as.numeric)
sig$ccf_subclonal_frac=unlist(lapply(ccfs,function(x) sum(x<0.8)/length(x)))
sig$num_ccf=unlist(lapply(ccfs,function(x) length(x)))
rem=which(sig$num_ccf>1 & sig$ccf_subclonal_frac==1)
threshold=max(sig$ccf_subclonal_frac[sig$TP & sig$num_ccf>1])
xi=which(sig$num_ccf>1 & sig$ccf_subclonal_frac>threshold)
reason[xi]=ifelse(reason[xi]=="","SUBCLONAL_FRAC",paste(reason[xi],"SUBCLONAL_FRAC",sep="|"))
# low af filter
sig$Allele_Freq_Rank=as.character(sig$Allele_Freq_Rank)
sig$num_af=as.numeric(as.character(unlist(lapply(sig$Allele_Freq_Rank,function(x) unlist(strsplit(x,'\\|'))[1] ))))
sig$Allele_Freq_Rank=unlist(lapply(sig$Allele_Freq_Rank,function(x) unlist(strsplit(x,'\\|'))[2] ))
afs=lapply(strsplit(sig$Allele_Freq_Rank,":"),as.numeric)
sig$low_af_frac=unlist(lapply(afs,function(x) sum(x>0.8)/length(x)))
threshold=max(sig$low_af_frac[which(sig$TP & sig$num_af>5)])
xi=which(sig$num_af>5 & sig$low_af_frac>threshold)
reason[xi]=ifelse(reason[xi]=="","LOW_AF_FRAC",paste(reason[xi],"LOW_AF_FRAC",sep="|"))
# center bias filter
if('seq_bias' %in% colnames(sig)) {
xi=which(sig$seq_bias)
reason[xi]=ifelse(reason[xi]=="","CENTER_BIAS",paste(reason[xi],"CENTER_BIAS",sep="|"))
}
# entropy filter
xi=which(sig$pad12entropy<min(sig$pad12entropy[sig$TP]) | sig$pad24entropy<min(sig$pad24entropy[sig$TP]))
reason[xi]=ifelse(reason[xi]=="","LOCAL_ENTROPY",paste(reason[xi],"LOCAL_ENTROPY",sep="|"))
# homopolymer filter
if('Is_repeat' %in% colnames(sig) & 'seq' %in% colnames(sig) & 'length' %in% colnames(sig)) {
xi=which(sig$Is_repeat & sig$length>=10)
reason[xi]=ifelse(reason[xi]=="","HOMOPOLYMER",paste(reason[xi],"HOMOPOLYMER",sep="|"))
}
# mapability filter
if('align100'%in%colnames(sig) & 'align24'%in%colnames(sig)) {
ranked_align=rowSums(cbind(rank(sig$align100),rank(sig$align24)))
xi=which(ranked_align<min(ranked_align[sig$TP]) | sig$align24<quantile(sig$align24,probs=0.125))
reason[xi]=ifelse(reason[xi]=="","ALIGNABILITY",paste(reason[xi],"ALIGNABILITY",sep="|"))
}
# biologically-driven filters of putative false positives
retained=table(sig$Hugo_Symbol[reason==""])
excluded=table(sig$Hugo_Symbol[reason!=""])
xi=names(excluded)[which(names(excluded)%in%names(retained))]
de=list()
de$excluded=excluded[xi]
de$retained=retained[xi]
de=as.data.frame(de)
xi=which(sig$Hugo_Symbol%in%rownames(de)[de$retained<=(de$excluded*3.5)] & reason=="")
reason[xi]=ifelse(reason[xi]=="","FP_RICH_GENE",paste(reason[xi],"FP_GENE",sep="|"))
supp_table_7_mutsigcv_paper=c(
"PCLO","FLG","BAGE2","TPTE","TTN","CSMD1","CSMD3","RYR2","RYR3","MUC16","MUC4",
"MUC17","MUC5B","OR10G9","OR2G6","OR4C6","OR4M2","OR2T4","OR5L2","OR2T33"
)
xi=which(sig$Hugo_Symbol%in%supp_table_7_mutsigcv_paper)
reason[xi]=ifelse(reason[xi]=="","FP_MUTSIG",paste(reason[xi],"FP_MUTSIG",sep="|"))
return(reason)
}
split.double=function(ind,df) {
ref=unlist(strsplit(df$Reference_Amino_Acid[ind],''))
vart=unlist(strsplit(df$Variant_Amino_Acid[ind],''))
out=c()
for(i in 1:length(ref)) {
tm=df[ind,]
tm$Amino_Acid_Position=tm$Amino_Acid_Position+(i-1)
tm$Reference_Amino_Acid=ref[i]
tm$Variant_Amino_Acid=vart[i]
if(ref[i]=='*') tm$Consequence='stop_lost'
else if(vart[i]=='*') tm$Consequence='stop_gained'
else if(ref[i]==vart[i]) tm$Consequence='synonymous_variant'
else tm$Consequence='missense_variant'
tm$Amino_Acid_Change=paste(tm$Reference_Amino_Acid,tm$Amino_Acid_Position,tm$Variant_Amino_Acid,sep="")
out=rbind(out,tm)
}
return(out)
}
#returns other positions that are 1 bp away from pos
find.splice.neighbors=function(pos,df) {
dif=abs(df$Amino_Acid_Position-pos)
ind=which(dif==1)
if(length(ind)==0) return(NA)
return(unique(c(pos,df$Amino_Acid_Position[ind])))
}
#returns TRUE if there are other positions that are 1 bp away
any.neighbors=function(df) {
tm=sort(unique(df$Amino_Acid_Position))
tt=lapply(tm,find.splice.neighbors,df=df)
if(length(which(!is.na(tt))) > 0) return(TRUE)
return(FALSE)
}
#returns the first occurance of a neighbor
get.neighbors=function(df) {
tm=sort(unique(df$Amino_Acid_Position))
tt=lapply(tm,find.splice.neighbors,df=df)
return(tt[[which(!is.na(tt))[1]]])
}
remove.unexpressed.genes=function(i,maf,express) {
#coad samples
if(maf$TUMORTYPE[i]=='coadread') {
tt=which(colnames(express)=='coad' | colnames(express)=='read')
g=which(express$gene_id==maf$Hugo_Symbol[i])
if(!any(tt) | !any(g) ) return(1==2)
else {
explvl=as.character(express[ g, tt ])
explvl=do.call('rbind',strsplit(explvl,';'))
samples=as.numeric(explvl[,1])
tot=as.numeric(explvl[,3])
return(samples < 0.1*tot[1] && samples < 0.1*tot[2])
}
}
tt=grep(maf$TUMORTYPE[i],colnames(express))
g=which(express$gene_id==maf$Hugo_Symbol[i])
if(!any(g)) return(1==2)
#if tumortype is not included, then i will infer gene expressions based on other tumortypes
#if gene is not expressed in 95% of all tumor samples, then i infer this gene is not expressed
#in the tumor samples we do not have RNASeqV2 data for
else if(!any(tt)) {
explvl=express[ g, ]
explvl=do.call('rbind',strsplit(as.character(explvl[2:length(explvl)]),';'))
samples=sum(as.numeric(as.character(explvl[,1])))
tot=sum(as.numeric(as.character(explvl[,3])))
return(samples < 0.05*tot)
}
else {
explvl=express[ g, tt ]
explvl=unlist(strsplit(explvl,';'))
samples=as.numeric(explvl[1])
tot=as.numeric(explvl[3])
return(samples < 0.1*tot)
}
}
# deprecated germline SNP filtering based on 1000/NHLBI
# putative germline SNPs are filtered based on ExAC with minor AF > 0.06%
# ExAC r0.2 has been preloaded
remove.snps=function(maf) {
return(maf[ which(!paste(maf$Chromosome,maf$Start_Position,maf$Tumor_Seq_Allele2)
%in% paste(exacr0_2snps$Chromosome,exacr0_2snps$Position,exacr0_2snps$Alt)), ])
}
remove.unexpressed.mutations=function(maf,expressiontb) {
ind=unlist(lapply(1:nrow(maf),remove.unexpressed.genes,maf=maf,express=expressiontb))
ind=which(ind)
if(any(ind)) maf=maf[ -ind, ]
return(maf)
}
prepmaf=function(maf,expressiontb) {
cat('Prepping MAF for analysis ...\n')
#only non-indel, coding mutations
coding=c('initiator_codon_variant','missense_variant','splice_acceptor_variant',
'splice_donor_variant','stop_gained','stop_lost','stop_retained_variant','synonymous_variant')
cat(' ... Reducing MAF to protein-coding substitutions\n')
maf=maf[ which(maf$Consequence %in% coding & maf$CANONICAL=='YES' & maf$BIOTYPE=='protein_coding'), ]
#remove indels
maf=maf[ which(!maf$Variant_Type%in%c('INS','DEL')), ]
# add additional annotations
if(!'TUMORTYPE' %in% colnames(maf)) maf$TUMORTYPE='none'
if(!'Master_ID' %in% colnames(maf)) maf$Master_ID=maf$Tumor_Sample_Barcode
maf$Amino_Acid_Change=gsub('p.','',maf$HGVSp_Short)
maf$Amino_Acid_Position=unlist(lapply(maf$Protein_position,function(x) unlist(strsplit(x,"/"))[1] ))
maf$Protein_Length=as.numeric(unlist(lapply(maf$Protein_position,function(x) unlist(strsplit(x,'\\/'))[2])))
maf$Reference_Amino_Acid=unlist(lapply(1:nrow(maf), function(x) unlist(strsplit(maf$Amino_acids[x],'/'))[1]))
maf$Variant_Amino_Acid=unlist(lapply(1:nrow(maf), function(x) unlist(strsplit(maf$Amino_acids[x],'/'))[2]))
maf$allele_freq=maf$t_alt_count/(maf$t_alt_count+maf$t_ref_count)
#subset splice_site mutations
splice=c('splice_acceptor_variant','splice_donor_variant')
ss=maf[ which(maf$Consequence %in% splice), ]
maf=maf[ which(!maf$Consequence %in% splice), ]
cat(' ... Re-annotating substitutions\n')
# annotating oligo mutations
i=which(nchar(maf$Reference_Amino_Acid)!=1 & nchar(maf$Variant_Amino_Acid)!=1 & !grepl('ext',maf$Reference_Amino_Acid))
if(any(i)) {
dd=maf[i,]
dd$Reference_Amino_Acid[ which(is.na(dd$Reference_Amino_Acid))]='NA'
dd$Variant_Amino_Acid[ which(is.na(dd$Variant_Amino_Acid))]='NA'
dd$Amino_Acid_Position=unlist(lapply(dd$Amino_Acid_Position,function(x) as.numeric(as.character(unlist(strsplit(x,'-'))[1]))))
tm=do.call('rbind',lapply(1:nrow(dd),split.double,df=dd))
maf=maf[-i, ]
maf=rbind(maf,tm)
}
# annotating splice site mutations
ss=ss[ nchar(ss$Reference_Allele)<3, ]
ss$Amino_Acid_Position=as.numeric(ss$Start_Position)
ss$Reference_Amino_Acid='SS'
ss$Variant_Amino_Acid='SS'
out=c()
for(i in unique(ss$Chromosome)) {
tm=ss[ ss$Chromosome==i, ]
while(any.neighbors(tm)) {
neighbors=get.neighbors(tm)
tm$Amino_Acid_Position[ tm$Amino_Acid_Position %in% neighbors ]=min(neighbors)
}
out=rbind(out,tm)
}
ss=out
maf=rbind(maf,ss)
# remove putative germline mutations
cat(' ... Removing putative germline SNPs based on ExAC r0.2\n')
maf=remove.snps(maf)
# remove mutations in unexpressed genes
cat(' ... Removing unexpressed genes\n')
maf=remove.unexpressed.mutations(maf,expressiontb)
maf$Amino_Acid_Position=as.numeric(maf$Amino_Acid_Position)
maf$Variant_Type='SNP'
return(maf)
}