https://github.com/Gregor-Mendel-Institute/RKP2021-CMT3
Tip revision: 89d7e2ea78af1969bb161640baed09296ed2485f authored by Papareddy on 23 April 2021, 11:18:44 UTC
Update README.md
Update README.md
Tip revision: 89d7e2e
featurecounts_deseq2.r
#!/usr/bin/env Rscript
################################################
################################################
## REQUIREMENTS ##
################################################
################################################
## DIFFERENTIAL ANALYSIS, SCATTERPLOTS AND PCA FOR SAMPLES IN FEATURECOUNTS FILE
## - FIRST SIX COLUMNS OF FEATURECOUNTS_FILE SHOULD BE INTERVAL INFO. REMAINDER OF COLUMNS SHOULD BE SAMPLES-SPECIFIC COUNTS.
## - SAMPLE NAMES HAVE TO END IN "_R1" REPRESENTING REPLICATE ID. LAST 3 CHARACTERS OF SAMPLE NAME WILL BE TRIMMED TO OBTAIN GROUP ID FOR DESEQ2 COMPARISONS.
## - BAM_SUFFIX IS PORTION OF FILENAME AFTER SAMPLE NAME IN FEATURECOUNTS COLUMN SAMPLE NAMES E.G. ".rmDup.bam" if "DRUG_R1.rmDup.bam"
## - PACKAGES BELOW NEED TO BE AVAILABLE TO LOAD WHEN RUNNING R
################################################
################################################
## LOAD LIBRARIES ##
################################################
################################################
library(optparse)
library(DESeq2)
library(vsn)
library(ggplot2)
library(RColorBrewer)
library(pheatmap)
library(lattice)
library(BiocParallel)
################################################
################################################
## PARSE COMMAND-LINE PARAMETERS ##
################################################
################################################
option_list <- list(make_option(c("-i", "--featurecount_file"), type="character", default=NULL, help="Feature count file generated by the SubRead featureCounts command.", metavar="path"),
make_option(c("-b", "--bam_suffix"), type="character", default=NULL, help="Portion of filename after sample name in featurecount file header e.g. '.rmDup.bam' if 'DRUG_R1.rmDup.bam'", metavar="string"),
make_option(c("-o", "--outdir"), type="character", default='./', help="Output directory", metavar="path"),
make_option(c("-p", "--outprefix"), type="character", default='differential', help="Output prefix", metavar="string"),
make_option(c("-s", "--outsuffix"), type="character", default='', help="Output suffix for comparison-level results", metavar="string"),
make_option(c("-v", "--vst"), type="logical", default=FALSE, help="Run vst transform instead of rlog", metavar="boolean"),
make_option(c("-c", "--cores"), type="integer", default=1, help="Number of cores", metavar="integer"))
opt_parser <- OptionParser(option_list=option_list)
opt <- parse_args(opt_parser)
if (is.null(opt$featurecount_file)){
print_help(opt_parser)
stop("Please provide featurecount file.", call.=FALSE)
}
if (is.null(opt$bam_suffix)){
print_help(opt_parser)
stop("Please provide bam suffix in header of featurecount file.", call.=FALSE)
}
################################################
################################################
## READ IN COUNTS FILE ##
################################################
################################################
count.table <- read.delim(file=opt$featurecount_file,header=TRUE,skip=1)
colnames(count.table) <- gsub(opt$bam_suffix,"",colnames(count.table))
colnames(count.table) <- as.character(lapply(colnames(count.table), function (x) tail(strsplit(x,'.',fixed=TRUE)[[1]],1)))
rownames(count.table) <- count.table$Geneid
interval.table <- count.table[,1:6]
count.table <- count.table[,7:ncol(count.table),drop=FALSE]
################################################
################################################
## RUN DESEQ2 ##
################################################
################################################
if (file.exists(opt$outdir) == FALSE) {
dir.create(opt$outdir,recursive=TRUE)
}
setwd(opt$outdir)
samples.vec <- sort(colnames(count.table))
groups <- sub("_[^_]+$", "", samples.vec)
print(unique(groups))
if (length(unique(groups)) == 1) {
quit(save = "no", status = 0, runLast = FALSE)
}
DDSFile <- paste(opt$outprefix,".dds.rld.RData",sep="")
if (file.exists(DDSFile) == FALSE) {
counts <- count.table[,samples.vec,drop=FALSE]
coldata <- data.frame(row.names=colnames(counts),condition=groups)
dds <- DESeqDataSetFromMatrix(countData = round(counts), colData = coldata, design = ~ condition)
dds <- DESeq(dds, parallel=TRUE, BPPARAM=MulticoreParam(opt$cores))
if (!opt$vst) {
rld <- rlog(dds)
} else {
rld <- vst(dds)
}
save(dds,rld,file=DDSFile)
}
################################################
################################################
## PLOT QC ##
################################################
################################################
PlotFile <- paste(opt$outprefix,".plots.pdf",sep="")
if (file.exists(PlotFile) == FALSE) {
pdf(file=PlotFile,onefile=TRUE,width=7,height=7)
## PCA
pca.data <- DESeq2::plotPCA(rld,intgroup=c("condition"),returnData=TRUE)
percentVar <- round(100 * attr(pca.data, "percentVar"))
plot <- ggplot(pca.data, aes(PC1, PC2, color=condition)) +
geom_point(size=3) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1))
print(plot)
## WRITE PC1 vs PC2 VALUES TO FILE
pca.vals <- pca.data[,1:2]
colnames(pca.vals) <- paste(colnames(pca.vals),paste(percentVar,'% variance',sep=""), sep=": ")
pca.vals <- cbind(sample = rownames(pca.vals), pca.vals)
write.table(pca.vals,file=paste(opt$outprefix,".pca.vals.txt",sep=""),row.names=FALSE,col.names=TRUE,sep="\t",quote=TRUE)
## SAMPLE CORRELATION HEATMAP
sampleDists <- dist(t(assay(rld)))
sampleDistMatrix <- as.matrix(sampleDists)
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors)
## WRITE SAMPLE DISTANCES TO FILE
write.table(cbind(sample = rownames(sampleDistMatrix), sampleDistMatrix),file=paste(opt$outprefix,".sample.dists.txt",sep=""),row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE)
dev.off()
}
################################################
################################################
## SAVE SIZE FACTORS ##
################################################
################################################
SizeFactorsDir <- "sizeFactors/"
if (file.exists(SizeFactorsDir) == FALSE) {
dir.create(SizeFactorsDir,recursive=TRUE)
}
NormFactorsFile <- paste(SizeFactorsDir,opt$outprefix,".sizeFactors.RData",sep="")
if (file.exists(NormFactorsFile) == FALSE) {
normFactors <- sizeFactors(dds)
save(normFactors,file=NormFactorsFile)
for (name in names(sizeFactors(dds))) {
sizeFactorFile <- paste(SizeFactorsDir,name,opt$outsuffix,".sizeFactor.txt",sep="")
if (file.exists(sizeFactorFile) == FALSE) {
write(as.numeric(sizeFactors(dds)[name]),file=sizeFactorFile)
}
}
}
################################################
################################################
## WRITE LOG FILE ##
################################################
################################################
LogFile <- paste(opt$outprefix,".log",sep="")
if (file.exists(LogFile) == FALSE) {
cat("\nSamples =",samples.vec,"\n\n",file=LogFile,append=TRUE,sep=', ')
cat("Groups =",groups,"\n\n",file=LogFile,append=TRUE,sep=', ')
cat("Dimensions of count matrix =",dim(counts),"\n\n",file=LogFile,append=FALSE,sep=' ')
cat("\n",file=LogFile,append=TRUE,sep='')
}
################################################
################################################
## LOOP THROUGH COMPARISONS ##
################################################
################################################
ResultsFile <- paste(opt$outprefix,".results.txt",sep="")
if (file.exists(ResultsFile) == FALSE) {
raw.counts <- counts(dds,normalized=FALSE)
colnames(raw.counts) <- paste(colnames(raw.counts),'raw',sep='.')
pseudo.counts <- counts(dds,normalized=TRUE)
colnames(pseudo.counts) <- paste(colnames(pseudo.counts),'pseudo',sep='.')
deseq2_results_list <- list()
comparisons <- combn(unique(groups),2)
for (idx in 1:ncol(comparisons)) {
control.group <- comparisons[1,idx]
treat.group <- comparisons[2,idx]
CompPrefix <- paste(control.group,treat.group,sep="vs")
cat("Saving results for ",CompPrefix," ...\n",sep="")
CompOutDir <- paste(CompPrefix,'/',sep="")
if (file.exists(CompOutDir) == FALSE) {
dir.create(CompOutDir,recursive=TRUE)
}
control.samples <- samples.vec[which(groups == control.group)]
treat.samples <- samples.vec[which(groups == treat.group)]
comp.samples <- c(control.samples,treat.samples)
comp.results <- results(dds,contrast=c("condition",c(control.group,treat.group)))
comp.df <- as.data.frame(comp.results)
comp.table <- cbind(interval.table, as.data.frame(comp.df), raw.counts[,paste(comp.samples,'raw',sep='.')], pseudo.counts[,paste(comp.samples,'pseudo',sep='.')])
## WRITE RESULTS FILE
CompResultsFile <- paste(CompOutDir,CompPrefix,opt$outsuffix,".deseq2.results.txt",sep="")
write.table(comp.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
## FILTER RESULTS BY FDR & LOGFC AND WRITE RESULTS FILE
pdf(file=paste(CompOutDir,CompPrefix,opt$outsuffix,".deseq2.plots.pdf",sep=""),width=10,height=8)
if (length(comp.samples) > 2) {
for (MIN_FDR in c(0.01,0.05)) {
## SUBSET RESULTS BY FDR
pass.fdr.table <- subset(comp.table, padj < MIN_FDR)
pass.fdr.up.table <- subset(comp.table, padj < MIN_FDR & log2FoldChange > 0)
pass.fdr.down.table <- subset(comp.table, padj < MIN_FDR & log2FoldChange < 0)
## SUBSET RESULTS BY FDR AND LOGFC
pass.fdr.logFC.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1)
pass.fdr.logFC.up.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1 & log2FoldChange > 0)
pass.fdr.logFC.down.table <- subset(comp.table, padj < MIN_FDR & abs(log2FoldChange) >= 1 & log2FoldChange < 0)
## WRITE RESULTS FILE
CompResultsFile <- paste(CompOutDir,CompPrefix,opt$outsuffix,".deseq2.FDR",MIN_FDR,".results.txt",sep="")
CompBEDFile <- paste(CompOutDir,CompPrefix,opt$outsuffix,".deseq2.FDR",MIN_FDR,".results.bed",sep="")
write.table(pass.fdr.table, file=CompResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
write.table(pass.fdr.table[,c("Chr","Start","End","Geneid","log2FoldChange","Strand")], file=CompBEDFile, col.names=FALSE, row.names=FALSE, sep='\t', quote=FALSE)
## MA PLOT & VOLCANO PLOT
DESeq2::plotMA(comp.results, main=paste("MA plot FDR <= ",MIN_FDR,sep=""), ylim=c(-2,2),alpha=MIN_FDR)
plot(comp.table$log2FoldChange, -1*log10(comp.table$padj), col=ifelse(comp.table$padj<=MIN_FDR, "red", "black"), xlab="logFC", ylab="-1*log10(FDR)", main=paste("Volcano plot FDR <=",MIN_FDR,sep=" "), pch=20)
## ADD COUNTS TO LOGFILE
cat(CompPrefix," genes with FDR <= ",MIN_FDR,": ",nrow(pass.fdr.table)," (up=",nrow(pass.fdr.up.table),", down=",nrow(pass.fdr.down.table),")","\n",file=LogFile,append=TRUE,sep="")
cat(CompPrefix," genes with FDR <= ",MIN_FDR," & FC > 2: ",nrow(pass.fdr.logFC.table)," (up=",nrow(pass.fdr.logFC.up.table),", down=",nrow(pass.fdr.logFC.down.table),")","\n",file=LogFile,append=TRUE,sep="")
}
cat("\n",file=LogFile,append=TRUE,sep="")
}
## SAMPLE CORRELATION HEATMAP
rld.subset <- assay(rld)[,comp.samples]
sampleDists <- dist(t(rld.subset))
sampleDistMatrix <- as.matrix(sampleDists)
colors <- colorRampPalette( rev(brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,clustering_distance_rows=sampleDists,clustering_distance_cols=sampleDists,col=colors)
## SCATTER PLOT FOR RLOG COUNTS
combs <- combn(comp.samples,2,simplify=FALSE)
clabels <- sapply(combs,function(x){paste(x,collapse=' & ')})
plotdat <- data.frame(x=unlist(lapply(combs, function(x){rld.subset[, x[1] ]})),y=unlist(lapply(combs, function(y){rld.subset[, y[2] ]})),comp=rep(clabels, each=nrow(rld.subset)))
plot <- xyplot(y~x|comp,plotdat,
panel=function(...){
panel.xyplot(...)
panel.abline(0,1,col="red")
},
par.strip.text=list(cex=0.5))
print(plot)
dev.off()
colnames(comp.df) <- paste(CompPrefix,".",colnames(comp.df),sep="")
deseq2_results_list[[idx]] <- comp.df
}
## WRITE RESULTS FROM ALL COMPARISONS TO FILE
deseq2_results_table <- cbind(interval.table,do.call(cbind, deseq2_results_list),raw.counts,pseudo.counts)
write.table(deseq2_results_table, file=ResultsFile, col.names=TRUE, row.names=FALSE, sep='\t', quote=FALSE)
}
################################################
################################################
## R SESSION INFO ##
################################################
################################################
RLogFile <- "R_sessionInfo.log"
if (file.exists(RLogFile) == FALSE) {
sink(RLogFile)
a <- sessionInfo()
print(a)
sink()
}
################################################
################################################
################################################
################################################