https://github.com/abreschi/Rscripts
Raw File
Tip revision: d313bde3bbeb8a93015c4445a038507e738c3617 authored by Alessandra Breschi on 02 March 2019, 09:14:14 UTC
add range feature to scale_matrix.R
Tip revision: d313bde
edgeR.analysis.R
#!/usr/bin/env Rscript 


options(stringsAsFactors=F)


#########################
# Default OPT parameters
########################

opt = list()
opt$n_samples = 2
opt$cpm = 1
opt$replace_NAs = TRUE

##################
# OPTION PARSING
##################

suppressPackageStartupMessages(library("optparse"))

option_list <- list(
make_option(c("-i", "--input_matrix"), default="stdin", help="the matrix with READ COUNTS you want to analyze [default=%default]"),
make_option(c("-r", "--replace_NAs"), action="store_true", default=FALSE, help="replace NAs with 0 [default=%default]"),
make_option(c("-m", "--metadata"), help="tsv file with metadata on matrix experiment"),
make_option(c("-f", "--fields"), help="choose the fields you want to use in the differential expression, comma-separated"),
make_option(c("-F", "--formula"), default="~.", help="a formula for the differential expression [default=%default]"),
make_option(c("-T", "--tissue_sp"), default=FALSE, action="store_true", help="Build a linear model with grand mean intercept"),
make_option(c("-C", "--cpm"), help="threshold for cpm [default=%default]", type="integer", default=1),
make_option(c("-s", "--n_samples"), help="minimum number of samples with cpm > \"cpm\" [default=%default]", type="integer", default=2),
make_option(c("-c", "--coefficient"), type="character", help="the coefficient(s) for the comparison"),
make_option(c("-n", "--contrast"), help="Numbers comma-separated giving the vector of contrasts. Use instead of coefficients"),
make_option(c("-o", "--output"), help="additional suffix for output [default=%default]", default='out'),
make_option(c("-d", "--output_dir"), help="choose the output directory [default=%default]", default="./"),
make_option(c("-g", "--genes"), help='a file with a list of genes to filter', type='character'),
make_option(c("-v", "--verbose"), action="store_true", default=FALSE, help="verbose output")
)

parser <- OptionParser(usage = "%prog [options] file", option_list=option_list)
arguments <- parse_args(parser, positional_arguments = TRUE)
opt <- arguments$options
if (opt$verbose) {print(opt)}


##------------
## LIBRARIES
##------------ 


if(opt$verbose) {cat('Libraries loading... ')}

suppressPackageStartupMessages(library('reshape2'))
suppressPackageStartupMessages(library('ggplot2'))
#suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library('edgeR'))

if (opt$verbose) {cat('DONE\n\n')}

# ==========================================
# Function for loading Rdata
# ==========================================

load_obj <- function(f)
{
    env <- new.env()
    nm <- load(f, env)[1]
    env[[nm]]
}


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Function that estimates the average of gene counts
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

mglmAverage = function(object, lev,  prior.count = 0.125, dispersion = 'auto') {
	# Evaluate dispersion argument
    dispersion <- switch(dispersion, common = object$common.dispersion,
        trended = object$trended.dispersion, tagwise = object$tagwise.dispersion,
        auto = getDispersion(object))
    # Get the library size 
    lib.size <- object$samples$lib.size * object$samples$norm.factors
    # Divide each library size by the mean library size across ALL conditions
    # they will be pseudo counts for all genes in each sample
    prior.count <- prior.count * lib.size/mean(lib.size)
    # Take the log of the library size (I don't understand why they add prior.count since it 
    # is negligible compared to the library size)
    offset.aug <- log(lib.size + 2 * prior.count)

    # Indeces of the specified level
    j <- object$samples$group == lev
    # Number of elements in level
    n <- sum(j)
    # Number of elements for which there are counts
    ntags <- nrow(object$counts)
    # Subset the matrix of counts
    y <- object$counts[, j, drop=FALSE]

    mglmAvg = mglmOneGroup(y + matrix(prior.count[j], ntags, n, byrow = TRUE), 
        offset = offset.aug[j], dispersion = dispersion)
	names(mglmAvg) <- rownames(object$counts)
    return(mglmAvg)
}



##-------------------------##
## Differential Expression ##
##-------------------------##

# specify the design to the program
fields = strsplit(opt$fields, ",")[[1]]

# Report error and quit if contrasts and coefficients are both provided
if (!is.null(opt$contrast) & !is.null(opt$coefficient)) {
	cat("ERROR: Provide contrasts OR coefficient!\nABORTED\n")
	q(save='no')
} 
if (is.null(opt$contrast) & is.null(opt$coefficient) & length(fields)>1) {
	cat("ERROR: Provide contrasts or coefficient!\nABORTED\n")
	q(save='no')
}

coeff = NULL
if (!is.null(opt$coefficient)) {
	coeff = eval(parse(text=opt$coefficient))
}

contr = NULL
if (!is.null(opt$contrast)) {
	contr <- eval(parse(text=opt$contrast))
} 

# read the matrix from the command line
#if (opt$input_matrix == "stdin") {inF = file("stdin")} else {inF=opt$input_matrix}
#m = read.table(inF, h=T, sep="\t")
if (opt$input_matrix == "stdin") {
    m = read.table(file("stdin"), h=T)
} else {
    m <- try(load_obj(opt$input_matrix), silent=T)
    if (class(m) == "try-error") {m <- read.table(opt$input_matrix)}
}

if (opt$replace_NAs) {m<-replace(m, is.na(m), 0)}
genes = rownames(m)
m = (apply(m, 2, as.integer))
rownames(m) <- genes

if (opt$verbose) {print(head(m))}


# Keep only selected genes if needed
if (!is.null(opt$genes)) {
fgenes = as.character(read.table(opt$genes, h=F)$V1)
m = m[intersect(rownames(m),fgenes),]
}


# read the metadata from the metadata file
mdata = read.table(opt$metadata, h=T, sep='\t', quote=NULL, check.names=F)
mdata[,"labExpId"] <- gsub("[,-]", ".", mdata[,"labExpId"])
mdata = unique(mdata[,c("labExpId", fields)])
mdata = mdata[match(colnames(m), mdata[,"labExpId"]), c("labExpId", fields)]


# ONLY ONE CONDITION
#if (length(fields) == 1) {
if (length(unique(unlist(mdata[fields]))) == 2) {
condition = factor(mdata[match(colnames(m), mdata[,"labExpId"]), fields])
if (opt$verbose) {print(condition)}

# create count object for edgeR
M = DGEList(counts=na.omit(m), group = condition)
if (opt$verbose) {cat(nrow(M$counts), "/", nrow(m), "did not contain NAs\n\n")}

# This also estimates the library size as the sum of all reads in each sample
# To change the library size
# M$samples$lib.size <- colSums(M$counts)

# TO DO: Add option to provide the library size 


# ~~~~ Normalisation ~~~~~

# Divide by the library size
cpm.m <- cpm(M)
# Filter by cpm and number of samples where the gene is expressed
M <- M[rowSums(cpm.m > opt$cpm) >= opt$n_samples,]
if(opt$verbose) {cat(nrow(M$counts), "passed the required filters\n\n")}

# Normalize by TMM
M <- calcNormFactors(M, method="TMM")

# variance estimation
M <- estimateCommonDisp(M, verbose=T)
M <- estimateTagwiseDisp(M)

# calling differential expression
et <- exactTest(M)
res = topTags(et, n=nrow(et))$table

# Add the estimated averages for each level (the function is a subset of exactTest)
for(lev in levels(condition)) {
#	avg = mglmAverage(M, lev)
#	res[lev] = round(mglmAverage(M, lev)[rownames(res)], digits=2)
	res = cbind(avg=round(mglmAverage(M, lev)[rownames(res)], digits=2), res)
	colnames(res)[which(colnames(res) == "avg")] <- lev
}

# MULTIPLE CONDITIONS

}else{
#design_df = mdata[mdata[,"labExpId"] %in% colnames(m),]
#design_df = droplevels(design_df)
#design_df = design_df[order(design_df$labExpId),]
#design = model.matrix(as.formula(opt$formula), design_df[fields])
design = model.matrix(as.formula(opt$formula), mdata[fields])
#print(colnames(design))
rownames(design) <- mdata[, "labExpId"]

if (opt$tissue_sp) {
	mdata[,fields] <- factor(mdata[,fields])
	contrasts(mdata[,fields]) <- contr.sum(levels(mdata[,fields]))
	design = model.matrix(as.formula(opt$formula), mdata[fields])
	rownames(design) <- mdata[, "labExpId"]
}

print(colnames(design))

cat('\n')


# create count object for edgeR
M = DGEList(counts=na.omit(m))

# normalisation
cpm.m <- cpm(M)
M <- M[rowSums(cpm.m > opt$cpm) >= opt$n_samples,]
#M$samples$lib.size <- colSums(M$counts)
M <- calcNormFactors(M)

# variance estimation
M <- estimateGLMCommonDisp(M, design, verbose=T)
M <- estimateGLMTrendedDisp(M, design)
M <- estimateGLMTagwiseDisp(M, design)


# calling differential expression
if (opt$tissue_sp) {
	fit <- glmQLFit(M, design)
	lrt <- glmQLFTest(fit, coef=coeff, contrast=contr)
} else {
	fit <- glmFit(M, design)
	lrt <- glmLRT(fit, coef=coeff, contrast=contr)
}
res = topTags(lrt, n=nrow(lrt))$table
}

# format the otuput and write to a file
if(length(coeff)<2) {res$logFC <- round(res$logFC, 2)}
res$logCPM <- round(res$logCPM, 2)
res$PValue <- format(res$PValue, digits=2)
res$FDR <- format(res$FDR, digits=2)

output = sprintf("%s/edgeR.cpm%s.n%s.%s", opt$output_dir, opt$cpm, opt$n_samples, opt$output)
write.table(res, file=sprintf("%s.tsv", output), quote=F, sep='\t',row.names=T)
#write.table(cpm.m, file=sprintf("cpm.%s.tsv",opt$output), quote=F, sep='\t',row.names=T)

#------#
# PLOT #
#------#

pdf(sprintf('%s.pdf', output))

if (length(coeff)<2) {
	# MA plot
	gp = ggplot(res, aes(x=logCPM, y=logFC))
	gp = gp + geom_point(shape=1, aes(color=cut(as.double(FDR), breaks=c(0, 0.01, 0.05, 1))))
	gp = gp + scale_color_brewer(name="FDR", palette="Set1")
	gp = gp + geom_hline(yintercept=c(2,-2))
	gp
	
	# Volcano plot
	gp = ggplot(res, aes(x=logFC, y=-log10(as.double(FDR))))
	gp = gp + geom_point(shape=1, aes(color=cut(as.double(FDR), breaks=c(0, 0.01, 0.05, 1))))
	gp = gp + scale_color_brewer(name="FDR", palette="Set1")
	gp = gp + scale_y_log10()
	gp = gp + geom_vline(xintercept=c(2,-2))
	gp
	
}

# plot dispersion estimates
plotBCV(M)
dev.off()










q(save='no')


back to top