# TnSeqDESeq2Essential.R
# From https://github.com/spleonard1/Tn-seq (revision f6d2005)
# Defined as a function; returns a table of differential fitness results with essentiality calls (source this file into R/RStudio for interactive analyses)
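# A minimal interactive-use sketch (prefixes and file names below are illustrative
# assumptions, not part of the pipeline):
#   source("TnSeqDESeq2Essential.R")
#   res <- TnSeqDESeqEssential(ctrl_pfx="Ctrl", ctrl_reps=2, gff_pfx="genome",
#                              out_pfx="results", to_trim=50, num_expected=10,
#                              in_files=c("Ctrl-rep1", "Ctrl-rep2"))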
TnSeqDESeqEssential <- function(ctrl_pfx, ctrl_reps, gff_pfx, out_pfx, to_trim, num_expected, in_files) {
	# Read in sites files (one per replicate)
	library(dplyr)
	library(readr)
	# Seed with a dummy row so the first merge has a Pos column; dropped below
	sites <- data.frame(Pos=c(0))
	for (i in 1:length(in_files)) {
		# Each sites file is <prefix>/<prefix>-sites.txt: column 1 = read count, column 2 = position
		newsites <- read.table(paste(paste(in_files[i], in_files[i], sep="/"), "sites.txt", sep="-"))
		colnames(newsites) <- c(paste("V", i, sep=""), "Pos")
		# Drop the first to_trim rows (assumes count-sorted input, trimming the most abundant sites), then sort by position
		newsites <- tail(newsites, n=-to_trim) %>% arrange(Pos)
		sites <- merge(sites, newsites, all=T)
	}
	sites <- tail(sites, n=-1)   # drop the dummy seed row
	sites[is.na(sites)] <- 0     # sites absent from a replicate get zero reads

	# OPTIONAL - site filtering. Example: keep only sites detected in both of two
	# replicates, then drop the helper numreps column (column 4 here):
	#sites <- sites %>% mutate(numreps = (V1 > 0) + (V2 > 0)) %>% filter(numreps == 2)
	#sites <- sites[-4]
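	# The same filter sketched for three replicates (assumes columns V1..V3; the
	# helper column is then column 5):
	#sites <- sites %>% mutate(numreps = (V1 > 0) + (V2 > 0) + (V3 > 0)) %>% filter(numreps >= 2)
	#sites <- sites[-5]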
	
	# LOESS-smooth each replicate to correct for position-dependent bias along the genome
	for (i in 2:(length(sites))) {
		counts.loess <- loess(sites[,i] ~ sites$Pos, span=1, data.frame(x=sites$Pos, y=sites[,i]), control=loess.control(statistics="approximate", trace.hat="approximate"))
		counts.predict <- predict(counts.loess, data.frame(x=sites$Pos))
		# Scale each site by the local trend relative to the genome-wide median
		counts.ratio <- counts.predict/median(counts.predict)
		sites[,i] <- sites[,i]/counts.ratio
	}
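	# Optional diagnostic sketch: eyeball the corrected profile of the last replicate
	#plot(sites$Pos, sites[,length(sites)], pch=".", xlab="Position", ylab="Corrected reads/site")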

	# Normalize data across replicates by reads/site using DESeq2 size factors
	library(DESeq2)
	colData <- data.frame(sample = rep(ctrl_pfx, ctrl_reps), condition = rep("untreated", ctrl_reps))
	sitescds <- sites[,2:length(sites)] %>% round %>% DESeqDataSetFromMatrix(colData = colData, design = ~ 1)
	sitescds <- estimateSizeFactors(sitescds)
	# Extract the size-factor-normalized counts
	counts.norm <- counts(sitescds, normalized=T)
	rownames(counts.norm) <- sites$Pos
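	# Optionally persist the normalized per-site counts (illustrative file name):
	#write.table(counts.norm, paste(out_pfx, ".normsites.tsv", sep=""), quote=FALSE, sep="\t")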
	
	# Initialize the list of genes and estimate the genome length
	# KEGG/pathway info is no longer expected here; you'll have to combine that yourself outside of this script
	gff <- read.delim(file=paste(gff_pfx, ".trunc.gff", sep=""), sep="\t", fill=TRUE, header=FALSE, col.names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "att"))
	# Approximate the genome length from the right-most feature end (plus 10% padding)
	genomelength <- as.numeric(max(gff$end)*1.1)
	# Keep only gene features; change this filter if your gff file contains anything besides genes
	gff <- gff[(gff$feature=="gene"),]
	# Generate pseudo-datasets with the same number of insertion sites and total reads
	# mapping to those sites, randomly distributed across possible TA sites in the genome
	print("Generating pseudo-datasets")
	counts.df <- data.frame(counts.norm)
	counts.df$Pos <- as.numeric(rownames(counts.df))
	numreads <- sum(counts.norm)/ctrl_reps
	# Average number of occupied sites per replicate (rounded for sample())
	numsites <- round(length(which(counts.norm>0))/ctrl_reps)
	# Read the list of possible TA sites from "<gff_pfx>_TAsites.tsv" (expects a "sites"
	# column of positions) and sample only from TA sites to calculate essential genes
	TA_sites <- read_tsv(paste0(gff_pfx, "_TAsites.tsv"))
	for (i in 1:num_expected) {
		print(paste0("Building null dataset ", i))
		# Pair random TA positions with read counts resampled from replicate 1
		expected <- data.frame(Pos=sample(TA_sites$sites, numsites), Exp=sample(sites$V1, numsites)) %>% arrange(Pos)
		colnames(expected)[2] <- paste("Expected", i, sep=".")
		print(system.time(counts.df <- merge(counts.df, expected, by="Pos", all=T)))
		counts.df[is.na(counts.df)] <- 0
	}
	rownames(counts.df) <- counts.df$Pos
	counts.norm <- as.matrix(counts.df[,(2:length(counts.df))])

	# Add one placeholder column per dataset to the gff table; these carry the sample
	# names (control replicates first, then Expected.1..N) into the binning step
	controlreps <- 0
	expreps <- 0
	for (c in 1:length(counts.norm[1,])) {
		gff[,c+9] <- rep(1,length(gff[,1]))
		if (controlreps < ctrl_reps) {
			controlreps <- controlreps + 1
			colnames(gff)[c+9] <- paste(ctrl_pfx, controlreps, sep=".")
		}
		else {
			expreps <- expreps + 1
			colnames(gff)[c+9] <- paste("Expected", expreps, sep=".")
		}
	}

	# Output gene boundaries and read counts per Tn site for the Perl binning script
	# (TnGeneBin.pl is expected at ~/local/bin; adjust the path if yours differs)
	print("Binning read counts by gene boundaries")
	boundariesfile <- paste(out_pfx, ".boundaries.tsv", sep="")
	sitecountsfile <- paste(out_pfx, ".sitecounts.tsv", sep="")
	write.table(gff[,c(4,5, 10:length(gff))], boundariesfile, quote=FALSE, sep="\t", row.names=F)
	write.table(counts.df, sitecountsfile, quote=FALSE, sep="\t", row.names=F)
	system(paste("perl ~/local/bin/TnGeneBin.pl", boundariesfile, sitecountsfile))
	# TnGeneBin.pl writes per-gene read counts (.out) and per-gene site counts (.numsites.out)
	genecounts <- read.table(paste(boundariesfile, "out", sep="."), header=T)[,-c(1,2)]
	numsites <- read.table(paste(boundariesfile, "numsites.out", sep="."), header=T)[,-c(1,2)]
	# Clean up the intermediate binning files
	system(paste("rm", boundariesfile,
		paste(boundariesfile, "out", sep="."),
		paste(boundariesfile, "numsites.out", sep=".")))

	# Extract locus_tag ids for each gene from the gff attribute field
	genes <- data.frame(id = rep("", length(gff[,1])), stringsAsFactors = FALSE)

	for (i in 1:length(gff[,1])) {
		genes$id[i] <- strsplit(grep("locus_tag", strsplit(as.character(gff$att[i]), ";")[[1]], value=T), "=")[[1]][2]
	}

	write.table(genecounts, paste(out_pfx, ".genecounts.tsv", sep=""), quote=FALSE, sep="\t", row.names=FALSE)

	# Perform differential fitness analysis: observed replicates vs. Expected pseudo-datasets
	# Sample columns start at gff column 10 (after the nine standard gff fields)
	colnames(numsites) <- colnames(gff)[10:length(gff)]
	numsitesout <- data.frame(numsites[,(1:ctrl_reps)])
	numsitesout[,ctrl_reps+1] <- rowMeans(numsites[,-(1:ctrl_reps)])
	colnames(numsitesout)[ctrl_reps+1] <- "Expected"
	colData <- data.frame(sample = c(rep(ctrl_pfx, ctrl_reps), rep("Expected", num_expected)), condition = c(rep(ctrl_pfx, ctrl_reps), rep("Expected", num_expected)))
	genescds <- DESeqDataSetFromMatrix(countData = round(genecounts), colData = colData, design = ~ condition)
	genescds <- estimateSizeFactors(genescds)
	genescds <- estimateDispersions(genescds)
	genescds <- nbinomWaldTest(genescds)
	res <- results(genescds, contrast = c("condition", ctrl_pfx, "Expected"))
	print(head(res))
	# Append locus tags and per-gene Tn site counts to the DESeq2 results
	out <- cbind(as.data.frame(res), genes$id, numsitesout)
	colnames(out)[7] <- "id"

	# Perform bimodal clustering and essentiality calling: fit a 1- or 2-component
	# Gaussian mixture to the log2 fold changes; genes in the first component with a
	# negative fold change are called "Reduced" (candidate essential)
	library(mclust)
	fit <- Mclust(out$log2FoldChange, G=1:2)
	category <- rep("", length(out$id))
	for (i in 1:length(out$id)) {
		if (fit$classification[i] == 1 & out$log2FoldChange[i] < 0) {
			category[i] <- "Reduced"
		}
		else {
			category[i] <- "Unchanged"
		}
	}

	# Genes with positive fold changes are unambiguously non-essential
	fit$uncertainty[which(out$log2FoldChange > 0)] <- 0
	print(head(category, 10))
	essentiality <- as.data.frame(cbind(category, fit$uncertainty))
	colnames(essentiality) <- c("Essentiality", "Uncertainty")
	out <- cbind(out, essentiality)
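	# Optional quick summary of the essentiality calls (a sketch):
	#print(table(out$Essentiality))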
	write.table(out, file=paste(out_pfx, ".DESeq.tsv", sep=""), quote=FALSE, row.names=FALSE, sep="\t")
	return(out)
}
# Command-line entry point
# Args: ctrl_pfx ctrl_reps gff_pfx out_pfx to_trim num_expected in_files...
Args <- commandArgs(TRUE)
TnSeqDESeqEssential(Args[1], as.numeric(Args[2]), Args[3], Args[4], as.numeric(Args[5]), as.numeric(Args[6]), Args[-(1:6)])
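# Example invocation (all names below are illustrative assumptions):
#   Rscript TnSeqDESeq2Essential.R Ctrl 2 genome results 50 10 Ctrl-rep1 Ctrl-rep2
# i.e. two control replicates, trim the 50 most abundant sites, build 10 pseudo-datasets;
# expects genome.trunc.gff, genome_TAsites.tsv, and Ctrl-rep1/Ctrl-rep1-sites.txt etc. to exist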