https://github.com/spleonard1/Tn-seq
TnSeqDESeq2.R
# TnSeqDESeq() is defined as a function whose return value is a table of differential fitness results; source this file into R/RStudio for interactive analyses, or run it from the command line (see the bottom of the file).
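# Arguments (as inferred from their use below):
#   ctrl_pfx, test_pfx    sample-name prefixes for the control and test conditions
#   ctrl_reps, test_reps  number of replicates in each condition
#   gff_pfx               prefix of the <gff_pfx>.trunc.gff annotation file
#   out_pfx               prefix used for all output files
#   to_trim               number of leading rows to drop from each sites file
#   in_files              directories containing the <name>/<name>-sites.txt count files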

TnSeqDESeq <- function(ctrl_pfx, ctrl_reps, test_pfx, test_reps, gff_pfx, out_pfx, to_trim, in_files) {
	# Read in the per-site read-count files (one <name>/<name>-sites.txt table per input directory)
	library(dplyr)
	sites <- data.frame(Pos=c(0)) 
	for (i in 1:length(in_files)) {
		newsites <- read.table(paste(paste(in_files[i], in_files[i], sep="/"), "sites.txt", sep="-")) 
		colnames(newsites) <- c(paste("V", i, sep=""), "Pos")
		newsites <- tail(newsites, n=-to_trim) %>% arrange(Pos)
		sites <- merge(sites, newsites, all=T) 
	}
	sites <- tail(sites, n=-1)
	sites[is.na(sites)] <- 0
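	# At this point `sites` has one row per insertion position (the dummy Pos=0 seed row was
	# dropped above) and one read-count column per input library, with missing sites set to 0.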
	
	# OPTIONAL - site filtering. Example: only keep sites detected in at least 2 of the replicates.
	# The V1..V6 column names are hard-coded for 6 total libraries; edit them to match ctrl_reps + test_reps.
	#sites <- sites %>% mutate(numreps = (V1 > 0) + (V2 > 0) + (V3 > 0) + (V4 > 0)) %>% filter(numreps >= 2) %>% select(-numreps)
	sites <- sites %>% mutate(numreps = (V1 > 0) + (V2 > 0) + (V3 > 0) + (V4 > 0) + (V5 > 0) + (V6 > 0)) %>% filter(numreps >= 2) %>% select(-numreps)
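	# A generalised alternative (a sketch, not used above): count the libraries with non-zero
	# reads at each site without hard-coding the V1..V6 column names.
	#sites$numreps <- rowSums(sites[grep("^V", names(sites))] > 0)
	#sites <- sites[sites$numreps >= 2, ]
	#sites$numreps <- NULL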
	
	# Optional: LOESS-smooth the counts along the genome to correct for positional bias.
	# Disabled here (it did not appear necessary); uncomment the block below to re-enable it.
	#for (i in 2:(length(sites))) { 
	#	counts.loess <- loess(sites[,i] ~ sites$Pos, span=1, data.frame(x=sites$Pos, y=sites[,i]), control=loess.control(statistics=c("approximate"),trace.hat=c("approximate")))
	#	counts.predict <- predict(counts.loess, data.frame(x=sites$Pos))
	#	counts.ratio <- counts.predict/median(counts.predict)
	#    sites[,i] <- sites[,i]/counts.ratio
	#}
	
	# Normalize the data by reads/site using DESeq2 size factors
	library(DESeq2)
	colData <- data.frame(sample=c(rep(ctrl_pfx, ctrl_reps), rep(test_pfx, test_reps)), condition=c(rep("untreated", ctrl_reps), rep("treated", test_reps)))
	sitescds <- sites[,2:length(sites)] %>% round %>% DESeqDataSetFromMatrix(colData = colData, design = ~ condition)
	sitescds <- estimateSizeFactors(sitescds)
	counts.norm <- counts(sitescds, normalized=TRUE)
	rownames(counts.norm) <- sites$Pos
	counts.df <- data.frame(counts.norm)
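	# Optional sanity check: the per-library size factors should be of comparable magnitude;
	# a strong outlier usually points to a library with very different sequencing depth.
	#print(sizeFactors(sitescds))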

	# Initialize the list of genes, read counts per gene, and number of independent Tn sites per gene
	gff <- read.delim(file=paste(gff_pfx, ".trunc.gff", sep=""), sep="\t", fill=TRUE, header=FALSE, col.names = c("seqname", "source", "feature", "start", "end", "score", "strand", "frame", "att"))
	print(head(gff))
	#gff <- tail(gff, n=-2)
	genomelength <- as.numeric(max(gff$end)*1.1)
	gff <- gff[(gff$feature=="gene"),]
	controlreps <- 0
	testreps <- 0
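	# Append one placeholder column per library to the gene table; the columns are named
	# <prefix>.<replicate>, and those names become the headers of the binned gene counts below.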
	for (c in 1:length(counts.norm[1,])) {
		gff[,c+9] <- rep(1,length(gff[,1]))
		if (controlreps < ctrl_reps) {
			controlreps <- controlreps + 1
			colnames(gff)[c+9] <- paste(ctrl_pfx, controlreps, sep=".")
		}
		else {
			testreps <- testreps + 1
			colnames(gff)[c+9] <- paste(test_pfx, testreps, sep=".")
		}
	}
	
	# Output gene boundaries and read counts per Tn site for Perl binning script
	print("Binning read counts by gene boundaries")
	boundariesfile <- paste(out_pfx, ".boundaries.tsv", sep="")
	sitecountsfile <- paste(out_pfx, ".sitecounts.tsv", sep="")
	write.table(gff[,c(4,5, 10:length(gff))], boundariesfile, quote=FALSE, sep="\t", row.names=F)
	write.table(counts.df, sitecountsfile, quote=FALSE, sep="\t", row.names=T)
	system(paste("perl ~/local/bin/TnGeneBin.pl", boundariesfile, sitecountsfile))
	genecounts <- read.table(paste(boundariesfile, "out", sep="."), header=T)[,-c(1,2)]
	numsites <- read.table(paste(boundariesfile, "numsites.out", sep="."), header=T)[,-c(1,2)]
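	# TnGeneBin.pl is expected to write two files next to the boundaries file: <boundaries>.out
	# (read counts binned per gene) and <boundaries>.numsites.out (number of distinct insertion
	# sites per gene), judging by how they are read back in on the two lines above.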
	#system(paste("rm", boundariesfile,
	#	paste(boundariesfile, "out", sep="."),
	#	paste(boundariesfile, "numsites.out", sep=".")))

	# Uncomment this section if you have a kegg annotation description file of the genes and their products
	#genes <- read.delim(file=paste(gff_pfx, ".gene.products.kegg.txt", sep=""), sep="\t", header=TRUE)
	#rownames(genecounts) <- genes$Locus
	#write.table(genecounts, paste(out_pfx, ".genecounts.tsv", sep=""), quote=FALSE, sep="\t")

	# Use this section (the default) if you DO NOT have a kegg annotation description file; it builds the gene id table from the locus_tag attributes in the GFF
	genes <- data.frame(id = rep("", length(gff[,1])), stringsAsFactors = FALSE)
	for (i in 1:length(gff[,1])) {
		genes[i,1] <- strsplit(grep("locus_tag",strsplit(as.character(gff$att[i]),";")[[1]], value=T),"=")[[1]][2]
		#genes[i,2] <- strsplit(grep("product",strsplit(as.character(gff$att[i]),";")[[1]], value=T),"=")[[1]][2]
		#genes[i,3] <- as.character(gff$KO[i])
		#genes[i,4] <- as.character(gff$pathways[i])	
	}
	colnames(genes) <- c("id")
	write.table(cbind(id=genes$id, genecounts), paste(out_pfx, ".genecounts.tsv", sep=""), quote=FALSE, sep="\t", row.names=FALSE)
	
	# Perform differential fitness analysis and output results
	colnames(numsites) <- colnames(gff)[10:length(gff)]
	
	genescds <- DESeqDataSetFromMatrix(countData = round(genecounts), colData=colData, design = ~ condition)
	#genescds$sizeFactor <- rep(1, length(genecounts[1,])) # This is manually set as 1 because we normalized by site above
	#genescds <- estimateDispersions(genescds, fitType="local") # Use this if estimateDispersions fails
	genescds <- estimateSizeFactors(genescds)
	genescds <- estimateDispersions(genescds)
	genescds <- nbinomWaldTest(genescds)
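	# estimateSizeFactors/estimateDispersions/nbinomWaldTest is the same sequence that DESeq()
	# runs internally; calling the steps separately makes it easy to swap in the fitType="local"
	# fallback above if the default parametric dispersion fit fails.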
	
	res <- results(genescds, contrast = c("condition", "treated", "untreated"))
	print(head(res)) 
	#colnames(res)[3] <- paste(ctrl_pfx, "Mean", sep="")
	#colnames(res)[4] <- paste(test_pfx, "Mean", sep="")
	res <- cbind(res, genes[,1], numsites) # genes built from the GFF above (no kegg annotation)
	#res <- cbind(res, genes[,1:2], numsites) %>% tbl_df # Uncomment this line (and comment the one above) if you have a kegg annotation with gene products
	colnames(res)[7] <- "id"
	write.table(res, file=paste(out_pfx, ".DESeq.tsv", sep=""), quote=FALSE, row.names=FALSE, sep="\t")
	return(res)
}
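# Command-line usage: the first seven arguments are positional and any remaining arguments are
# treated as sites-file directories (control libraries first, then test libraries), e.g. with
# hypothetical sample names and values:
#   Rscript TnSeqDESeq2.R ctrl 3 test 3 mygenome out 0 ctrl_1 ctrl_2 ctrl_3 test_1 test_2 test_3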
Args <- commandArgs(TRUE)
TnSeqDESeq(Args[1], as.numeric(Args[2]), Args[3], as.numeric(Args[4]), Args[5], Args[6], as.numeric(Args[7]), Args[-(1:7)])