https://github.com/anthonyuren/MuLV_pipeline
Tip revision: eea420f3dabc5b338126999a88c1f1da6fd5a8a1 authored by anthonyuren on 03 June 2018, 13:04:52 UTC
Update README.md
Update README.md
Tip revision: eea420f
QC1Fast.R
setwd(".")
library(plyr)
#Script that take the mouse ID and sample ID for each replicated insertion
# run this section once to break the insert list up by chromosomes to speed up processing of next steps
#read input file, a large matrix but only a handful of fields will be used
m <- read.table("clon10bp.txt", sep="\t", row.names=NULL, stringsAsFactors=FALSE)
#dilution/mixed samples are removed from the entire dataset prior to filtering
m<-m[(m[,13]!=1208),]
m<-m[(m[,13]!=1189),]
# #order by orientation, chromosome and then startpos
x<-m[order(m[,5], m[,1] ,as.numeric(m[,2])),]
#for each chromosome create an insert file
for (i in 1:max(x[,1])) {
write.table( x[(x[1]==i),] , file=paste(i,"_chr.txt", sep=""), sep="\t", col.names = F, row.names = F, quote=FALSE)
}
##############################################################################
# args <- commandArgs(trailingOnly = FALSE)
chr_files <- dir(pattern = "chr.txt") #list all the relevant files
for (file_num in 1:length(chr_files)) { # for each file i.e. for each chromosome forward file and reverse file
myargument <- chr_files[file_num]
myargument2 <- sub(".txt","",myargument)
cat(myargument, "\n")
cat(myargument2, "\n")
# cat(myargument)
# cat(paste(myargument, ".txt", sep=""))
# Read in the matrix file
x <- read.table(myargument, sep="\t", stringsAsFactors=FALSE, header=FALSE, row.names=NULL)
row.names(x)<-NULL
#matrix that contains only orientation, best base, sample id, insert id, mouse id, chromosome, pos, end pos
contam<-x[,c(5,20,12,38,13,1,18,2,3)]
colnames(contam)<-c("ori", "bestbase", "sample", "insert_id", "mouse","chr", "fragments","startpos","endpos")
#windowSize<-9 ###
duplicate<-matrix(nrow=0, ncol=10)
colnames(duplicate)<-c("ori", "bestbase", "sample", "insert_id", "mouse","chr", "fragments","startpos","endpos", "dup")
for (i in 1:nrow(contam)){
# Determine the current position and criteria
currentChr <- contam[i,6]
currentOri <- contam[i,1]
currentBestBase <- contam[i,2]
gap <- 0
if(i < nrow(contam) && i > 1 ) { # if i is not the first or last line of inserts_table
as.numeric(unlist(strsplit(currentBestBase, ", ")))
gap1 <- contam$endpos[i+1] - contam$startpos[i] #gap ahead
gap2 <- contam$startpos[i] - contam$endpos[i-1] #gap behind
gap <- min(c(gap1,gap2))
}
if (gap <1000) { #if the next base or previous base is close (1000 is overkill, 1 should do it)
if (grepl(",",currentBestBase)){
density<-0
for (el in unlist(strsplit(currentBestBase, ", "))) {
# Count how many inserts appear within the window and meet the qualifying criteria
density <- max(density, nrow(contam[(contam[6]==currentChr & grepl(paste("\\<",el,"\\>", sep=""), contam[,2]) & contam[1]==currentOri),]))
}
duplicateV<-cbind(contam[i,],density)
duplicate<-rbind(duplicate, duplicateV)
} else{
# Count how many inserts appear within the window and meet the qualifying criteria
density <- nrow(contam[(contam[6]==currentChr & grepl(paste("\\<",currentBestBase,"\\>", sep=""), contam[,2]) & contam[1]==currentOri),])
duplicateV<-cbind(contam[i,],density)
duplicate<-rbind(duplicate, duplicateV)
}
} else { # if the next base and previous base are more than "gap" away
density <- 1
duplicateV<-cbind(contam[i,],density)
duplicate<-rbind(duplicate, duplicateV)
}
if (i%%1000==0){
cat(i)
}
}
duplicate <- duplicate[,c(1,2,3,4,5,6,7,10)]
write.table(duplicate, file=paste(myargument2, "_dup.txt", sep=""), sep="\t", col.names = F, row.names = F, quote=FALSE)
}