library(tidyverse) library(dplyr) library(pkgsearch) library(factoextra) library(NbClust) --- Identify K clusters ------- clusters.R12methrate<- read.table("/groups/nodine/lab/members/Ranjith/mCHG/BS_all/pooledTsvs/BC_R12.CHG.bed.allClusters") clusters.WTmethrate<- read.table("/groups/nodine/lab/members/Ranjith/mCHG/BS_all/pooledTsvs/BC_WT.CHG.bed.allClusters") clusters.R12_WT.methrate<-as.data.frame(clusters.R12methrate[,c(4:7)])%>%inner_join(clusters.WTmethrate[,c(4:7)],by="V4") clusters.R12_WT.methrate<-clusters.R12_WT.methrate%>%filter() median.diff<-as.data.frame(aggregate((V6.x-V6.y)~V5.x,clusters.R12_WT.methrate,median)) par(mfrow=c(3,3),las=2, tcl=-0.35) mp<-barplot(median.diff[,2]*100,border=NA,names.arg=median.diff[,1],xaxt="n", ylim=c(0,17)) axis(1, at=mp, labels=c(1:4), las=2) #pc.genes <- read.table('/groups/nodine/lab/members/Ranjith/annotations_ranj/gene_types/protein_coding.txt') #mc.genes = read.table('/groups/nodine/lab/members/Ranjith/annotations_ranj/gene_types/mitochondrial_chloroplast.txt') #nm.genes = read.table('/groups/nodine/lab/members/Ranjith/annotations_ranj/gene_types/nuclear_mitochondria.txt') #ercc.genes = read.table('/groups/nodine/lab/members/Ranjith/annotations_ranj/gene_types/ercc_spike_ins.txt') #protein.coding.genes <- pc.genes[!pc.genes %in% c(mc.genes,nm.genes)] #names(protein.coding.genes)<-"V4" #genes.bed<-read.table("/groups/nodine/lab/members/Ranjith/mCHG/BS_all/pooledTsvs/nonAth_TAIR10_protein_coding.bed.ID") #genes.bed<-as.data.frame(genes.bed) #PCG.genes.bed<-as.data.frame(protein.coding.genes)%>%inner_join(genes.bed,by="V4") #PCG.genes.bed<-PCG.genes.bed[,c(2:4,1,5:7)] #write.table(PCG.genes.bed,"/groups/nodine/lab/members/Ranjith/mCHG/BS_all/pooledTsvs/nonAth_TAIR10_protein_coding.bed",quote=F) ###3 Cluster siRNA.TE<- list.files("/groups/nodine/lab/members/Ranjith/mCHG/BS_all/pooledTsvs",pattern = "CHG.Genemapped", full.names=T) siRNA.TE<-siRNA.TE[c(3,6,7,10,13,14)] siRNA.TEmap = as.data.frame(do.call(cbind,lapply(siRNA.TE , function(x) { DF <- read.table(x) colnames(DF)<- c(paste0(basename(x),'.chr'),paste0(basename(x),'.start'), paste0(basename(x),'.end'),paste0(basename(x),'.AGI'), paste0(basename(x),'.V5'),paste0(basename(x),'.Strand'),paste0(basename(x),'.V6'), paste0(basename(x),'.count'),paste0(basename(x),'.methRate')) return(DF)}))) names(siRNA.TEmap)<-gsub(x = names(siRNA.TEmap), pattern = ".CHG.Genemapped", replacement = "") siRNA.TEmap1<-siRNA.TEmap%>%dplyr::select(matches("BC_Col_old.chr|BC_Col_old.start|BC_Col_old.end|BC_Col_old.AGI|BC_Col_old.V5|BC_Col_old.Strand|BC_Col_old.V6|methRate")) names(siRNA.TEmap1) siRNA.TEmap1$r12.wt.diff.emb<-siRNA.TEmap1$BC_R12.methRate-siRNA.TEmap1$BC_Col_old.methRate mclusted.sirna<- Mclust(siRNA.TEmap1$r12.wt.diff.emb,G=4, modelNames="V") siRNA.TEmap1 <- cbind(siRNA.TEmap1,mclusted.sirna$classification) names(siRNA.TEmap1) colnames(siRNA.TEmap1)[15] <-"mClust" table(siRNA.TEmap1$mClust) boxplot(siRNA.TEmap1$r12.wt.diff.emb~siRNA.TEmap1$mClust,outline=F) boxplot((siRNA.TEmap1$BC_Col_old.end-siRNA.TEmap1$BC_Col_old.start)~siRNA.TEmap1$mClust,outline=F) #siRNA.TEmap1$ntiles<- ntile(siRNA.TEmap1$r12.wt.diff.emb, 5) #boxplot(siRNA.TEmap1$r12.wt.diff.emb~siRNA.TEmap1$ntiles,outline=F) #km.res <- kmeans(siRNA.TEmap1$r12.wt.diff.emb, 4, nstart = 25) #siRNA.TEmap1$clusters<-km.res$cluster #table(siRNA.TEmap1$clusters) #boxplot(siRNA.TEmap1$r12.wt.diff.emb~siRNA.TEmap1$clusters,outline=F) #set.seed(123) siRNA.TEmap1$TAIR10<-"TAIR10" siRNA.TEmap1$gene<-"gene" forgtfs<-siRNA.TEmap1[,c(1,16,17,2:3,5:6,15,4)] head(forgtfs) #write.table(forgtfs,"/groups/nodine/lab/members/Ranjith/mCHG/main.allclusters.txt",quote=F) library(mclust) BIC.mCHGdiff <- mclustBIC(te.24nt.mclust.mean1,G=seq(2,20),by=2) #plot(BIC.siRNA,las=2) #abline(v=8) siRNA.TEmap1$TAIR10<-"TAIR10" siRNA.TEmap1$gene<-"gene" forgtfs<-siRNA.TEmap1[,c(1,16,17,2:3,5:6,18,4)]