Content - 58e60b1d2f2b4ed580c087f73117b8946e60a99b - f1fa145/Real data/softImpute_Karlsson.R

visit type:
Tip revision: 7074d3edcc2a250ec9d5139af126f786f7c060a2 authored by ruochenj on 15 March 2020, 02:07:42 UTC
Add files via upload
Tip revision: 7074d3e
softImpute_Karlsson.R
# run softImpute on real data
library(softImpute)
si_impute <- function(sim_tab){
  y_sim = sim_tab
  # add filter
  filter <- lapply(X = 1:ncol(y_sim), FUN = function(col_i){
    y = y_sim[,col_i]
    n = length(y)
    nz <- sum(y <= (log10(1.01) + 1e-6))
    pz = 1 - nz/n
    test = pz - 1.96 * sqrt(pz * (1-pz)/n)
    if(nz == n || test <= 0){
      return(0)
    }else{
      return(1)
    }
  })
  y_imp <- y_sim
  #perform imputation on the rest
  filter_vec <- which(unlist(filter) == 1)
  y_sim = y_sim[, filter_vec]
  
  na_idx <- y_sim < log10(1.01) + 0.001
  y_sim[y_sim < log10(1.01) + 0.001] = NA
  
  # y_sim_cv <- unlist(y_sim)
  # y_sim_validate <- matrix(y_sim_cv, nrow(y_sim), ncol = ncol(y_sim))
  # identical(y_sim_validate, y_sim)
  y_sim_cv <- unlist(y_sim)
  na_intro <- sample(which(!is.na(y_sim_cv)), floor(sum(!is.na(y_sim_cv))/10))
  y_sim_cv_intro <- y_sim_cv
  y_sim_cv_intro[na_intro] = NA
  y_sim_cv_intro <- matrix(y_sim_cv_intro, nrow = nrow(y_sim), ncol = ncol(y_sim))
  
  j = 1
  se = 1e10
  for(i in 1:5){
    si_cv_1 <- softImpute(y_sim_cv_intro, rank.max = i, lambda = 0)
    y_imp_cv <- complete(y_sim_cv_intro, si_cv_1)
    y_sim_vali <- as.vector(y_imp_cv)
    se2 <- sum((y_sim_cv[na_intro] - y_sim_vali[na_intro])^2)
    if(se2 < se){
      se = se2
      j = i
    }
  }
  
  si1 <- softImpute(y_sim_cv_intro, rank.max = j, lambda = 0, trace.it = TRUE)
  impute_mat <- complete(y_sim_cv_intro, si1)
  
  y_imp[, filter_vec] = impute_mat
  
  return(y_imp)
}

# si_imp_crc = si_impute(sim_tab_zi_CRC)
# si_imp_control = si_impute(sim_tab_zi_control)

otu_real_data_T2D <- read.csv("otu_real_data_T2D.csv", row.names = "X")
otu_real_data_IGT <- read.csv("otu_real_data_IGT.csv", row.names = "X")
otu_real_data_control <- read.csv("otu_real_data_control.csv", row.names = "X")

T2D_mat_si <- si_impute(otu_real_data_T2D)
control_mat_si <- si_impute(otu_real_data_control)
IGT_mat_si <- si_impute(otu_real_data_IGT)


#regroup the conditions
mat_imp_si <- matrix(1, ncol = m, nrow = n)
mat_imp_si[T2D_idx,] = as.matrix(T2D_mat_si)
mat_imp_si[control_idx,] = as.matrix(control_mat_si)
mat_imp_si[IGT_idx,] = as.matrix(IGT_mat_si)

########imputation2#############
imp_mat2 <- mat_imp

set.seed(42) # Set a seed if you want reproducible results
library(Rtsne)
imp_rtsne_out <- Rtsne(imp_mat2) # Run TSNE

imp_mat2[imp_mat2 > 6] = 5

png("plots/si_rtsne_imp.png")
plot(imp_rtsne_out$Y, col = as.factor(study_con))
dev.off()
library(tsne)
imp_tsne_out <- tsne(imp_mat2)
png("plots/si_tsne_imp.png")
plot(imp_tsne_out, col = as.factor(study_con))
dev.off()
imp_pca_out <- prcomp(imp_mat2, center = TRUE)
par(mfrow = c(1,1))
png("plots/si_pca_1_2_imp.png")
plot(imp_pca_out$x[,1:2], col = as.factor(study_con))
dev.off()
png("plots/si_pca_1_3_imp2.png")
plot(imp_pca_out$x[,c(1,3)], col = as.factor(study_con))
dev.off()
png("plots/si_pca_2_3_imp.png")
plot(imp_pca_out$x[,2:3], col = as.factor(study_con))
dev.off()

png("plots/si_htmap_imp.png")
heatmap(as.matrix(imp_mat2), Colv = NA, Rowv = NA, scale="row")
dev.off()

png("plots/si_htmap_imp_ordered.png")
heatmap(as.matrix(imp_mat2), scale = "row")
dev.off()

png("plots/si_htmap_imp_dis.png")
distmat <- as.matrix(dist(as.matrix(imp_mat2)))
diag(distmat) = max(distmat)
heatmap(distmat)
dev.off()

selected_taxa <- c()
p_T2D_control <- rep(1, m)
p_T2D_IGT <- rep(1, m)
p_IGT_control <- rep(1, m)
for(j in 1:m){
  #test <- pairwise.wilcox.test(mat_imp[,j], g = study_con, p.adjust.method = "none", alternative = "greater")
  test <- pairwise.t.test(mat_imp_si[,j], g = study_con, p.adjust.method = "none", alternative = "greater")
  p_T2D_control[j] = test$p.value[2,1] #T2D > control
  p_T2D_IGT[j] <- test$p.value[2,2] #T2D > IGT
  p_IGT_control[j] <- test$p.value[1,1] #IGT > control
}

#install.packages("fdrtool")
grt_T2D_control <- which(p.adjust(p_T2D_control, method = "fdr") < 0.1)
grt_T2D_IGT <- which(p.adjust(p_T2D_IGT, method = "fdr") < 0.1)
grt_IGT_control <- which(p.adjust(p_IGT_control, method = "fdr") < 0.1)

grt_tc_df <- cbind(grt_T2D_control, p.adjust(p_T2D_control, method = "bonferroni")[grt_T2D_control])
grt_ti_df <- cbind(grt_T2D_IGT, p.adjust(p_T2D_IGT, method = "bonferroni")[grt_T2D_IGT])
grt_ic_df <- cbind(grt_IGT_control, p.adjust(p_IGT_control, method = "bonferroni")[grt_IGT_control])

p_T2D_control2 <- rep(1, m)
p_T2D_IGT2 <- rep(1, m)
p_IGT_control2 <- rep(1, m)
for(j in 1:m){
  #test <- pairwise.wilcox.test(mat_imp[,j], g = study_con, p.adjust.method = "none", alternative = "less")
  test <- pairwise.t.test(mat_imp_si[,j], g = study_con, p.adjust.method = "none", alternative = "less")
  p_T2D_control2[j] = test$p.value[2,1] #T2D < control
  p_T2D_IGT2[j] <- test$p.value[2,2] #T2D < IGT
  p_IGT_control2[j] <- test$p.value[1,1] #control < IGT
}

less_T2D_control <- which(p.adjust(p_T2D_control2, method = "fdr") < 0.1)
less_T2D_IGT <- which(p.adjust(p_T2D_IGT2, method = "fdr") < 0.1)
less_IGT_control <- which(p.adjust(p_IGT_control2, method = "fdr") < 0.1)

less_tc_df <- cbind(less_T2D_control, p.adjust(p_T2D_control2, method = "bonferroni")[less_T2D_control])
less_ti_df <- cbind(less_T2D_IGT, p.adjust(p_T2D_IGT2, method = "bonferroni")[less_T2D_IGT])
less_ic_df <- cbind(less_IGT_control, p.adjust(p_IGT_control2, method = "bonferroni")[less_IGT_control])

record_table_imputed <- rbind(cbind(length(less_tc_df)/2, length(less_ti_df)/2, length(less_ic_df)/2),
                              cbind(length(grt_tc_df)/2, length(grt_ti_df)/2, length(grt_ic_df)/2))
colnames(record_table_imputed) <- c("T2D VS Control", "T2D vs IGT", "IGT vs Control")
rownames(record_table_imputed) <- c("less", "greater")

intersect_table <- rbind(cbind(length(intersect(Karlsson_raw_counts_compare$less_tc_df[,1], less_tc_df[,1])), length(intersect(Karlsson_raw_counts_compare$less_ti_df[,1], less_ti_df[,1])), length(intersect(Karlsson_raw_counts_compare$less_ic_df[,1], less_ic_df[,1]))),
                         cbind(length(intersect(Karlsson_raw_counts_compare$greater_tc_df[,1], grt_tc_df[,1])), length(intersect(Karlsson_raw_counts_compare$greater_ti_df[,1], grt_ti_df[,1])), length(intersect(Karlsson_raw_counts_compare$greater_ic_df[,1], grt_ic_df[,1]))))

write.csv(intersect_table, "summary/si_intersect_table_wct.csv")
write.csv(record_table_imputed, "summary/si_record_table_imputed_wct.csv")

write.csv(colnames(otable)[ grt_tc_df[which(! grt_tc_df[,1] %in%  Karlsson_raw_counts_compare$greater_tc_df[,1]), 1] ]
          , "summary/si_newly_found_taxa_tc.csv")
write.csv(colnames(otable)[ grt_tc_df[which(grt_tc_df[,1] %in% Karlsson_raw_counts_compare$greater_tc_df[,1]),1] ]
          , "summary/si_overlapped_taxa_tc.csv")
write.csv(colnames(otable)[Karlsson_raw_counts_compare$greater_tc_df[,1]], "summary/si_raw_identified_taxa_tc.csv")

write.csv(colnames(otable)[ grt_ti_df[which(! grt_ti_df[,1] %in%  Karlsson_raw_counts_compare$greater_ti_df[,1]), 1] ]
          , "summary/si_newly_found_taxa_ti.csv")
write.csv(colnames(otable)[ grt_ti_df[which(grt_ti_df[,1] %in% Karlsson_raw_counts_compare$greater_ti_df[,1]),1] ]
          , "summary/si_overlapped_taxa_ti.csv")
write.csv(colnames(otable)[Karlsson_raw_counts_compare$greater_ti_df[,1]], "summary/si_raw_identified_taxa_ti.csv")

write.csv(colnames(otable)[ grt_ic_df[which(! grt_ic_df[,1] %in%  Karlsson_raw_counts_compare$greater_ic_df[,1]), 1] ]
          , "summary/si_newly_found_taxa_ic.csv")
write.csv(colnames(otable)[ grt_ic_df[which(grt_ic_df[,1] %in% Karlsson_raw_counts_compare$greater_ic_df[,1]),1] ]
          , "summary/si_overlapped_taxa_ic.csv")
write.csv(colnames(otable)[Karlsson_raw_counts_compare$greater_ic_df[,1]], "summary/si_raw_identified_taxa_ic.csv")

sum(length(grt_T2D_control), length(grt_IGT_control), length(grt_T2D_IGT))
# 203
Browse the archive

https://github.com/ruochenj/MBImpute