https://github.com/jdaeth274/ISA
Raw File
Tip revision: c3873d851fdfb01efd8bb1f8a18f33acb06b6fc5 authored by jdaeth274 on 13 May 2021, 12:33:05 UTC
tweaking pbp blasting
Tip revision: c3873d8
gaph_results.R
###############################################################################
## Outputting flank scores for different cluster scores #######################
###############################################################################

require(ggplot2, quietly = TRUE)
require(stringr, quietly = TRUE)
require(ggpubr, quietly = TRUE)
require(tictoc, quietly = TRUE)
require(dplyr, quietly = TRUE)
require(asht, quietly = TRUE)

graph_getter <- function(dir_to_files,graph_name, current_flanks, insert_name){
  #browser()  

  last_character <- base::substr(dir_to_files,
                                 nchar(dir_to_files),
                                 nchar(dir_to_files))
  if(last_character != "/"){
    dir_to_files <- paste(dir_to_files, "/", sep = "")
  }
  summo_csv_file <- list.files(dir_to_files, pattern = "*species_compo*")
  summary_csv <- paste(dir_to_files, summo_csv_file,sep = "")

  summo_csv <- read.csv(summary_csv, stringsAsFactors = FALSE)
  blast_results <- list.files(dir_to_files, pattern = "*list*.csv")

  counted_summo <- plyr::count(summo_csv$insertion_point)  
  
  if(insert_name != "total"){
    counted_summo <- counted_summo[counted_summo$x %in% c(insert_name, "reference"),]
    narrowed_refs <- summo_csv[summo_csv$insertion_point == insert_name, 1]
    refs_to_use <- paste(narrowed_refs, "$",sep = "")
    summo_csv <- summo_csv[grepl(paste(refs_to_use, collapse = "|"), summo_csv[,1]),]
  }
  
  
  #############################################################################
  ## Ok, so now we've got the summary_csv, lets go through each of the ########
  ## insertion sites, getting overall pneumo weight by bitscore, rank of top ##
  ## pneumo hit and then top hit bit / bit of top pneumo hit ##################
  #############################################################################
  
  for(k in 1:nrow(counted_summo)){
    #browser()
    tic(paste(as.character(k),"row done"))
    current_insertion <- as.character(counted_summo[k, 1])
    subsetted_data <- summo_csv[summo_csv$insertion_point == current_insertion,]
    
    if(k == 1){
      results_df <- bitscore_res(subsetted_data, blast_results,
                                 current_insertion, dir_to_files, current_flanks)
      toc()
    }else{
      new_df <- bitscore_res(subsetted_data, blast_results,
                             current_insertion, dir_to_files, current_flanks)
      results_df <- rbind.data.frame(results_df, new_df)
      toc()
    }
    
  }
  
  total_run_through <- bitscore_res(summo_csv, blast_results,
                                    "total", dir_to_files, current_flanks)
  
  #browser()
  total_hist_weighted <- ggplot(data = total_run_through) + geom_boxplot(aes(x = insertion_loci, 
                                                                             y = weighted_bit)) + 
    labs(title = paste(graph_name, "Weighted total"))
  total_hist_ranked <- ggplot(data = total_run_through) + geom_boxplot(aes(x = insertion_loci,
                                                                           y = ranked_pneumo)) +
    labs(title = paste(graph_name, "ranked pneumo"))
  total_bit_ranked <- ggplot(data = total_run_through) + geom_boxplot(aes(x = insertion_loci,
                                                                          y = bit_pneumo)) +
    labs(title = paste(graph_name, "bitscore pneumo"))
  
  arranged_plots <- ggarrange(total_hist_weighted, total_hist_ranked, total_bit_ranked,
                              ncol = 3, nrow = 1, align = "hv")
  
  
  hist_weighted <- ggplot(data = results_df) + geom_boxplot(aes(x = insertion_loci, 
                                                                y = weighted_bit)) + 
    labs(title = paste(graph_name, "Weighted total"))
  hist_ranked <- ggplot(data = results_df) + geom_boxplot(aes(x = insertion_loci,
                                                              y = ranked_pneumo)) +
    labs(title = paste(graph_name, "ranked pneumo"))
  bit_ranked <- ggplot(data = results_df) + geom_boxplot(aes(x = insertion_loci,
                                                             y = bit_pneumo)) +
    labs(title = paste(graph_name, "bitscore pneumo"))
  
  
  
  
  
  return(list(by_insertion = results_df, total = total_run_through, total_plots = arranged_plots,
              weighted_hist = hist_weighted, ranked_hist = hist_ranked, ranked_bit = bit_ranked))
}

bitscore_res <- function(data_subset, blast_res_list, insertion_loci, dir_to_files, current_flanks){
  
  isolate_file <- paste(data_subset$isolate, "_species_list.csv",sep = "")
  files_to_open <- blast_res_list[which(blast_res_list %in% isolate_file)]
  files_to_open <- paste(dir_to_files, files_to_open, sep = "")
  
  #############################################################################
  ## So now we've got the vector of files to open in files_to_open, we'll loop#
  ## through these and get the weight by bitscores, rank of top_pneumo, and ###
  ## bitscore of top pneumo/top hit for each of the files #####################
  #############################################################################
  
  output_data <- data.frame(data = matrix(NA, nrow = length(files_to_open),
                                          ncol = 5))
  colnames(output_data) <- c("isolate","insertion_loci","weighted_bit","ranked_pneumo",
                             "bit_pneumo")
  output_data$isolate <- data_subset$isolate
  output_data$insertion_loci <- rep(insertion_loci, nrow(output_data))
  
  for(k in 1:length(files_to_open)){
    current_file <- read.csv(files_to_open[k])
    current_file_ordered <- current_file[order(current_file$bitscore),]
    
    pneumo_rows <- pneumo_finder(current_file)
    
    
    if(length(pneumo_rows) == 0){
      output_data[k,3:5] <- 0
    }else{
      ## weighted bitscores first ##
      bit_score_tot <- sum(current_file$bitscore)
      pneumo_bits <- sum(current_file$bitscore[pneumo_rows])
      output_data[k, 3] <- pneumo_bits / bit_score_tot
      
      ## ranked pneumo second ##
      
      ranking <- (nrow(current_file) - pneumo_rows[1] + 1) / nrow(current_file)
      output_data[k, 4] <- ranking
      
      ## bit_top_pnuemo ##
      
      rank_pneumo_bit <- current_file$bitscore[pneumo_rows[1]]
      top_bit <- current_file$bitscore[1] 
      output_data[k, 5] <- rank_pneumo_bit / top_bit
      
      
      
      
      
      
    }
    
  }
  
  return(output_data)
  
  
  
}


pneumo_finder <- function(data_set){
  
  #############################################################################
  ## This function finds the position of pneumo isolates in the blast results #
  #############################################################################
  
  pneumo_rows <- NULL
  for(k in 1:nrow(data_set)){
    pneumo_grep <- grepl("strep_pneumo", data_set[k,2])
    gpsc_grep <- grepl("GPSC", data_set[k, 2])
    
    if(pneumo_grep == TRUE | gpsc_grep == TRUE){
      pneumo_rows <- c(pneumo_rows, k)
    }
  }
  
  return(pneumo_rows)
}

series_display <- function(list_of_results, prefix, flanks_vector, cluster, region = "total", 
                           reference = TRUE, sd = FALSE){
  
  graph_df <- data.frame(data = matrix(NA, ncol = 9, nrow = (length(list_of_results) * 2)))
  colnames(graph_df) <- c("weighted_bit","ranked_pneumo","bit_pneumo","cluster","flank",
                          "region","lqr","uqr", "sd_3")
  
  graph_df2 <- NULL
  
  ## for references flank has to be repped 6 times before input. 
  graph_df$flank <- rep(flanks_vector, each = 2)
  graph_df$cluster <- rep(cluster, nrow(graph_df))
  new_regions <- paste(region, "Control")
  region <- paste(cluster, region)
  regionz <- rep("ND", length(region)*2)
  for(i in 1:((length(region) * 2))){
    if((i %% 2) == 0)
      regionz[i] <- new_regions[(i/2)]
    else{
      i_val <- as.integer(((i/2) + 0.5))
      regionz[i] <- region[i_val]
    }
    
  }
  
  region <- c(region, new_regions)
  graph_df$region <- rep(regionz, (nrow(graph_df) / length(region)))
  
  k_vec <- seq(1,(length(list_of_results)*2), by = 2)
  
  for(k in 1:length(list_of_results)){
    current_k <- k_vec[k]
    current_res <- list_of_results[[k]]
    if(cluster == "total"){

      data_set <- current_res$total
      data_set2 <- current_res$by_insertion
      ref_ids <- grep("!", data_set$isolate)
      ref_ids2 <- grep("!", data_set2$isolate)
      
      
      graph_df$weighted_bit[current_k] <- mean(data_set$weighted_bit[-ref_ids])
      graph_df$lqr[current_k] <- quantile(data_set$bit_pneumo[-ref_ids], probs = 0.05)
      graph_df$ranked_pneumo[current_k] <- mean(data_set$ranked_pneumo[-ref_ids])
      graph_df$uqr[current_k] <- quantile(data_set$bit_pneumo[-ref_ids], probs = 0.95)
      graph_df$bit_pneumo[current_k] <- median(data_set$bit_pneumo[-ref_ids])
      graph_df$sd_3[current_k] <- stats::sd(data_set$bit_pneumo[-ref_ids])
      
      graph_df$weighted_bit[current_k + 1] <- mean(data_set$weighted_bit[ref_ids])
      graph_df$lqr[current_k + 1] <- quantile(data_set$bit_pneumo[ref_ids], probs = 0.05)
      graph_df$ranked_pneumo[current_k+ 1] <- mean(data_set$ranked_pneumo[ref_ids])
      graph_df$uqr[current_k+ 1] <- quantile(data_set$bit_pneumo[ref_ids], probs = 0.95)
      graph_df$bit_pneumo[current_k+ 1] <- median(data_set$bit_pneumo[ref_ids])
      graph_df$sd_3[current_k+ 1] <- stats::sd(data_set$bit_pneumo[ref_ids])
      
      graphing_rows <- data_set2[,c("isolate","bit_pneumo", "insertion_loci")]
      graphing_rows$control <- "Actual"
      graphing_rows$control[ref_ids2] <- "Control"
      graph_df2 <- bind_rows(graph_df2, graphing_rows)
      
      
    }else{

      reference_db <-  current_res$total
      ref_ids <- grep("!", reference_db$isolate)
      reference_db <-  reference_db[ref_ids,]
      
      
      data_set <- current_res$by_insertion[current_res$by_insertion$insertion_loci == cluster,]
      ref_isolates <- paste("!", data_set$isolate, sep = "")
      ref_matches <- unique(grep(paste(ref_isolates, collapse = "|"), reference_db$isolate))
      
      reference_db <- reference_db[ref_matches,]
      reference_db$insertion_loci <- cluster
      data_set2 <- bind_rows(data_set, reference_db)
      ref_ids2 <- grep("!",data_set2$isolate)
#      browser()
      graph_df$weighted_bit[current_k] <- mean(data_set$weighted_bit)
      graph_df$lqr[current_k] <- quantile(data_set$bit_pneumo, probs = 0.25)[1]
      graph_df$ranked_pneumo[current_k] <- mean(data_set$ranked_pneumo)
      graph_df$uqr[current_k] <- quantile(data_set$bit_pneumo, probs = 0.75)[1]
      graph_df$bit_pneumo[current_k] <- median(data_set$bit_pneumo)
      graph_df$sd_3[current_k] <- stats::sd(data_set$bit_pneumo)
      
      graph_df$weighted_bit[current_k + 1] <- mean(reference_db$weighted_bit)
      graph_df$lqr[current_k + 1] <- quantile(reference_db$bit_pneumo, probs = 0.25)
      graph_df$ranked_pneumo[current_k+ 1] <- mean(reference_db$ranked_pneumo)
      graph_df$uqr[current_k+ 1] <- quantile(reference_db$bit_pneumo, probs = 0.75)
      graph_df$bit_pneumo[current_k+ 1] <- median(reference_db$bit_pneumo)
      graph_df$sd_3[current_k+ 1] <- stats::sd(reference_db$bit_pneumo)
      
      
      graphing_rows <- data_set2[,c("isolate","bit_pneumo", "insertion_loci")]
      graphing_rows$control <- "Actual"
      graphing_rows$control[ref_ids2] <- "Control"
      graph_df2 <- bind_rows(graph_df2, graphing_rows) 
    }
  }
  
  if(sd){
    
    compo_plot_weighted <- ggplot(data = graph_df, aes(x = flank, y = weighted_bit, colour = region)) + geom_point() +
      labs(x = "Flank length", y = "Weighted bit score", title = paste(prefix,  "weighted bit")) + 
      geom_pointrange(aes(ymin = (weighted_bit - sd_1), ymax = (weighted_bit + sd_1)))
    
    compo_plot_ranked <- ggplot(data = graph_df, aes(x = flank, y = ranked_pneumo, colour = region)) + geom_point() +
      labs(x = "Flank length", y = "ranked pneumo", title = paste(prefix,  "ranked pneumo")) + 
      geom_pointrange(aes(ymin = (ranked_pneumo - sd_2), ymax = (ranked_pneumo + sd_2)))
    
    compo_plot_pneumo <- ggplot(data = graph_df, aes(x = flank, y = bit_pneumo, colour = region)) + geom_point() +
      labs(x = "Flank length", y = "ranked pneumo", title = paste(prefix,  "bit pneumo")) + 
      geom_pointrange(aes(ymin = (bit_pneumo - sd_3), ymax = (bit_pneumo + sd_3)))
  }else{

    compo_plot_weighted <- ggplot(data = graph_df, aes(x = flank, y = weighted_bit, colour = region)) + geom_point(size = 2) +
      labs(x = "Flank length", y = "Weighted bit score") + theme(legend.position = "none")#, title = paste(prefix,  "weighted bit"))
    
    compo_plot_ranked <- ggplot(data = graph_df, aes(x = flank, y = ranked_pneumo, colour = region)) + geom_point(size = 2) +
      labs(x = "Flank length", y = "ranked pneumo") + theme(legend.position = "none")#, title = paste(prefix,  "ranked pneumo")) 
    
    compo_plot_pneumo <- ggplot(data = graph_df, aes(x = flank, y = bit_pneumo, colour = region)) + geom_point(size = 2) +
      geom_line() + geom_ribbon(aes(ymin = lqr, ymax = uqr, fill = region), colour = NA, alpha = 0.25) +
      labs(x = "Flank length", y = "Score") + theme(legend.position = "none") 
      
  }
  
  return(list(graph_data = graph_df, weighted_plot = compo_plot_weighted,
              ranked_plot = compo_plot_ranked, pneumo_plot = compo_plot_pneumo, total_df = graph_df2))
  
  
}

folder_to_res <- function(results_folder, graph_name, insert_name){
  ## Function to take in the output from the flanks_only_search and output the over flanks graphs 
  ## Results folder should be the main run folder, flanks veccy the main 
  
  folders <- list.files(results_folder, full.names = TRUE, include.dirs = TRUE)
  folders <- sub("//","/",folders)
  whole_blast_res <- folders[grep("_blast_results", folders)]
  before_blast_res <- folders[grep("_before_flank_blast_res", folders)]
  after_blast_res <- folders[grep("_after_flank_blast_res", folders)]
  
  ## whole blast res 
  tic("Running whole flanks sum up")
  pmen3_list <- list()
  flanks_veccy <- as.integer(sub("_blast_results","",basename(whole_blast_res)))
  current_iter <- 1
  for(k in whole_blast_res){
    #next
    current_graphed_res <- graph_getter(k, graph_name, flanks_veccy[current_iter], insert_name)
    pmen3_list[[length(pmen3_list) + 1]] <- current_graphed_res
    current_iter <- current_iter + 1
  }

  pmen3_mega_total <- series_display(pmen3_list, graph_name, flanks_veccy, insert_name, region = "whole")
  toc()
  ## before blast res 
  tic("Before flanks sum up")
  pmen3_list_bef <- list()
  flanks_veccy <- as.integer(sub("_before_flank_blast_res","",basename(before_blast_res)))
  current_iter <- 1
  for(k in before_blast_res){
    #next
    
    current_graphed_res <- graph_getter(k, graph_name, flanks_veccy[current_iter], insert_name)
    pmen3_list_bef[[length(pmen3_list_bef) + 1]] <- current_graphed_res
    current_iter <- current_iter + 1
  }
  
  before_name <- paste(graph_name, "before flanks")
  pmen3_mega_before <- series_display(pmen3_list_bef, before_name, flanks_veccy, insert_name, region = "before")
  toc()
  
  ## after blast res 
  tic("After flanks sum up")
  pmen3_list_aft <- list()
  flanks_veccy <- as.integer(sub("_after_flank_blast_res","",basename(after_blast_res)))
  current_iter <- 1
  for(k in after_blast_res){
    
    current_graphed_res <- graph_getter(k, graph_name, flanks_veccy[current_iter], insert_name)
    pmen3_list_aft[[length(pmen3_list_aft) + 1]] <- current_graphed_res
    current_iter <- current_iter + 1
  }
  
  after_name <- paste(graph_name, "after flanks")
  pmen3_mega_after <- series_display(pmen3_list_aft, after_name, flanks_veccy, insert_name, region = "after")
  toc()
  return(list(whole_res = pmen3_mega_total, before_res = pmen3_mega_before, after_res = pmen3_mega_after))
  
}

parse_args <- function(){
  input_args <- commandArgs(trailingOnly = TRUE)
  
  if(length(input_args) != 3){
    cat("Not enough arguments, need 3, you have:", length(input_args), "\n")
    cat("\n")
    cat("Usage: Rscript --vanilla ./gaph_results.R <Graph name> <folder_of_flank_res> <pdf_out_file>")
    cat("\n")
    cat("\n")
    stop("Not enough input files")
  }
  
  return(input_args)
}

###############################################################################
## system arguments: 1: Graph name 
##                   2: Directory of blast results 
##                   3: pdf out 


input_args <- parse_args()
input_args <- NULL

graph_name <- input_args[1]
graph_name <- "GPS MEGA"

input_args[2] <- "~/Dropbox/phd/insertion_site_analysis/data/gps_run_data/mega_res/gps_mega_run4/"

pmen_mega_res_total <- folder_to_res(results_folder = input_args[2],
                               graph_name = graph_name,
                               insert_name = "total")

## Get some info out from the gamma scores 
mega_whole_res <- pmen_mega_res_total$whole_res$total_df

## MGE insert sites 
inserts_whole <- mega_whole_res[mega_whole_res$control == "Actual",]
mean(inserts_whole$bit_pneumo)
reference_whole <- mega_whole_res[mega_whole_res$control == "Control",]
mean(reference_whole$bit_pneumo)

wilcox.test(inserts_whole$bit_pneumo, reference_whole$bit_pneumo)
t.test(inserts_whole$bit_pneumo, reference_whole$bit_pneumo)

ggplot(data = mega_whole_res, aes(x = control, y = bit_pneumo, fill = control, colour = control)) +
  geom_violin()

whole_df <- bind_rows(pmen_mega_res_total$before_res$total_df, pmen_mega_res_total$after_res$total_df)

whole_df_insert <- whole_df[whole_df$control == "Actual",]
whole_df_ref <- whole_df[whole_df$control == "Control",]
wilcox.test(whole_df_insert$bit_pneumo, whole_df_ref$bit_pneumo, conf.int = TRUE)
median(whole_df_insert$bit_pneumo)
median(whole_df_ref$bit_pneumo)

mann_u_output <- wmwTest(whole_df_insert$bit_pneumo, whole_df_ref$bit_pneumo)


boxplot_whole <- ggplot(data = whole_df,aes(x = control, y = bit_pneumo)) + geom_boxplot() 
boxplot_whole

wilcox.test()

whole_df <- whole_df %>% mutate(control = ifelse(control == "Actual", "Tag","Control"))
whole_df$control <- factor(whole_df$control, levels = c("Tag","Control"))

violin_plot_whole <- ggplot(data = whole_df, aes(x = control, y = bit_pneumo)) + geom_violin(aes(fill = control)) +
  labs(x = "Isolate", y = "Score") + scale_fill_discrete(breaks = c("Actual","Control")) + 
  guides(fill = guide_legend(title = "Isolate"))

violin_plot_whole

whole_plot <- pmen_mega_res$whole_res$pneumo_plot

###############################################################################
## Run it through for Tn916 ###################################################
###############################################################################

pmen_tn916_res_total <- folder_to_res(results_folder = "~/Dropbox/phd/insertion_site_analysis/data/gps_run_data/tn916_res/gps_tn916_run_free/",
                                     graph_name = graph_name,
                                     insert_name = "total")

gps_tn916_total <- pmen_tn916_res_total$whole_res$total_df
inserts_whole <- gps_tn916_total[gps_tn916_total$control == "Actual",]
mean(inserts_whole$bit_pneumo)
reference_whole <- gps_tn916_total[gps_tn916_total$control == "Control",]
mean(reference_whole$bit_pneumo)

wilcox.test(inserts_whole$bit_pneumo, reference_whole$bit_pneumo)

t.test(inserts_whole$bit_pneumo, reference_whole$bit_pneumo)


whole_df <- bind_rows(pmen_tn916_res_total$before_res$total_df, pmen_tn916_res_total$after_res$total_df)

whole_df_insert <- whole_df[whole_df$control == "Actual",]
whole_df_ref <- whole_df[whole_df$control == "Control",]

## Lets pair it up 
pairing_df <- whole_df %>% mutate(iso = ifelse(grepl("!",isolate),sub(".*!","",isolate), isolate))

wilcox.test(whole_df_insert$bit_pneumo, whole_df_ref$bit_pneumo, conf.int = TRUE)
median(whole_df_insert$bit_pneumo)
median(whole_df_ref$bit_pneumo)

boxplot_whole <- ggplot(data = whole_df,aes(x = control, y = bit_pneumo)) + geom_boxplot() 
boxplot_whole

wilcox.test()

whole_df <- whole_df %>% mutate(control = ifelse(control == "Actual", "Tag","Control"))
whole_df$control <- factor(whole_df$control, levels = c("Tag","Control"))

violin_plot_whole <- ggplot(data = whole_df, aes(x = control, y = bit_pneumo)) + geom_violin(aes(fill = control)) +
  labs(x = "Isolate", y = "Score") + scale_fill_discrete(breaks = c("Actual","Control")) + 
  guides(fill = guide_legend(title = "Isolate"))

violin_plot_whole

whole_plot <- pmen_mega_res$whole_res$pneumo_plot

###############################################################################

pmen_mega_res_tag <- folder_to_res(results_folder = input_args[2],
                                     graph_name = "GPS Tn1207.1",
                                     insert_name = "28")
## Get some info on the insertions 
## Look into the top species hits for tag at 500bp upstream. 


pmen_mega_res_tag$whole_res$graph_data
whole_df <- bind_rows(pmen_mega_res_tag$before_res$total_df, pmen_mega_res_tag$after_res$total_df)
whole_df <- whole_df %>% mutate(control = ifelse(control == "Actual", "Tag","Control"))
whole_df$control <- factor(whole_df$control, levels = c("Tag","Control"))

violin_plot_whole <- ggplot(data = whole_df, aes(x = control, y = bit_pneumo)) + geom_violin(aes(fill = control)) +
  labs(x = "Isolate", y = "Score") + scale_fill_discrete(labels = c(expression(paste(italic("tag"), " insertion",sep = "")),"Unmodified Locus")) + 
  guides(fill = guide_legend(title = "Isolate")) + geom_point(aes(x = control, y = bit_pneumo, color = control),
                                                              position = position_jitter(width = 0.45, height = 0),
                                                              size = 0.25) +
  scale_color_discrete(labels = c(expression(paste(italic("tag"), " insertion",sep = "")),"Unmodified Locus"), name = "Isolate") + 
  theme(legend.text.align = 0) +
  scale_x_discrete(labels = c(expression(paste(italic("tag"), " insertion",sep = "")),"Unmodified Locus")) +
  theme_bw()

violin_plot_whole

#whole_plot <- pmen_mega_res$whole_res$pneumo_plot
whole_plot_figshare <- whole_df[,c(1,2,4)]
colnames(whole_plot_figshare) <- c("Isolate","gamma","Insertion")

write.csv(whole_plot_figshare, file = "~/Dropbox/phd/elife_paper/figures_data/Figure10_species_over_flank_length/GPS_tag_and_control_isolates",
          row.names = FALSE)

before_flanks <- pmen_mega_res_tag$before_res$graph_data %>% mutate(region = sub("before","upstream", region)) %>%
  select(c(bit_pneumo,flank, region, lqr, uqr)) %>% rename(gamma = bit_pneumo) %>%
  mutate(region = sub("28", "tag",region))

after_flanks <- pmen_mega_res_tag$after_res$graph_data %>% mutate(region = sub("after","downstream", region)) %>%
  select(c(bit_pneumo,flank, region, lqr, uqr)) %>% rename(gamma = bit_pneumo) %>%
  mutate(region = sub("28", "tag",region))


write.csv(before_flanks, file = "~/Dropbox/phd/elife_paper/figures_data/Figure10_species_over_flank_length/GPS_tag_upstream_flanks_summary.csv",
          row.names = FALSE)
write.csv(after_flanks, file = "~/Dropbox/phd/elife_paper/figures_data/Figure10_species_over_flank_length/GPS_tag_downstream_flanks_summary.csv",
          row.names = FALSE)


mean_40_60 <- ggdraw()  +
  draw_plot(pmen_mega_res_tag$before_res$pneumo_plot + labs(x = NULL) + theme_bw() + theme(legend.position = "none"), x = 0, y = 0.5, width = .4, height = .5) +
  draw_plot(pmen_mega_res_tag$after_res$pneumo_plot  + labs(x = "Flank Length (bp)")  + theme_bw() + theme(legend.position = "none"), x = 0, y = 0, width = 0.4, height = 0.5) +
  draw_plot(violin_plot_whole + theme(legend.text.align = 0), x = 0.4, y = 0, width = 0.6, height = 1) +
  draw_plot_label(label = c("A", "B", "C"), size = 15,
                  x = c(0, 0, 0.4), y = c(0.95, 0.45, 0.95)) 

mean_40_60
  mean_95 <- ggdraw()  +
  draw_plot(pmen_mega_res$whole_res$pneumo_plot, x = 0, y = 0.6, width = .5, height = .3) +
  draw_plot(pmen_mega_res$before_res$pneumo_plot, x = 0, y = 0.3, width = .5, height = .3) +
  draw_plot(pmen_mega_res$after_res$pneumo_plot, x = 0, y = 0, width = 0.5, height = 0.3) +
  draw_plot(violin_plot_whole, x = 0.5, y = 0, width = 0.5, height = 0.9) +
  draw_plot_label(label = c("A", "B", "C", "D"), size = 15,
                  x = c(0, 0, 0, 0.5), y = c(0.9, 0.6, 0.3, 0.9)) + draw_plot_label("Mean score, 95% range", size = 15,
                                                                                    x = 0.2, y = 1)

mean_95

median_iqr <- ggdraw()  +
  draw_plot(pmen_mega_res$whole_res$pneumo_plot, x = 0, y = 0.6, width = .5, height = .3) +
  draw_plot(pmen_mega_res$before_res$pneumo_plot, x = 0, y = 0.3, width = .5, height = .3) +
  draw_plot(pmen_mega_res$after_res$pneumo_plot, x = 0, y = 0, width = 0.5, height = 0.3) +
  draw_plot(violin_plot_whole, x = 0.5, y = 0, width = 0.5, height = 0.9) +
  draw_plot_label(label = c("A", "B", "C", "D"), size = 15,
                  x = c(0, 0, 0, 0.5), y = c(0.9, 0.6, 0.3, 0.9)) + draw_plot_label("Median score, IQR range", size = 15,
                                                                                    x = 0.2, y = 1)

median_iqr

median_95 <- ggdraw()  +
  draw_plot(pmen_mega_res$whole_res$pneumo_plot, x = 0, y = 0.6, width = .5, height = .3) +
  draw_plot(pmen_mega_res$before_res$pneumo_plot, x = 0, y = 0.3, width = .5, height = .3) +
  draw_plot(pmen_mega_res$after_res$pneumo_plot, x = 0, y = 0, width = 0.5, height = 0.3) +
  draw_plot(violin_plot_whole, x = 0.5, y = 0, width = 0.5, height = 0.9) +
  draw_plot_label(label = c("A", "B", "C", "D"), size = 15,
                  x = c(0, 0, 0, 0.5), y = c(0.9, 0.6, 0.3, 0.9)) + draw_plot_label("Median score, 95% range", size = 15,
                                                                                    x = 0.2, y = 1)

median_95

pdf(file = "~/Dropbox/phd/LSA/figure_4_options.pdf", paper = "a4r", width = 12, height = 7)

print(mean_iqr)

print(mean_95)

print(median_iqr)

print(median_95)

dev.off()




mean_95 <- plot_grid(draw_label("Mean score, 95% range") , mean_95, ncol = 1, rel_heights = c(0.1,1))


median_95 <- ggdraw() +
   draw_plot(pmen_mega_res$whole_res$pneumo_plot, x = 0, y = 0.67, width = .5, height = .33) +
   draw_plot(pmen_mega_res$before_res$pneumo_plot, x = 0, y = 0.34, width = .5, height = .33) +
   draw_plot(pmen_mega_res$after_res$pneumo_plot, x = 0, y = 0, width = 0.5, height = 0.33) +
   draw_plot(violin_plot_whole, x = 0.5, y = 0, width = 0.5, height = 1) +
   draw_plot_label(label = c("A", "B", "C", "D"), size = 15,
                   x = c(0, 0, 0, 0.5), y = c(1, 0.67, 0.33, 1))

pdf(file = input_args[3], paper = "a4r", width = 12, height = 7)

print(pmen_mega_res$whole_res$pneumo_plot)

print(pmen_mega_res$before_res$pneumo_plot)

print(pmen_mega_res$after_res$pneumo_plot)


dev.off()



pmen_mega_res_tag <- folder_to_res(results_folder = input_args[2],
                                   graph_name = "GPS Tn1207.1",
                                   insert_name = "28")


tag_res <- graph_getter(dir_to_files = "~/Dropbox/phd/insertion_site_analysis/data/gps_run_data/mega_res/gps_mega_run4/500_before_flank_blast_res/",
                        graph_name = "before_tag", insert_name = "28",current_flanks = "500")


#######################################################################################
## Re run for the updated pipeline ####################################################
#######################################################################################

gps_mega_updated <- folder_to_res(results_folder = "~/Dropbox/phd/insertion_site_analysis/data/gps_run_data/mega_res/gps_mega_21_4_2021/",
                                     graph_name = graph_name,
                                     insert_name = "total")

## Get some info out from the gamma scores 
whole_df <- bind_rows(gps_mega_updated$before_res$total_df, gps_mega_updated$after_res$total_df)

whole_df_insert <- whole_df[whole_df$control == "Actual",]
whole_df_ref <- whole_df[whole_df$control == "Control",]
wilcox.test(whole_df_insert$bit_pneumo, whole_df_ref$bit_pneumo, conf.int = TRUE)
median(whole_df_insert$bit_pneumo)
median(whole_df_ref$bit_pneumo)

mann_u_output <- wmwTest(whole_df_insert$bit_pneumo, whole_df_ref$bit_pneumo)

## Now for the tag inserts

gps_mega_res_tag <- folder_to_res(results_folder = "~/Dropbox/phd/insertion_site_analysis/data/gps_run_data/mega_res/gps_mega_21_4_2021/",
                                   graph_name = "GPS Tn1207.1",
                                   insert_name = "28")
## Get some info on the insertions 
## Look into the top species hits for tag at 500bp upstream. 


gps_mega_res_tag$whole_res$graph_data
whole_df <- bind_rows(gps_mega_res_tag$before_res$total_df, gps_mega_res_tag$after_res$total_df)
whole_df <- whole_df %>% mutate(control = ifelse(control == "Actual", "Tag","Control"))
whole_df$control <- factor(whole_df$control, levels = c("Tag","Control"))

violin_plot_whole <- ggplot(data = whole_df, aes(x = control, y = bit_pneumo)) + geom_violin(aes(fill = control)) +
  labs(x = "Isolate", y = "Score") + scale_fill_discrete(labels = c(expression(paste(italic("tag"), " insertion",sep = "")),"Unmodified Locus")) + 
  guides(fill = guide_legend(title = "Isolate")) + geom_point(aes(x = control, y = bit_pneumo, color = control),
                                                              position = position_jitter(width = 0.45, height = 0),
                                                              size = 0.25) +
  scale_color_discrete(labels = c(expression(paste(italic("tag"), " insertion",sep = "")),"Unmodified Locus"), name = "Isolate") + 
  theme(legend.text.align = 0) +
  scale_x_discrete(labels = c(expression(paste(italic("tag"), " insertion",sep = "")),"Unmodified Locus")) +
  theme_bw()

violin_plot_whole

#whole_plot <- pmen_mega_res$whole_res$pneumo_plot
whole_plot_figshare <- whole_df[,c(1,2,4)]
colnames(whole_plot_figshare) <- c("Isolate","gamma","Insertion")

before_flanks <- pmen_mega_res_tag$before_res$graph_data %>% mutate(region = sub("before","upstream", region)) %>%
  select(c(bit_pneumo,flank, region, lqr, uqr)) %>% rename(gamma = bit_pneumo) %>%
  mutate(region = sub("28", "tag",region))

after_flanks <- pmen_mega_res_tag$after_res$graph_data %>% mutate(region = sub("after","downstream", region)) %>%
  select(c(bit_pneumo,flank, region, lqr, uqr)) %>% rename(gamma = bit_pneumo) %>%
  mutate(region = sub("28", "tag",region))


mean_40_60 <- ggdraw()  +
  draw_plot(pmen_mega_res_tag$before_res$pneumo_plot + labs(x = NULL) + theme_bw() + theme(legend.position = "none"), x = 0, y = 0.5, width = .4, height = .5) +
  draw_plot(pmen_mega_res_tag$after_res$pneumo_plot  + labs(x = "Flank Length (bp)")  + theme_bw() + theme(legend.position = "none"), x = 0, y = 0, width = 0.4, height = 0.5) +
  draw_plot(violin_plot_whole + theme(legend.text.align = 0), x = 0.4, y = 0, width = 0.6, height = 1) +
  draw_plot_label(label = c("A", "B", "C"), size = 15,
                  x = c(0, 0, 0.4), y = c(0.95, 0.45, 0.95)) 



######################################
## Tn916 #############################
######################################

pmen_tn916_res_total <- folder_to_res(results_folder = "~/Dropbox/phd/insertion_site_analysis/data/gps_run_data/tn916_res/gps_tn916_run_free/",
                                      graph_name = graph_name,
                                      insert_name = "total")


whole_df <- bind_rows(pmen_tn916_res_total$before_res$total_df, pmen_tn916_res_total$after_res$total_df)

whole_df_insert <- whole_df[whole_df$control == "Actual",]
whole_df_ref <- whole_df[whole_df$control == "Control",]

## Lets pair it up 
pairing_df <- whole_df %>% mutate(iso = ifelse(grepl("!",isolate),sub(".*!","",isolate), isolate))

wilcox.test(whole_df_insert$bit_pneumo, whole_df_ref$bit_pneumo, conf.int = TRUE)
median(whole_df_insert$bit_pneumo)
median(whole_df_ref$bit_pneumo)





back to top