Revision 469689991e7dbce2be4c4f618584304d91841c49 authored by Chase W. Nelson on 01 October 2020, 17:39:04 UTC, committed by Chase W. Nelson on 01 October 2020, 17:39:04 UTC
1 parent 0b9f8cb
Raw File
riboseq_sliding_window.R
##############################################################################################################
### Ribo-seq sliding windows

# Mac
suppressMessages(library(package = readr))
suppressMessages(library(package = stringr))
suppressMessages(library(package = dplyr))
suppressMessages(library(package = tidyr))


############################################################################################################
############################################################################################################
### GATHER PARAMETERS FROM COMMAND LINE
ARGV <- commandArgs(trailingOnly = T)

kill_script <- FALSE

if(! (length(ARGV) == 4)) {
  kill_script <- TRUE
} else if(! str_detect(string = ARGV[1], pattern = "\\w")) {
  kill_script <- TRUE
} else if(! str_detect(string = ARGV[2], pattern = "\\w")) {
  kill_script <- TRUE
} else if(! str_detect(string = ARGV[3], pattern = "\\d")) {
  kill_script <- TRUE
} else if(! str_detect(string = ARGV[4], pattern = "\\d")) {
  kill_script <- TRUE
} # could add more conditions later

if(kill_script) {
  cat("\n\n### WARNING: there must be 4 command line arguments, in this order:\n")
  cat("    (1) INFILE_FRAMES: file with condition metadata for samples\n")
  cat("    (2) INFILE_READS: file with read data\n")
  cat("    (3) WIN_SIZE: size of the sliding window\n")
  cat("    (4) READ_LEN: length of reads to consider\n\n")
  quit(save = 'no', status = 1, runLast = TRUE)
}

### PARAMETERS ###
INFILE_FRAMES <- as.character(ARGV[1])
INFILE_READS <- as.character(ARGV[2])
WIN_SIZE <- as.integer(ARGV[3]) 
READ_LEN <- as.integer(ARGV[4]) 
##################

# Produce some helpful warning messages
if(! (WIN_SIZE >= 1 && WIN_SIZE <= 1000)) {
  cat("### WARNING: WIN_SIZE must be in the range [1,1000]. Using: 29.\n")
  WIN_SIZE <- 29
} else {
  
}

if(! (READ_LEN >= 1 && READ_LEN <= 100)) {
  cat("### WARNING: READ_LEN must be in the range [1,100]. Using: 29.\n")
  READ_LEN <- 29
}

# Product output file name
OUTFILE_NAME <- str_replace(string = INFILE_FRAMES, pattern = "\\w+.\\w+$", replacement = "")
OUTFILE_NAME <- paste0(OUTFILE_NAME, "mapped_reads_by_readlength_sumSamples_win", WIN_SIZE, "read", READ_LEN, ".tsv")

cat("\n\nOUTPUT will be written to: ", OUTFILE_NAME, "\n")


##############################################################################################################
### INPUT FILES
## File 1
suppressMessages(frames_table <- read_tsv(INFILE_FRAMES,
                                          col_names = c("sample", "condition", "time", "treatment")))

## File 2
suppressMessages(mapped_reads_by_readlength <- read_tsv(INFILE_READS,
                                       col_names = c("sample", "read_length", "chromosome", "position", "frame", "read_count")))

# Convert to 1-based
mapped_reads_by_readlength$position <- mapped_reads_by_readlength$position + 1


##############################################################################################################
### JOIN sample data

# construct sample metadata translator
sample_metadata_joiner <- dplyr::select(frames_table, sample, condition)

# Extract time
sample_metadata_joiner$time <- sample_metadata_joiner$condition
sample_metadata_joiner$time <- str_replace(string = sample_metadata_joiner$time,
                                            pattern = "\\w+(\\d+)hr", replacement = "\\1")
sample_metadata_joiner$time <- factor(sample_metadata_joiner$time,
                                       levels = c("5", "4"),
                                       labels = c("5 hpi", "24 hpi"))

# Extract treatment
sample_metadata_joiner$treatment <- as.character(sample_metadata_joiner$condition)
sample_metadata_joiner[str_detect(string = sample_metadata_joiner$treatment, "chx"), ]$treatment <- "CHX"
sample_metadata_joiner[str_detect(string = sample_metadata_joiner$treatment, "harr"), ]$treatment <- "Harr"
sample_metadata_joiner[str_detect(string = sample_metadata_joiner$treatment, "ltm"), ]$treatment <- "LTM"
sample_metadata_joiner[str_detect(string = sample_metadata_joiner$treatment, "mrna"), ]$treatment <- "mRNA"

# unique only
sample_metadata_joiner 
sample_metadata_joiner <- unique(sample_metadata_joiner)

# Join
mapped_reads_by_readlength <- left_join(x = mapped_reads_by_readlength, y = sample_metadata_joiner, by = "sample")

# Another condition option
mapped_reads_by_readlength$condition_combn <- paste0(mapped_reads_by_readlength$treatment, " / ", mapped_reads_by_readlength$time)
mapped_reads_by_readlength$condition_combn <- factor(mapped_reads_by_readlength$condition_combn,
                                                     levels = c("CHX / 5 hpi", "CHX / 24 hpi",
                                                                "Harr / 5 hpi", "Harr / 24 hpi",
                                                                "LTM / 5 hpi", "LTM / 24 hpi",
                                                                "mRNA / 5 hpi", "mRNA / 24 hpi")) # each of these has two samples

# Factor
mapped_reads_by_readlength$frame <- factor(mapped_reads_by_readlength$frame,
                                           levels = c(0, 1, 2),
                                           labels = c('Codon pos 1', 'Codon pos 2', 'Codon pos 3'))


####################################################################################################
### SLIDING WINDOWS prop frame analysis, pooling the samples for a given condition

# Summarize read counts by condition, position, frame (pool samples)
mapped_reads_by_readlength_sumSamples <- filter(mapped_reads_by_readlength, read_length == READ_LEN) %>%
    group_by(condition_combn, position, frame) %>%
    summarise(
      read_count_sumSamples = sum(read_count)
    )

### PERFORM SLIDING WINDOWS,  summing each window's total reads for each codon position

# Prepare new columns
mapped_reads_by_readlength_sumSamples$window_sum_codon_pos_1 <- NA
mapped_reads_by_readlength_sumSamples$window_sum_codon_pos_2 <- NA
mapped_reads_by_readlength_sumSamples$window_sum_codon_pos_3 <- NA

### TAKES FOREVER ###
# specific to read lengths specified above # TODO: vectorize the approach
for(this_condition in unique(mapped_reads_by_readlength_sumSamples$condition_combn)) {
  #this_condition <- "CHX / 5 hpi"
  condition_data <- mapped_reads_by_readlength_sumSamples[mapped_reads_by_readlength_sumSamples$condition_combn == this_condition, ]
  
  for(i in sort(unique(condition_data$position))) { # no genes at tail end of genome, so we're good
    #i <- 1
    
    # Extract window; analyze
    window_data <- condition_data[condition_data$position >= i & condition_data$position <= (i + WIN_SIZE - 1), ]
    
    ### Add to data frame
    # Codon pos 1
    mapped_reads_by_readlength_sumSamples[mapped_reads_by_readlength_sumSamples$condition_combn == this_condition &
                                            mapped_reads_by_readlength_sumSamples$position == i, ]$window_sum_codon_pos_1 <- 
      sum(window_data[window_data$frame == "Codon pos 1", ]$read_count_sumSamples)
    
    # Codon pos 2
    mapped_reads_by_readlength_sumSamples[mapped_reads_by_readlength_sumSamples$condition_combn == this_condition &
                                            mapped_reads_by_readlength_sumSamples$position == i, ]$window_sum_codon_pos_2 <- 
      sum(window_data[window_data$frame == "Codon pos 2", ]$read_count_sumSamples)
    
    # Codon pos 3
    mapped_reads_by_readlength_sumSamples[mapped_reads_by_readlength_sumSamples$condition_combn == this_condition &
                                            mapped_reads_by_readlength_sumSamples$position == i, ]$window_sum_codon_pos_3 <- 
      sum(window_data[window_data$frame == "Codon pos 3", ]$read_count_sumSamples)
    
  } # end last window
} # end last condition_combn

#View(mapped_reads_by_readlength_sumSamples)

### SAVE ###
write_tsv(mapped_reads_by_readlength_sumSamples, OUTFILE_NAME)


back to top