https://github.com/emsanford/combined_responses_paper
Revision 6d8d1e6f9e3e660f8acd4e748b5a80b3bb6c6d7c authored by emsanford on 03 January 2021, 23:20:22 UTC, committed by GitHub on 03 January 2021, 23:20:22 UTC
1 parent e25f3d9
Raw File
Tip revision: 6d8d1e6f9e3e660f8acd4e748b5a80b3bb6c6d7c authored by emsanford on 03 January 2021, 23:20:22 UTC
Update README.md
Tip revision: 6d8d1e6
visualizeNearbyDoseChangesATACseq_v3.R
library(tidyverse)
library(GenomicRanges)
library(here)
library(gtable)
library(grid)
library(gridExtra)


## v3 adds TGFb to the gene expression plots and to the chromatin accessibility track
consistent.additive.gene.set <- c("ENSG00000013588", "ENSG00000069764", "ENSG00000100918", "ENSG00000116871", 
                                  "ENSG00000123364", "ENSG00000127561", "ENSG00000150676", "ENSG00000159840", 
                                  "ENSG00000162552", "ENSG00000162949", "ENSG00000163932", "ENSG00000164778", 
                                  "ENSG00000166780", "ENSG00000186594", "ENSG00000188488", "ENSG00000198959", 
                                  "ENSG00000205869", "ENSG00000213977", "ENSG00000249859", "ENSG00000283526")
consistent.multiplicative.gene.set <- c("ENSG00000072195", "ENSG00000072422", "ENSG00000106683", "ENSG00000116254", "ENSG00000118263",
                                        "ENSG00000133216", "ENSG00000137166", "ENSG00000139112", "ENSG00000142453", "ENSG00000162434", 
                                        "ENSG00000162772", "ENSG00000165238", "ENSG00000168209", "ENSG00000172602", "ENSG00000176371", 
                                        "ENSG00000196526", "ENSG00000221926")

gene.set.to.run <- consistent.multiplicative.gene.set
outputfolder <- here("plots", "peaksNearGenesTrackView", "multiplicative_up")


#########
samplemetadata   <- read_tsv(here('sampleMetadata_SI2-SI4.txt'))

TSS.window.radius <- 100000  #  include peaks within this many base pairs of a gene's transcription start site
  
diffpeakstib <- read_tsv(here('extractedData', 'differentialAtacPeaks.tsv'))
deseqtib     <- read_tsv(here('extractedData', 'DeSeqOutputAllConds.tsv'))

## to do-- make this table "wide" and calc average values for each condition in order to plot nearby gene expr changes
rnaseqPipelineOut <- filter(read_tsv(here('extractedData', 'si2-si4_RNA-seq-pipeline-output-normalized.tsv')), sampleID == "01-TGFb-low")
TSS.loc.tib.extended <- rnaseqPipelineOut %>% 
  mutate(tss = TSS_loc) %>% 
  mutate(ensg = gene_id) %>%
  dplyr::select(-TSS_loc, -gene_id)  #use legacy name and add column names for quick compatibility with prev. script

genes_dose_responsive_up <- filter(deseqtib, 
                   `RA-med_isDeGene` == 1, 
                   `RA-low_log2fc` > 0, 
                   (`RA-low_avgTPM`) * 1.75 < (`RA-high_avgTPM`))$ensg
genes_dose_responsive_down <- filter(deseqtib, 
                   `RA-med_isDeGene` == 1, 
                   `RA-low_log2fc` < 0, 
                   (`RA-low_avgTPM`) > (`RA-high_avgTPM`) * 1.75)$ensg
gene_set <- genes_dose_responsive_up

# outputfolder <- here("plots", "peaksNearGenesTrackView", "dose_responsive_down")
#########



# add TSS coordinate column, which depends on strand, to TSS location table
TSS.loc.tib.extended[["tss"]] <- rep(0, nrow(TSS.loc.tib))
TSS.loc.tib.extended[["tss"]][which(TSS.loc.tib.extended$strand == "+")] <- TSS.loc.tib.extended[["tx_start"]][which(TSS.loc.tib.extended$strand == "+")]
TSS.loc.tib.extended[["tss"]][which(TSS.loc.tib.extended$strand == "-")] <- TSS.loc.tib.extended[["tx_end"]][which(TSS.loc.tib.extended$strand == "-")]


makeBeeswarmPlot <- function(geneToPlot, tibforplot, deseq.tib, ordertib) {
  
  gene.symbol <- deseq.tib$gene_name[deseq.tib$ensg == geneToPlot]
  
  tfp <- tibforplot %>% filter(ensg == geneToPlot)
  
  conds.ordered <- c("EtOH-halfDensity", "EtOH-nlDensity", "EtOH-highDensity",
                     "TGFb-low", "TGFb-med", "TGFb-high",
                     "RA-low", "RA-med", "RA-high",
                     "TGFb-and-RA-low", "TGFb-and-RA-med", "TGFb-and-RA-high")
  colors.ordered <- c("black", "black", "black",
                      "blue", "blue", "blue",
                      "red", "red", "red",
                      "purple", "purple", "purple")
  
  tfp[["condition"]] <- factor(tfp[["condition"]], levels = conds.ordered)
  
  p <- ggplot() +
    geom_point(tfp, mapping = aes(x = order, y = yaxis, color = condition)) + 
    # scale_color_grey(start=0.65, end = 0.05) + 
    scale_color_manual(factor(conds.ordered, levels = conds.ordered), 
                       values = colors.ordered, name = NULL) + 
    scale_x_continuous(breaks = ordertib$order, minor_breaks = NULL, labels = ordertib$condition) +
    xlab("") +
    ylab("") + expand_limits(y = 0) +
    theme_classic(base_size = 6) + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 
          legend.position = "none",
          axis.line.x=element_blank(),
          axis.line.y=element_blank()) 
  
  return(p)
}

makeTrackOfBeeswarmPlots <- function(nearbyGeneInfo, deseqtib, yaxis = "tpm") {
  counter <- 1
  savedplots <- list()
  plotcenters <- c()
  plotymaxs <- c()
  plotymins <- c()
  for (i in 1:nrow(nearbyGeneInfo)) {
    ensg <- nearbyGeneInfo$ensg[i]
    tssloc <- nearbyGeneInfo$tss[i]
    if (ensg %in% deseqtib$ensg) {
      if (yaxis == "tpm") {
        # yaxis.only.subset <- deseqtib %>% dplyr::select(matches("^(TGFb|RA)-(low|med|high)_avgTPM$"))
        # yaxis.only.subset[["EtOH-nlDensity_avgTPM"]] <- deseqtib[["EtOH-nlDensity_avgTPM"]]
        ind.yaxis.values <- dplyr::select(deseqtib, matches("(ensg)|(_tpm)") ) %>% dplyr::select(-"46-EtOH-nlDensity_tpm")
      } else if (yaxis == "log2fc") {
        # yaxis.only.subset <- deseqtib %>% dplyr::select(matches("^(TGFb|RA)-(low|med|high)_avgTPM$"))
        # yaxis.only.subset[["EtOH-nlDensity_avgTPM"]] <- deseqtib[["EtOH-nlDensity_avgTPM"]]
        ind.yaxis.tpm.values <- dplyr::select(deseqtib, matches("(ensg)|(_tpm)") ) %>% dplyr::select(-"46-EtOH-nlDensity_tpm")
        # this is going to be a messy block of code but should work. the column names will say "tpm" but the tpm will later be removed.
        # this will change the tpm value to a fold-change value in three stepwise operations, one for each replicate.
        # we will "mutate" the data frame to replace tpm values with foldchange values, then change all foldchange values to log fold change values.
        # the index numbers are manually taken from the samplemetadata spreadsheet, with +1 added to account for the offset due to the ensg column.
        indicesrep1 <- 1:12 + 1
        indicesrep2 <- c(13:22, 35:36) + 1
        indicesrep3 <- 23:34 + 1
        ind.yaxis.foldchange.values <- ind.yaxis.tpm.values
        ind.yaxis.foldchange.values[, indicesrep1] <- ind.yaxis.tpm.values[, indicesrep1] / pull(ind.yaxis.tpm.values, "05-EtOH-nlDensity_tpm")
        ind.yaxis.foldchange.values[, indicesrep2] <- ind.yaxis.tpm.values[, indicesrep2] / pull(ind.yaxis.tpm.values, "15-EtOH-nlDensity_tpm")
        ind.yaxis.foldchange.values[, indicesrep3] <- ind.yaxis.tpm.values[, indicesrep3] / pull(ind.yaxis.tpm.values, "27-EtOH-nlDensity_tpm")
        ind.yaxis.log2fc.values <- ind.yaxis.foldchange.values
        ind.yaxis.log2fc.values[, 1:36 + 1] <- log2(ind.yaxis.foldchange.values[, 1:36 + 1])
        ind.yaxis.values <- ind.yaxis.log2fc.values
      } else {
        stop("yaxis value in makeTrackOfBeeswarmPlots must be 'tpm' or 'log2fc'")
      }
      ind.yaxis.values.long <- gather(ind.yaxis.values, key = "sampleID", value = "yaxis", -"ensg") 
      ind.yaxis.values.long[['sampleID']] <- sapply(ind.yaxis.values.long$sampleID, function(x) strsplit(x, "_")[[1]][1])  # remove _tpm or _log2fc 
      
      tibforplot <- left_join(ind.yaxis.values.long, dplyr::select(samplemetadata, sampleID, condition), by="sampleID")
      ordertib <- tibble(condition = c("EtOH-halfDensity", "EtOH-nlDensity", "EtOH-highDensity",
                                       "TGFb-low", "TGFb-med", "TGFb-high",
                                       "RA-low", "RA-med", "RA-high", 
                                       "TGFb-and-RA-low", "TGFb-and-RA-med", "TGFb-and-RA-high"), 
                         order = 1:12)
      tibforplot <- left_join(tibforplot, ordertib, by="condition")
      tibforplot <- left_join(tibforplot, dplyr::select(samplemetadata, sampleID, replicate), by = "sampleID")
      replicate.spacing <- 0.15
      tibforplot[tibforplot[["replicate"]] == "rep1", "order"] <- tibforplot[tibforplot[["replicate"]] == "rep1", "order"] - replicate.spacing
      tibforplot[tibforplot[["replicate"]] == "rep3", "order"] <- tibforplot[tibforplot[["replicate"]] == "rep3", "order"] + replicate.spacing
      
      p.this <- makeBeeswarmPlot(ensg, tibforplot, deseqtib, ordertib)
      savedplots[[counter]] <- p.this
      
      plotcenters <- c(plotcenters, tssloc - this.tsstib$tss[1])
      builtplot <- ggplot_build(p.this)
      plotymaxs <- c(plotymaxs, builtplot$layout$panel_scales_y[[1]]$range$range[2])
      plotymins <- c(plotymins, builtplot$layout$panel_scales_y[[1]]$range$range[1])
      
      counter <- counter + 1
    }
  }
  max.yaxis.val <- max(plotymaxs)
  min.yaxis.val <- min(plotymins)
  for (i in 1:length(savedplots)) {
    savedplots[[i]] <- savedplots[[i]] + ylim(min.yaxis.val, max.yaxis.val)
  }
  
  # now mush these plots together as "annotations"
  yaxis_radius_p1 <- max.yaxis.val
  p1 <- ggplot(NULL) + 
    xlim(-1 * TSS.window.radius, TSS.window.radius) + 
    ylim(0, yaxis_radius_p1) +
    theme_classic() +
    ylab(paste0("rna-seq ", yaxis)) +
    theme(axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          axis.line.x=element_blank(),
          axis.line.y=element_blank(),
          axis.ticks.y=element_blank(),
          axis.text.y=element_blank()) 
  
  width_x_topplots <- 20000
  left_offset <- width_x_topplots * 0.085
  for (i in 1:length(savedplots)) {
    if (plotcenters[i] == 0) {
      plotmelaterindex <- i # skip the "central gene" to plot last so that it doesn't get plotted over
    } else {
      g <- ggplotGrob(savedplots[[i]])
      p1 <- p1 + annotation_custom(grob = g,
                                   xmin = plotcenters[i] - width_x_topplots/2 - left_offset,
                                   xmax = plotcenters[i] + width_x_topplots/2 - left_offset,
                                   ymin = 0,
                                   ymax = yaxis_radius_p1)
    }
  }
  # finally, plot the "central gene"
  g <- ggplotGrob(savedplots[[plotmelaterindex]])
  

  p1 <- p1 + annotation_custom(grob = g,
                               xmin = plotcenters[plotmelaterindex] - width_x_topplots/2 - left_offset,
                               xmax = plotcenters[plotmelaterindex] + width_x_topplots/2 - left_offset,
                               ymin = 0,
                               ymax = yaxis_radius_p1)
  return(p1)
} 


for (gene.to.plot in gene.set.to.run) {

  this.tsstib <- filter(TSS.loc.tib.extended, ensg == gene.to.plot)
  if (nrow(this.tsstib) == 0) {
    print(paste0("no TSS info available for ", gene.to.plot))
    next
  }
  this.hgncsymbol <- this.tsstib$gene_name[1]
  grTssWindowThisGene <- GRanges(seqnames = c(this.tsstib$chrom),
                                 ranges   = IRanges(start = this.tsstib$tss - TSS.window.radius,
                                                    end   = this.tsstib$tss + TSS.window.radius),
                                 strand   = this.tsstib$strand,
                                 gene     = this.tsstib$ensg)
  
  grTssAllGenes <- GRanges(seqnames = c(TSS.loc.tib.extended$chrom),
                           ranges   = IRanges(start = TSS.loc.tib.extended$tss,
                                              end   = TSS.loc.tib.extended$tss),
                           strand   = TSS.loc.tib.extended$strand,
                           gene     = TSS.loc.tib.extended$ensg)
  
  nearbyGeneInfo <- TSS.loc.tib.extended[subjectHits(findOverlaps(grTssWindowThisGene, grTssAllGenes, ignore.strand=TRUE)), ]
  nearbyGeneInfo$gene_name[is.na(nearbyGeneInfo$gene_name)] <- nearbyGeneInfo$ensg[is.na(nearbyGeneInfo$gene_name)]
   
  grDiffPeaks   <-  GRanges(seqnames = diffpeakstib$chrom,
                            ranges = IRanges(start = diffpeakstib$startLocs,
                                             end   = diffpeakstib$endLocs) ,
                            fragCtsControl = diffpeakstib$`EtOH-nlDensity-avgNormFragmentCounts`)
  
  TssWindowPeakOverlaps <- findOverlaps(grTssWindowThisGene, grDiffPeaks, ignore.strand=TRUE)
  nearbyPeakInfo <- diffpeakstib[subjectHits(TssWindowPeakOverlaps), ] %>% 
    mutate(peakmidpoint = (startLocs + endLocs) / 2) %>%
    mutate(relativepeakmidpoint = peakmidpoint - this.tsstib$tss[1])
  
  # Plots 0 and 1: show the gene expression changes associated to the central and neighboring genes, in tpm + log2fc 
  p0 <- makeTrackOfBeeswarmPlots(nearbyGeneInfo, deseqtib, yaxis = "log2fc")
  p1 <- makeTrackOfBeeswarmPlots(nearbyGeneInfo, deseqtib, yaxis = "tpm")
  grid.arrange(p0, p1, ncol = 1)

  # Plot 2: show the relative locations of differential peaks and nearby genes in the chosen radius from the central gene's TSS
  p2tib <- tibble(x = nearbyPeakInfo$relativepeakmidpoint, y = rep(0, nrow(nearbyPeakInfo)))
  p2 <- ggplot(p2tib, aes(x = x, y = y)) + 
    geom_line(data = tibble(x = c(-1 * TSS.window.radius, TSS.window.radius), y = c(0, 0)), mapping = aes(x=x, y=y)) +
    geom_linerange(ymin = -1, ymax = 1) +
    xlim(-1 * TSS.window.radius, TSS.window.radius) + ylim(-10, 10) + 
    geom_text(data = nearbyGeneInfo, mapping = aes(x = nearbyGeneInfo$tss - this.tsstib$tss[1], 
                                                   y = rep(-2, nrow(nearbyGeneInfo)), 
                                                   label = nearbyGeneInfo$gene_name)) +
    geom_point(data = nearbyGeneInfo, mapping = aes(x = nearbyGeneInfo$tss - this.tsstib$tss[1], 
                                                   y = rep(0, nrow(nearbyGeneInfo))), size = I(2))
  p2 <- p2 + theme_void()
  
  # Plot 3: show how the accessibility of each peak changes according to dosage or cell density
  # just use a for loop and keep adding new plots to the same graphics object
  p3 <- ggplot(data = NULL)
  for (condition.set in list(c("RA-low", "RA-med", "RA-high", "red"),
                             c("TGFb-low", "TGFb-med", "TGFb-high", "blue"),
                             c("TGFb-and-RA-low", "TGFb-and-RA-med", "TGFb-and-RA-high", "purple"),
                             c("EtOH-halfDensity", "EtOH-nlDensity", "EtOH-highDensity", "black"))) {
    for (i in 1:nrow(nearbyPeakInfo)) {
      y1 <- pull(nearbyPeakInfo, paste0(condition.set[1], "-avgNormFragmentCounts"))[i]
      y2 <- pull(nearbyPeakInfo, paste0(condition.set[2], "-avgNormFragmentCounts"))[i]
      y3 <- pull(nearbyPeakInfo, paste0(condition.set[3], "-avgNormFragmentCounts"))[i]
      condcolor <- condition.set[4]
      xbase <- pull(nearbyPeakInfo, "relativepeakmidpoint")[i]
      xbaseincrement <- 1000
      thislineplottib <- tibble(x = c(xbase - xbaseincrement, xbase, xbase + xbaseincrement),
                                y = c(y1, y2, y3))
      p3 <- p3 + geom_line(data = thislineplottib, mapping = aes(x = x, y = y), color = condcolor)
    }
  }
  p3 <- p3 + 
    ylab("avg. nl. fragment counts (atac)") +
    xlim(-1 * TSS.window.radius, TSS.window.radius) +
    theme_classic() + 
    theme(axis.title.x=element_blank(),
          axis.text.x=element_blank(),
          axis.ticks.x=element_blank(),
          axis.line.x=element_blank(),
          axis.line.y=element_blank()) +
    geom_hline(mapping=NULL, yintercept = 0, color = "gray")
  
  # Plot 4: add histograms to the bottom to display the distribution of change in accessibility: fold change, log2fc, and raw change
  condForNearbyPeaks <- 'TGFb-and-RA'
  nearbyPeakInfoEtOHAvgNormFragCts <- pull(nearbyPeakInfo, "EtOH-nlDensity-avgNormFragmentCounts")
  nearbyPeakInfoAvgNormFragCts <- select(nearbyPeakInfo, matches(paste0('^', condForNearbyPeaks, '-(low|med|high)', '-avgNormFragmentCounts')))
  nearbyPeakInfoAvgFoldChange  <- select(nearbyPeakInfo, matches(paste0('^', condForNearbyPeaks, '-(low|med|high)', '-avgFoldchange')))
  nearbyPeakInfoIsDiffPeakThisCond  <- select(nearbyPeakInfo, matches(paste0('^', condForNearbyPeaks, '-(low|med|high)', '-isDiffPeak')))
  isDiffPeakAnyDose <- c()

  for (i in 1:nrow(nearbyPeakInfo)) {
    isDiffPeakAnyDose <- c(isDiffPeakAnyDose, any(as_vector(nearbyPeakInfoIsDiffPeakThisCond[i,])))
  }
  if (nrow(nearbyPeakInfo) == 0) {
    print("no nearby peaks in any condition for")
    print(gene.to.plot)
    next()
  } else if (sum(isDiffPeakAnyDose) == 0) {
    print("no nearby peaks for")
    print(gene.to.plot)
    next()
  }
  nearbyPeakInfoAvgNormFragCts     <- nearbyPeakInfoAvgNormFragCts[isDiffPeakAnyDose,]
  nearbyPeakInfoEtOHAvgNormFragCts <- nearbyPeakInfoEtOHAvgNormFragCts[isDiffPeakAnyDose]
  nearbyPeakInfoRawAccDiff       <- nearbyPeakInfoAvgNormFragCts - nearbyPeakInfoEtOHAvgNormFragCts
  nearbyPeakInfoAvgFoldChange    <- nearbyPeakInfoAvgFoldChange[isDiffPeakAnyDose,]
  nearbyPeakInfoAvgLogFoldChange <- log2(nearbyPeakInfoAvgFoldChange)
  
  p4 <- nearbyPeakInfoRawAccDiff %>% 
    mutate(local_peak_id = 1:nrow(nearbyPeakInfoRawAccDiff)) %>%
    gather(key = "dose", value = "AvgNormFragCountDiff", -local_peak_id) %>%
    mutate(dose = sapply(dose, function(x) paste0(condForNearbyPeaks, " ", strsplit(x, "-")[[1]][length(strsplit(x, "-")[[1]])-1], " dose"))) %>%
    mutate(dose = factor(dose, levels = paste0(condForNearbyPeaks, " ", c("low", "med", "high"), " dose"))) %>%
    ggplot(aes(AvgNormFragCountDiff)) + 
    geom_histogram() +
    facet_wrap(vars(dose), nrow=3) +
    theme_minimal(base_size = 10)
  xaxisfactor <- max(abs(min(nearbyPeakInfoRawAccDiff)), abs(max(nearbyPeakInfoRawAccDiff))) * 1.05
  p4 <- p4 + xlim(-1 * xaxisfactor, xaxisfactor)
  
  p5 <- nearbyPeakInfoAvgFoldChange %>% 
    mutate(local_peak_id = 1:nrow(nearbyPeakInfoAvgFoldChange)) %>%
    gather(key = "dose", value = "AvgFoldChangeDiff", -local_peak_id) %>%
    mutate(dose = sapply(dose, function(x) paste0(condForNearbyPeaks, " ", strsplit(x, "-")[[1]][length(strsplit(x, "-")[[1]])-1], " dose"))) %>%
    mutate(dose = factor(dose, levels = paste0(condForNearbyPeaks, " ", c("low", "med", "high"), " dose"))) %>%
    ggplot(aes(AvgFoldChangeDiff)) + 
    geom_histogram() +
    facet_wrap(vars(dose), nrow=3) +
    theme_minimal(base_size = 10) +
    theme(axis.title.y = element_blank())
  xaxisfactor <- max(abs(min(nearbyPeakInfoAvgFoldChange)), abs(max(nearbyPeakInfoAvgFoldChange))) * 1.05
  p5 <- p5 + xlim(0, xaxisfactor)
  
  p6 <- nearbyPeakInfoAvgLogFoldChange %>% 
    mutate(local_peak_id = 1:nrow(nearbyPeakInfoAvgLogFoldChange)) %>%
    gather(key = "dose", value = "AvgLog2FoldChangeDiff", -local_peak_id) %>%
    mutate(dose = sapply(dose, function(x) paste0(condForNearbyPeaks, " ", strsplit(x, "-")[[1]][length(strsplit(x, "-")[[1]])-1], " dose"))) %>%
    mutate(dose = factor(dose, levels = paste0(condForNearbyPeaks, " ", c("low", "med", "high"), " dose"))) %>%
    ggplot(aes(AvgLog2FoldChangeDiff)) + 
    geom_histogram() +
    facet_wrap(vars(dose), nrow=3) +
    theme_minimal(base_size = 10) +
    theme(axis.title.y = element_blank())
  xaxisfactor <- max(abs(min(nearbyPeakInfoAvgLogFoldChange)), abs(max(nearbyPeakInfoAvgLogFoldChange))) * 1.05
  p6 <- p6 + xlim(-1 * xaxisfactor, xaxisfactor)

  g0 <- ggplotGrob(p0)
  g1 <- ggplotGrob(p1)
  g2 <- ggplotGrob(p2)
  g3 <- ggplotGrob(p3)
  g4 <- ggplotGrob(p4)
  g5 <- ggplotGrob(p5)
  g6 <- ggplotGrob(p6)
  g7 <- cbind(g4, g6, g5)
  
  
  g <- rbind(g0, g1, g2, g3, size = "first")
  g$widths <- unit.pmax(g2$widths, g3$widths)
  # grid.newpage()
  # grid.draw(g)
  
  layout.matx <- rbind(c(1), 
                       c(1),
                       c(1),
                       c(2))
  track.panel.final <- grid.arrange(g, g7, layout_matrix = layout.matx)
  ggsave(paste0(outputfolder, '/', this.hgncsymbol, '_', gene.to.plot, ".svg"), plot = track.panel.final, width = 16, height = 9)
}
back to top