Content - 0cb655215d82b621360dfcf0109981a6d71feccb - 07f01d9/exec/postprocess_mv.R

postprocess_mv.R
#!/usr/bin/env Rscript
#
# Plot figures and other post-processing
#
# Get the results directory of a run_GEMMA.R script and prepare publication-ready figures
# The script was built and tested on GEMMA only with multiple phenotypes.
# Input also includes the number of clusters to use for clustering. Colors are preset

# Load relevant libraries:
library(plyr)
library(dplyr)
library(readr)
library(tibble)
library(ggplot2)
library(RColorBrewer)
library(viridis)
library(gplots)
library(argparse)
library(yaml)
library(cowplot)
library(grid)
library(gtable)
library(mousegwas)

parser <- ArgumentParser()

# specify our desired options
# by default ArgumentParser will add an help option
parser$add_argument("--outdir", "-o",
                    help = "run_GEMMA.R output dir")
parser$add_argument("--plotdir", "-p", default = ".",
                    help = "Where to put the plots")
parser$add_argument("-y", "--yaml",
                    help="Yaml file which includes the groups under phenotypes")
parser$add_argument("--clusters",
                    '-c',
                    default = 7,
                    type = "integer",
                    help = "Number of peaks clusters")
parser$add_argument("--sample",
                    "-s",
                    type = "integer",
                    default = 10000,
                    help = "Number of SNPs to sample for the LD plotting")
parser$add_argument("--pvalthr",
                    default = 5,
                    type = "double",
                    help = "p-value threshold for plotting and getting gene lists")
parser$add_argument("--nomv",
                    default = FALSE,
                    action = "store_true",
                    help = "Ignore multivariate results and use only single phenotypes")
parser$add_argument("--inrich", default = "inrich",
                    help = "INRICH exe utable and custom parameters, deafult: inrich")
parser$add_argument("--inrich_i",
                    "-i",
                    type = "double",
                    default = 5,
                    help = "Minimal group size for inrich (-i), default 5")
parser$add_argument("--inrich_j",
                    "-j",
                    type = "double",
                    default = 200,
                    help = "Maximal group size for inrich (-j), default 200")
parser$add_argument("--ldpeakdist",
                    type = "integer",
                    default = 10000000,
                    help = "Peak maximal width")
parser$add_argument("--peakcorr",
                    type = "double",
                    default = 0.25,
                    help = "SNPs R^2 correlation cutoff for peak determination")
parser$add_argument("--loddrop",
                    type = "double",
                    default = 1.5,
                    help = "LOD drop from the peak SNP for the purpose of selecting genes related to the QTL")
parser$add_argument(
  "--external_inrich",
  action = "store_true",
  default = FALSE,
  help = "Prepare INRICH files but don't run INRICH"
)
parser$add_argument(
  "--coat_phenotype",
  action = "store_true",
  default = FALSE,
  help = "GWAS of coat color, no phenotypes in yaml file"
)
parser$add_argument("--colorgroup",
                    action = "store_true",
                    default = FALSE,
                    help = "Color the Group Manhattan plots by group rather than cluster")
parser$add_argument("--meanvariance", action="store_true", default=FALSE,
                    help="Plot the PVE of mean and variance phenotypes in different plots")
parser$add_argument("--set3", action="store_true", default=FALSE,
                    help="Use Set3 color palette for groups, default is Accent")
parser$add_argument("--minherit", type="double", default=0,
                    help= "Heritability threshold (PVE) to include a phenotype in the aggregated reports")
args <- parser$parse_args()

# Step 1: Read the color palette
# Cluster colors
ccols <- brewer.pal(args$clusters, "Dark2")[1:args$clusters]
# Heatmap plot for m-values
pigr <- RColorBrewer::brewer.pal(name = "PiYG", n = 11)
hmcol <-
  viridis(128)#colorRampPalette(pigr[c(2,5,10)])(128)#viridis(128, option="cividis")
if (args$set3) {
  grpcol <- RColorBrewer::brewer.pal(12, "Set3")[2:12]
} else{
  grpcol <- RColorBrewer::brewer.pal(8, "Accent")
}
fullw <- 7.25
halfw <- 3.54
fheight <- 11-1.25
height <- 3.54#fheight/4

ffam <- "Arial"
# Read the data
# Phenotypes names
phenos <-
  as.character(read.csv(
    paste0(args$outdir, "/phenotypes_order.txt"),
    header = FALSE,
    skip = 1
  )$V1)
#pnames <- read.csv(args$names, row.names = 2)
pnames <- data.frame(PaperName = character(0), Group = character(0), stringsAsFactors = FALSE)
yamin <- yaml.load_file(args$yaml)
pheno_names <- c()
for (n in names(yamin$phenotypes)) {
  pheno_names <- c(pheno_names, n)
  pnames <-
    rbind(
      pnames,
      data.frame(
        Group = if (length(yamin$phenotypes[[n]]$group)) yamin$phenotypes[[n]]$group else "NoGroup",
        PaperName = yamin$phenotypes[[n]]$papername, stringsAsFactors = FALSE
      )
    )
}
if (args$coat_phenotype){
  for (p in phenos){
    pheno_names <- c(pheno_names, p)
    pnames <- rbind(pnames, data.frame(Group=if(grepl("coat", p)) "Coat" else "Eyes", PaperName=gsub(" ", "-", gsub("(coat|eyes)", "", p))))
  }
}
row.names(pnames) <- pheno_names

phenos <- as.character(pnames[phenos, "PaperName", drop = T])

# SNPs annotations
anno <-
  read_delim(
    paste0(args$outdir, "/annotations.csv"),
    ",",
    col_names = c("rs", "ps", "chr"),
    guess_max = Inf
  )
# Read the p-values from all the pasted phenotypes and mv LOCO files
pvalmat <- NULL

# Read the genotypes
geno_t <-
  read_csv(
    paste0(args$outdir, "/strains_genotypes_all.csv"),
    col_types = cols(
      .default = col_double(),
      chr = col_character(),
      rs = col_character(),
      major = col_character(),
      minor = col_character()
    )
  )

geno <-
  as.matrix(
    geno_t %>% column_to_rownames(var = "rs") %>% dplyr::select(-chr, -bp38, -major, -minor)
  )

PVE <- read_csv(paste0(args$outdir, "/PVE_GEMMA_estimates.txt"))
groupsOrder = if (length(yamin$groups)) yamin$groups else unique(pnames$Group)
PVE <-
  left_join(PVE, as_tibble(pnames, rownames = "phenotype"), by = ("phenotype"))
if (args$meanvariance){
  gnames <- groupsOrder[!grepl("Variance", groupsOrder)]
  grpcol <- rep(grpcol[1:min(length(gnames), length(grpcol))], ceiling(length(gnames)/length(grpcol)))
  grpcol <- grpcol[1:length(gnames)]
  names(grpcol) <- gnames
  for (g in groupsOrder[grepl("Variance", groupsOrder)]){
    cadd <- grpcol[gsub("Variance", "", g)]
    names(cadd) <- g
    grpcol <- c(grpcol, cadd)
  }
}else{
grpcol <- rep(grpcol[1:min(length(groupsOrder), length(grpcol))], ceiling(length(groupsOrder)/length(grpcol)))
grpcol <- grpcol[1:length(groupsOrder)]
names(grpcol) <- groupsOrder
}
pnames <-
  left_join(pnames, tibble(Group = groupsOrder, color = grpcol), by =
              "Group")
row.names(pnames) <- pheno_names

# We're all set
dir.create(args$plotdir, recursive = TRUE)
set.seed(490)

# Plot the PVE estimates with SE

pvh <- height
if (args$meanvariance) {
  if (dim(PVE)[1] > 40)
    pvh <- height * 2
  pveplot <- paired_PVE_plot(PVE, minherit=args$minherit)
  pvep <-
    cbind(
      ggplotGrob(pveplot$mean_plot + scale_fill_manual(values = grpcol) + theme_bw()  +
        theme(
          text = element_text(size = 10, family = ffam),
          legend.position = "none",
          axis.ticks.y = element_blank(),
          plot.title = element_text(hjust = 0.5)
        )),
      ggplotGrob(pveplot$var_plot + scale_fill_manual(values = grpcol) + theme_bw()  +
        theme(
          text = element_text(size = 10, family = ffam),
          legend.position = "none",
          axis.text.y = element_blank(),
          axis.ticks.y = element_blank(),
          plot.title = element_text(hjust = 0.5)
        ))
    )
  cairo_pdf(
    paste0(args$plotdir, "/PVE_plot.pdf"),
    width = halfw * 1.5,
    height = pvh,
    family = ffam
  )
  grid.draw(pvep)
  dev.off()
} else{
  if (dim(PVE)[1] > 40)
    pvh <- height * 2
  pvep <-
    ggplot(PVE, aes(reorder(PaperName,-PVE), PVE, fill = Group, alpha=PVE>args$minherit)) + geom_bar(color =
                                                                              "black", stat = "identity") +
    scale_fill_manual(values = grpcol) +
    geom_errorbar(aes(ymin = PVE - PVESE, ymax = PVE + PVESE), width = .2) +
    scale_alpha_manual(breaks = c(FALSE, TRUE), values=c(0.4,1)) + guides(alpha=FALSE) +
    xlab("Phenotype") + coord_flip() +
    theme_bw()  +
    theme(text = element_text(size = 10, family = ffam),
          legend.position = "right")

ggsave(
  paste0(args$plotdir, "/PVE_plot.pdf"),
  plot = pvep,
  device = cairo_pdf,
  dpi = "print",
  width = halfw * 1.5,
  height = pvh,
  units = "in"
)
}

# Write the genes for INRICH, get the gene annotations and pass them on
annot <- write_genes_map(args$plotdir)

# Plot each phenotype's Manhattan plot
lilp <- vector("list", 0)
allpeaks <- NULL
allsnps <- NULL

for (i in 1:length(phenos)) {
  pp <-
    plot_gemma_lmm(
      paste0(args$outdir, "/output/lmm_pheno_", i, "_all_LOCO.assoc.txt"),
      name = "Chromosome",
      genotypes = geno,
      namethr = args$pvalthr,
      redthr = args$pvalthr,
      maxdist = args$ldpeakdist,
      corrthr = args$peakcorr,
      annot = annot
    )
  pname <- phenos[i]
  lilp[[pname]] <- pp
  allsnps <- pp$gwas$rs
  if (is.null(pvalmat)) {
    pvalmat <-
      pp$gwas %>% mutate(!!(pname) := P) %>% dplyr::select(rs, !!(pname))
  } else{
    pvalmat <-
      left_join(pvalmat,
                pp$gwas %>% mutate(!!(pname) := P) %>% dplyr::select(rs, !!(pname)),
                by = "rs")
  }
  allpeaks <- c(allpeaks, pp$gwas$rs[pp$gwas$ispeak])
  ggsave(
    filename = paste0(
      args$plotdir,
      "/Manhattan_plot_phenotype_",
      i,
      "_",
      phenos[i],
      ".pdf"
    ),
    plot = pp$plot + ggtitle(pname) + theme(text = element_text(size = 10, family =
                                                 ffam)),
    dpi = "print",
    device = cairo_pdf,
    width = fullw,
    height = height,
    units = "in"
  )
}

# Plot the groups MAnhattan plots
grpwas <- list()
if (args$nomv) {
  # Plot each group's max P
  # Add "All Phenotypes" to the list of groups. If meanvariance add two groups for the mean and the variance
  grplist <- c(unique(as.character(pnames$Group)), "All Phenotypes")
  if (args$meanvariance){
    grplist <- c(grplist, "All Mean Phenotypes", "All Variance Phenotypes")
  }
  for (g in grplist) {
    allpwas = NULL
    plist <- pnames$PaperName[pnames$Group == g]
    if (g == "All Phenotypes") {
      plist <- pnames$PaperName
    }
    if (g == "All Mean Phenotypes"){
      plist <- pnames$PaperName[!grepl("Variance", pnames$PaperName)]
    }
    if (g == "All Variance Phenotypes"){
      plist <- pnames$PaperName[grepl("Variance", pnames$PaperName)]
    }
    plist <- intersect(plist, PVE$PaperName[PVE$PVE >= args$minherit])
    for (p in intersect(plist, names(lilp))) {
      if (is.null(allpwas)) {
        allpwas <- lilp[[p]]$pwas %>% dplyr::select(chr, rs, ps, allele0, allele1, af, P, p_wald, BPcum)
        allpwas$grpcolor <- pnames$Group[pnames$PaperName==p][1]
      } else{
        allpwas <-
          left_join(allpwas,
                    lilp[[p]]$pwas[, c("rs", "P", "p_wald")],
                    by = "rs",
                    suffix = c("", ".x"))
        allpwas$P <- pmax(allpwas$P, allpwas$P.x)
        allpwas$p_wald <- pmin(allpwas$p_wald, allpwas$p_wald.x)
        allpwas$grpcolor[allpwas$p_wald == allpwas$p_wald.x] <- pnames$Group[pnames$PaperName==p][1]
        allpwas <- allpwas %>% dplyr::select(-P.x,-p_wald.x)

      }
    }

    if (is.null(allpwas)) {
      print(g)
      next
    }
    pnums <-
      rep_peaks(
        geno,
        allpwas,
        rs_thr = args$peakcorr,
        pthr = 10 ^ -args$pvalthr,
        mxd = args$ldpeakdist
      )
    allpwas <- allpwas %>% left_join(pnums, by = "rs")
    pname = g
    allpeaks <- c(allpeaks, allpwas$rs[allpwas$ispeak])
#   pvalmat <-
#      left_join(pvalmat,
#                allpwas %>% mutate(!!(pname) := P) %>% dplyr::select(rs, !!(pname)),
#                by = "rs")

    grpwas[[g]] <- allpwas
    # Recolor the second layer with the clusters colors
  }
} else{
  for (grpf in Sys.glob(paste0(
    args$outdir,
    "/output/lmm_phenotypes_*_all_LOCO.assoc.txt"
  ))) {
    pp <-
      plot_gemma_lmm(
        grpf,
        name = "Chromosome",
        genotypes = geno,
        namethr = args$pvalthr,
        redthr = args$pvalthr,
        maxdist = args$ldpeakdist,
        corrthr = args$peakcorr,
        test = "p_score",
        annot = annot
      )
    pname <-
      gsub(".*phenotypes_(.*)_all_LOCO.assoc.txt", "\\1", grpf)
    grpwas[[pname]] = pp$pwas

    allpeaks <- c(allpeaks, pp$gwas$rs[pp$gwas$ispeak])
#    pvalmat <-
#      left_join(pvalmat,
#                pp$gwas %>% mutate(!!(pname) := P) %>% dplyr::select(rs, !!(pname)),
#                by = "rs")

    ggsave(
      filename = paste0(
        args$plotdir,
        "/Manhattan_plot_phenotypes_",
        pname,
        ".pdf"
      ),
      plot = pp$plot + ggtitle(pname) + theme(text = element_text(size = 10, family =
                                                   ffam)),
      dpi = "print",
      device = cairo_pdf,
      width = fullw,
      height = height,
      units = "in"
    )
  }
}


# Cluster the peaks using the P values

pgwas <-
  pvalmat %>% filter(rs %in% allpeaks) %>% column_to_rownames(var = "rs")
print(sum(is.na(pgwas)))
pgwas[is.na(pgwas)] = 0
pgwas <- as.matrix(pgwas)

kk <- kmeans(pgwas, args$clusters)

# Plot the m-value heatmap
# Order the columns
rowarr <- NULL
for (i in unique(kk$cluster)) {
  cc <- hclust(dist(pgwas[kk$cluster == i, , drop = F]))
  rowarr <- c(rowarr, row.names(pgwas)[kk$cluster == i][cc$order])
}
pgwas <- pgwas[rowarr,]
clustcol <- tibble(cluster = unique(kk$cluster), color = ccols)
colrow <-
  tibble(rs = rownames(pgwas), cluster = kk$cluster[rowarr]) %>% left_join(clustcol, by =
                                                                             "cluster") %>% column_to_rownames(var = "rs") %>% dplyr::select(color)
grptocol <-
  tibble(Group = groupsOrder, color = grpcol[1:length(groupsOrder)])
colcol <-
  PVE %>% left_join(grptocol) %>% column_to_rownames(var = "PaperName") %>% dplyr::select(color)
colcol <- colcol$color
hwid <- fullw
if (dim(PVE)[1]>40) hwid <- fullw*2

write.csv(cbind(pgwas, cbind(colrow, kk$cluster[rowarr])), file = paste0(args$plotdir, "/all_peaks_values_clusters.csv"))
cairo_pdf(
  paste0(args$plotdir, "/all_peaks_heatmap.pdf"),
  width = hwid,
  height = height + 1,
  family = ffam
)
hplt <- heatmap.2(
  pgwas,
  col = hmcol,
  Rowv = F,
  Colv = T,
  dendrogram = "col",
  scale = "none",
  trace = "none",
  RowSideColors = colrow[, 1, drop = T],
  ColSideColors = colcol,
  labRow = NA,
  hclustfun = function(x)
    hclust(x, method = "average"),
  distfun = function(x)
    dist(scale(x)),
  margins = c(12, 8),
  srtCol = 45,
  key = T,
  key.title = "-log(P-value)",
  density.info = "none"
)

dev.off()

svg(paste0(args$plotdir, "/all_peaks_heatmap.svg"),
    width = hwid,
    height = height + 1,
    family = ffam)
eval(hplt$call)
dev.off()


# Plot the phenotypes manhattan plots with clusters colors
# Add the cluster number to the pwas object
if (!args$colorgroup){
  for (i in names(lilp)) {
  p <- lilp[[i]]
  p$pwas <-
    p$pwas %>% left_join(tibble(rs = rownames(pgwas), cluster = as.factor(kk$cluster[rowarr])), by =
                           "rs")
  if (sum(p$pwas$ispeak) == 0)
    next
  p$pwas <-
    p$pwas %>% dplyr::select(-cluster) %>% left_join(filter(p$pwas, ispeak) %>%
                                                       dplyr::select(choose, cluster),
                                                     by = "choose")

  # Recolor the second layer with the clusters colors
  pnoname <- p$plot
  pnoname$layers <- pnoname$layers[1:2]
  ggsave(
    filename = paste0(args$plotdir, "/replot_Manhattan_clusters_", i, ".pdf"),
    plot = pnoname + ggnewscale::new_scale("alpha") + ggnewscale::new_scale("color") + ggnewscale::new_scale("size") +
      geom_point(aes(color = p$pwas$cluster, size=P, alpha=rsq)) +
      scale_color_manual(values = ccols) +
      scale_size_continuous(range=c(0,1), trans = "exp") +
      scale_alpha_continuous(range = c(0,1), trans="exp") +
      ggnewscale::new_scale("alpha") + ggnewscale::new_scale("color") + ggnewscale::new_scale("size") +
      geom_point(
        aes(alpha = p$pwas$ispeak),
        size = 1.2,
        color = "black"
      ) +
      scale_alpha_manual(values = c(0, 1)) + ggnewscale::new_scale("alpha") + ggnewscale::new_scale("color") + ggnewscale::new_scale("size") +
      #ggnewscale::new_scale_color() +
      geom_point(aes(
        color = p$pwas$cluster,
        alpha = p$pwas$ispeak
      ), size = 1) +
      scale_color_manual(values = ccols) +
      scale_alpha_manual(values = c(0, 1)) +
      ggtitle(i) +
      theme(text = element_text(size = 10, family = ffam)),
    device = cairo_pdf,
    dpi = "print",
    width = fullw,
    height = height,
    units = "in"
  )
}
}
mainplot = NULL
for (g in names(grpwas)) {
  # Add cluster to ispeak
  allpwas <-
    grpwas[[g]] %>% left_join(tibble(rs = rownames(pgwas), cluster = as.factor(kk$cluster[rowarr])), by =
                                "rs")
  # Expand cluster to all choose
  allpwas <-
    allpwas %>% dplyr::select(-cluster) %>% left_join(filter(allpwas, ispeak) %>%
                                                        dplyr::select(choose, cluster),
                                                      by = "choose")
  axisdf <-
    allpwas %>% group_by(chr) %>% summarize(center = (max(BPcum) + min(BPcum)) / 2)
  ymax <- 1.25 * max(allpwas$P, na.rm = TRUE)
  ymin <- 1.25 * min(allpwas$P, na.rm = TRUE)
  chr_label <- axisdf$chr
  chr_label[chr_label == 20] = "X"
  colorby <- "cluster"
  palette <- ccols
  pbreaks <- unique(kk$cluster)
  if (args$colorgroup) {
    colorby = "grpcolor"
    palette <- grpcol
  }
  print(palette)
  outplot <- ggplot2::ggplot(allpwas, aes(x = BPcum, y = P)) +

    # Show all points
    geom_point(aes(color = as.factor(chr), size=P) , alpha = 1) +
    scale_color_manual(values = c(rep(
      c("#CCCCCC", "#969696"), 10
    ))) +
    scale_size_continuous(range=c(0,1), trans = "exp") +
    geom_segment(y = args$pvalthr, x=min(allpwas$BPcum)-50000000, xend=max(allpwas$BPcum)+50000000, yend=args$pvalthr,color="#FCBBA1") +
    ggnewscale::new_scale("alpha") + ggnewscale::new_scale("color") + ggnewscale::new_scale("size")  +
    geom_point(aes_string(color = colorby, size="rsq", alpha="rsq")) +
    scale_color_manual(values = palette) +
    scale_size_continuous(range=c(0,1), trans = "exp") +
    scale_alpha_continuous(range = c(0,1), trans="exp") +
    ggnewscale::new_scale("alpha") + ggnewscale::new_scale("color") + ggnewscale::new_scale("size")  +
    geom_point(aes(alpha = ispeak), size = 1.2, color = "black") +
    scale_alpha_manual(values = c(0, 1)) +
    ggnewscale::new_scale("alpha") + ggnewscale::new_scale("color") + ggnewscale::new_scale("size")  +
    geom_point(aes_string(color = colorby,
                   alpha = "ispeak"), size = 1) +
    scale_color_manual(values = palette) +
    scale_alpha_manual(values = c(0, 1)) +
    scale_x_continuous(label = chr_label, breaks = axisdf$center) +
    scale_y_continuous(expand = c(0, 0)) +     # remove space between plot area and x axis
    ylim(ymin, ymax) +
    xlab(g) +
    ylab("-log(P-value)") +
    theme_bw() +
    theme(
      legend.position = "none",
      text = element_text(size = 10, family = ffam),
      panel.border = element_blank(),
      panel.grid.major.x = element_blank(),
      panel.grid.minor.x = element_blank()
    )
  ggsave(
    filename = paste0(
      args$plotdir,
      "/replot_Manhattan_clusters_",
      gsub(" ", "_", g),
      ".pdf"
    ),
    plot = outplot,
    device = cairo_pdf,
    dpi = "print",
    width = fullw,
    height = height,
    units = "in"
  )
  if (g == "All Phenotypes") mainplot = outplot
  if (g == "All Mean Phenotypes") meanplot = outplot
  if (g == "All Variance Phenotypes") varplot = outplot
}

# Plot the LD drop figure
comp_LD_2 <-
  function(genotypes,
           c,
           maxdist = 2500000,
           MAF = 0.1,
           miss = 0.1) {
    gen <- genotypes %>% filter(chr == c, !is.na(rs)) %>% arrange(bp38)
    gen <-
      gen[(
        rowSums(gen[, 6:ncol(gen)] == 0, na.rm = T) > (dim(gen)[2] - 5) * MAF &
          rowSums(gen[, 6:ncol(gen)] > 0, na.rm = T) > (dim(gen)[2] -
                                                          5) * MAF &
          rowSums(is.na(gen[, 6:ncol(gen)])) < (dim(gen)[2] - 5) *
          miss
      ),]
    allcor = NULL
    for (i in 1:(nrow(gen) - 1)) {
      tmat <-
        as.data.frame(base::t(
          gen %>% filter(between(bp38, gen$bp38[i], gen$bp38[i] + maxdist)) %>%
            dplyr::select(-bp38, -chr, -major, -minor) %>% column_to_rownames(var = "rs")
        ))
      if (ncol(tmat) < 2)
        next()
      ct <-
        cor(tmat[, 1], tmat[, 2:ncol(tmat), drop = F], use = 'pairwise.complete.obs', method =
              "pearson")
      allcor <-
        rbind(
          allcor,
          as_tibble(t(ct), rownames = "SNP2") %>% mutate(r_sq = V1 ^ 2, SNP1 = gen$rs[i]) %>% dplyr::select(SNP1, SNP2, r_sq)
        )
    }
    return(allcor)
  }
geno_s <-
  geno_t %>% filter(chr != "X", chr != "Y", chr != "MT") %>% sample_n(args$sample)
allchr = NULL
for (chr in unique(geno_s$chr)) {
  allchr <- rbind(allchr, comp_LD_2(geno_s, chr))
}
allchr <-
  allchr %>% left_join(dplyr::select(geno_s, rs, bp38), by = c("SNP1" = "rs")) %>%
  left_join(dplyr::select(geno_s, rs, bp38), by = c("SNP2" = "rs")) %>% mutate(dist = bp38.y -
                                                                                 bp38.x)
avgwin = 5000
allchr$distc <-
  (cut(allchr$dist, breaks = seq(
    from = min(allchr$dist) - 1,
    to = max(allchr$dist) + 1,
    by = avgwin
  )))
allavg <-
  allchr %>% group_by(distc) %>% summarise(avdist = mean(dist), avr_sq =
                                             mean(r_sq, na.rm = T)) %>% ungroup()
pld <-
  ggplot(allavg, aes(avdist / 1000000, avr_sq)) + geom_smooth(
    color = RColorBrewer::brewer.pal(3, "Set1")[3],
    method = "loess",
    span = 0.3,
    se = FALSE
  ) +
  xlim(c(0, 2.5)) + labs(x = "Distance (Mbp)", y = expression("Average LD" ~
                                                                (r ^ {
                                                                  2
                                                                }))) + theme_bw() + theme(
                                                                  panel.border = element_blank(),
                                                                  panel.grid.major.x = element_blank(),
                                                                  panel.grid.minor.x = element_blank(),
                                                                  panel.grid.major.y = element_blank(),
                                                                  panel.grid.minor.y = element_blank(),
                                                                  text = element_text(size = 10, family = ffam)
                                                                )
ggsave(
  filename = paste0(args$plotdir, "/plot_LD_drop.pdf"),
  plot = pld,
  device = cairo_pdf,
  dpi = "print",
  width = halfw,
  height = height,
  units = "in"
)
# Plot Figure 1: pvep pld and mainplot
combp <- plot_grid(plot_grid(pvep, pld, ncol=2, nrow=1, labels=c('A', 'B'), label_size = 12, label_fontface = "plain", rel_widths = c(1.5,1)), mainplot, NULL, nrow = 3, ncol = 1, labels = c('', 'C','D'), label_size = 12, fontface="plain")
ggsave(filename = paste0(args$plotdir, "/combined_figure1.svg"),
       plot = combp,
       device = svg,
       dpi = "print",
       width = fullw,
       height = fheight,
       units = "in")
if (args$meanvariance){
  combp <- plot_grid(pvep, meanplot, varplot, mainplot, nrow = 4, ncol = 1, labels = c('A', 'B', 'C', 'D'), label_size = 12, fontface="plain", rel_heights = c(2.5, 1, 1, 1))
  ggsave(filename = paste0(args$plotdir, "/combined_figure2.svg"),
         plot = combp,
         device = svg,
         dpi = "print",
         width = fullw,
         height = fheight,
         units = "in")
}
# Plot MAF histogram
mafdat <-
  tibble(rs = geno_t$rs, maf = rowSums(geno_t[, -1:-5]) / (2 * (ncol(geno_t) -
                                                                  5)))
mafdat$maf <- pmin(mafdat$maf, 1 - mafdat$maf)
mafdat <- mafdat %>% filter(rs %in% allsnps)
if ("All Phenotypes" %in% names(grpwas)) {
  mafdat <- inner_join(mafdat, grpwas[["All Phenotypes"]] %>% dplyr::select(rs, choose), by = "rs")
} else{
  mafdat$choose <- 0
}
mafp <-
  ggplot(mafdat, aes(maf, fill = (choose > 0), color = (choose > 0))) + geom_histogram(binwidth = 1 /
                                                                                       (ncol(geno_t) - 5)) + xlim(c(0, 0.5)) +
  scale_color_manual(
    values = RColorBrewer::brewer.pal(12, "Paired")[3:4],
    name = "",
    labels = c("All", "Peak")
  ) +
  scale_fill_manual(
    values = RColorBrewer::brewer.pal(12, "Paired")[3:4],
    name = "",
    labels = c("All", "Peak")
  ) +
  xlab("MAF") +
  theme_bw() + theme(
    legend.position = c(0.8, 0.9),
    panel.border = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    text = element_text(size = 10, family = ffam)
  )
ggsave(
  filename = paste0(args$plotdir, "/plot_MAF_hist.pdf"),
  plot = mafp,
  device = cairo_pdf,
  dpi = "print",
  width = halfw,
  height = height,
  units = "in"
)


# Expand each peak to include the entire peak, not just the single SNP
ext_peak_sing <- function(snps, maxdist = 500000, loddrop = args$loddrop) {
  if (loddrop){
  snps$minps <- snps$ps
  snps$maxps <- snps$ps
  for (c in unique(snps$choose)){
    peakps <- snps$ps[snps$ispeak & snps$choose==c]
    lodstop <- max(snps$P[snps$ispeak & snps$choose==c]) - loddrop
    snps$minps[snps$choose==c] <- max(c(peakps-maxdist, snps$ps[snps$choose==c & snps$P < lodstop & snps$ps < peakps]))
    snps$maxps[snps$choose==c] <- min(c(peakps+maxdist, snps$ps[snps$choose==c & snps$P < lodstop & snps$ps > peakps]))
  }
  return(snps)
  }else{
  csum <-
    snps %>% group_by(choose) %>% dplyr::summarize(maxps = max(ps), minps = min(ps)) %>% ungroup()
  snps %>% left_join(csum, by = "choose") %>% mutate(maxps = pmin(maxps, ps +
                                                                    maxdist),
                                                     minps = pmax(minps, ps - maxdist))
  }
}

# This tibble will accumulate all the genes for each cluster (ID and name)
clustgene <- vector(mode = "list", length = args$clusters)
clustmgi <- vector(mode = "list", length = args$clusters)
allgenes = c()
pgc <- tibble(rs = rownames(pgwas), cluster = kk$cluster[rowarr])
clusterpeaks = tibble(
  cluster = numeric(0),
  chr = character(0),
  minps = numeric(0),
  maxps = numeric(0)
)
write_inrich_snps(geno_t, args$plotdir)
for (i in 1:length(lilp)) {
  pp <- lilp[[i]]
  if (sum(pp$gwas$ispeak) == 0)
    next
  expp <- ext_peak_sing(pp$gwas, maxdist = args$ldpeakdist)
  write_inrich_phenotype(expp[expp$ispeak == T, ], args$plotdir, phenos[i])
  if (! args$external_inrich){
  run_inrich(
    args$plotdir,
    phenos[i],
    exec = args$inrich,
    i = args$inrich_i,
    j = args$inrich_j
  )
  }
  exppc <-
    expp %>% dplyr::filter(ispeak) %>% left_join(pgc, by = "rs") %>% dplyr::select(cluster, chr, minps, maxps)
  clusterpeaks <- rbind(clusterpeaks, exppc)
  affgen <-
    get_genes(expp[expp$ispeak == T, ], dist = 1000, annot = annot)
  if (nrow(affgen) > 0) {
    # Add the genes to the appropriate cluster
    clustp <- left_join(affgen, pgc, by = "rs")
    for (c in unique(clustp$cluster)) {
      clustgene[[c]] <-
        unique(c(clustgene[[c]], clustp$ensembl_gene_id[clustp$cluster == c]))
      clustmgi[[c]] <-
        unique(c(clustmgi[[c]], clustp$mgi_symbol[clustp$cluster == c]))
    }
    allgenes <- unique(c(allgenes, clustp$mgi_symbol))
    write_csv(affgen,
              path = paste0(args$plotdir, "/genes_for_phenotype_", i, ".csv"))
    t2 <- ungroup(summarize(group_by(filter(affgen, gene_biotype=="protein_coding"), rs), protein_coding_genes = n()))
    ngene_tbl <- left_join(expp[expp$ispeak == T, ], t2, by = "rs")
    write_csv(ngene_tbl,
              path = paste0(args$plotdir,
                            "/intervals_for_phenotype_",
                            i,
                            ".csv"))
  }
}

# Run each group
for (n in names(grpwas)) {
  pp <- grpwas[[n]]
  # Change the chromosome to character
  pp$chr <- as.character(pp$chr)
  pp$chr[pp$chr == "20"] <- "X"
  if (sum(pp$ispeak) == 0)
    next
  expp <- ext_peak_sing(pp, maxdist = args$ldpeakdist)
  write_inrich_phenotype(expp[expp$ispeak == T, ], args$plotdir, n)
  if (! args$external_inrich){
  run_inrich(
    args$plotdir,
    n,
    exec = args$inrich,
    i = args$inrich_i,
    j = args$inrich_j
  )
  }
  affgen <-
    get_genes(expp[expp$ispeak == T, ], dist = 1000, annot = annot)
  if (nrow(affgen) > 0) {
    # Add the genes to the appropriate cluster
    write_csv(affgen,
              path = paste0(
                args$plotdir,
                "/genes_for_phenotype_Group_",
                gsub(" ", "_", n),
                ".csv"
              ))
    # Get the number of protein-coding genes
    t2 <- ungroup(summarize(group_by(filter(affgen, gene_biotype=="protein_coding"), rs), protein_coding_genes = n()))
    ngene_tbl <- left_join(expp[expp$ispeak == T, ], t2, by = "rs")
    if (n == "All Phenotypes"){
      ngene_tbl$groups <- ""
      # Add a column with ';' separated phenotype groups names that overlap the peak
      for (n1 in names(grpwas)){
        if (n1 != "All Phenotypes" && n1 != "All Variance Phenotypes" && n1 != "All Mean Phenotypes"){
          ot <- grpwas[[n1]]
          ot$chr <- as.character(ot$chr)
          ot$chr[ot$chr == "20"] <- "X"
          j1 <- left_join(ngene_tbl, ot[ot$P >= args$pvalthr, ], by = "chr")
          # Filter to where ps is in the minps-maxps range
          j1 <- filter(j1, ps.y >= minps & ps.y <= maxps)
          if (nrow(j1) > 0) {
            print(n1)
            print(j1)
            ngene_tbl$groups[ngene_tbl$rs %in% j1$rs.x] <-
              sapply(ngene_tbl$groups[ngene_tbl$rs %in% j1$rs.x], function(x)
                if (x == "")
                  n1
                else
                  paste(x, n1, sep = ";"))
          }
        }
      }
    }
    write_csv(ngene_tbl, path = paste0(
      args$plotdir,
      "/intervals_for_phenotype_Group_",
      gsub(" ", "_", n),
      ".csv"
    ))
  }
}

for (k in unique(kk$cluster)) {
  write_csv(
    data.frame(genes = clustgene[[k]]),
    path = paste0(args$plotdir, "/genes_for_cluster_", k, ".csv")
  )
  # Run INRICH
  write_inrich_phenotype(clusterpeaks %>% filter(cluster == k),
                         args$plotdir,
                         paste0("cluster_", k))
  if (! args$external_inrich){
    run_inrich(
    args$plotdir,
    paste0("cluster_", k),
    exec = args$inrich,
    i = args$inrich_i,
    j = args$inrich_j
  )}


}

# Plot markers density
chrord <- c("X", 19:1)
densp <- geno_t %>% filter(chr != "Y", chr != "MT")  %>%
  ggplot(aes(bp38 / 1000000, factor(chr, levels = chrord))) +
  geom_bin2d(binwidth = 1, drop = T) + xlab("Position (Mbp)") + ylab ("Chromosome") +
  scale_fill_viridis(name = expression(frac('markers', '1 Mbp'))) +
  theme_bw() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    text = element_text(size = 10, family = ffam)
  )
ggsave(
  filename = paste0(args$plotdir, "/Chromosome_density_plot.pdf"),
  plot = densp,
  device = cairo_pdf,
  dpi = "print",
  width = fullw,
  height = height,
  units = "in"
)

# Plot the effectplots for all the peaks
plot_effect(args$outdir, args$plotdir, allpeaks, pnames, fullw, fheight, ffam)