Revision 2e105bf050fd2acb9f1e0d1833566c0b1ca5b41f authored by Stephanie Yan on 23 August 2021, 15:32:48 UTC, committed by Stephanie Yan on 23 August 2021, 15:32:48 UTC
1 parent b19a9ee
Raw File
hwe_one_sided.R
library(HardyWeinberg)
library(data.table)
library(dplyr)
library(pbmcapply)

### Calculates one-sided Hardy-Weinberg p-values in each 1000 Genomes population
### and counts the number of populations in which an SV violates one-sided HWE
### (excess of heterozygotes). Y chromosome variants are ignored because they're haploid.

############################################################################


get_hwe_pvals <- function(population_ID) {
  
  autosomal_file <- paste0("gt_counts/",
                           population_ID,
                           "_gtcounts_autosome.txt")
  xchrom_file <- paste0("gt_counts/",
                        population_ID,
                        "_gtcounts_xchr.txt")
  
  # Do HWE calculations for autosomal variants
  
  # data table columns should be ordered: homozygous, het, homozygous
  # order of homozygotes for the minor vs. major allele does not matter
  a_data <- fread(autosomal_file,
                  header = FALSE,
                  col.names = c("SV","AA","AB","BB","missing"))
  
  # convert data into a matrix for HWExactMat
  a_data_mat <- data.matrix(a_data[,c("AA","AB","BB")])
  
  # calculate one-sided HWE p-values,
  # where only an excess of heterozygotes is evidence against HWE
  a_data$pvals <- HWExactMat(a_data_mat,
                             alternative = "greater",
                             x.linked = FALSE,
                             verbose = FALSE)$pvalvec
  
  
  # Do HWE calculations for variants on the X chromosome
  
  # X-chromosome input dataframe has genotype counts in order A, B, AA, AB, BB
  # A and B are the male counts, the rest are the female counts
  x_data <- fread(xchrom_file,
                  header = FALSE,
                  col.names = c("SV","A","B","AA","AB","BB","missing"))

  # convert data into a matrix for HWExactMat
  x_data_mat <- data.matrix(x_data[,c("A","B","AA","AB","BB")])

  # calculate one-sided HWE p-values,
  # where only an excess of heterozygotes is evidence against HWE
  x_data$pvals <- HWExactMat(x_data_mat,
                             alternative = "greater",
                             x.linked = TRUE,
                             verbose = FALSE)$pvalvec


  # combine autosome and X chromosome variants
  data <- rbind(a_data[,c("SV","pvals")],
                x_data[,c("SV","pvals")])
  return(data$pvals)
  
}


# get list of populations
file_list <- list.files(path = "gt_counts",
                        pattern = ".txt")
populations <- sapply(strsplit(file_list, "_"), `[`, 1) %>%
  unique()

# get vector of SV names from one of the gt counts files
autosomal_SVs <- fread("gt_counts/ACB_gtcounts_autosome.txt",
                       header = FALSE)[,1]
xchr_SVs <- fread("gt_counts/ACB_gtcounts_xchr.txt",
                  header = FALSE)[,1]

# get HWE p-values for all 1000 Genomes population
pvals <- pbmclapply(populations,
                    FUN = get_hwe_pvals,
                    mc.cores = getOption("mc.cores", 24L)) %>%
  setDT() %>%
  setnames(populations) %>%
  .[, SV := c(autosomal_SVs, xchr_SVs)]

# count # of populations where SV's HW p-value is < 0.0001
pvals_count <- mutate_all(pvals[,-27],
                          function(x) {ifelse((x < 0.0001),
                                              1,
                                              0)}) %>%
  mutate(non_hwe_counts = rowSums(.)) %>%
  setDT() %>%
  .[, SV := c(autosomal_SVs, xchr_SVs)]

# add column to pvals table with counts of populations in which SV violates HWE
pvals %>%
  .[, non_hwe_counts := pvals_count[, non_hwe_counts]]

fwrite(pvals, "HWE_pvals.txt",
       sep = "\t")
back to top