Revision 2e105bf050fd2acb9f1e0d1833566c0b1ca5b41f authored by Stephanie Yan on 23 August 2021, 15:32:48 UTC, committed by Stephanie Yan on 23 August 2021, 15:32:48 UTC
1 parent b19a9ee
Raw File
hwe_count_gts.py
#!/usr/bin/env python3

"""
Usage: ./hwe_count_genotypes.py <VCF> <manifest file>

Generates files with counts of samples that are hom ref, hom alt, and het for each SV,
within each population. Used as input into R's HardyWeinberg package. Autosomal and
X chromosome variants are treated separately. Y chromosome variants are ignored.
"""

import sys
from cyvcf2 import VCF
import pandas as pd


"""
Generates autosomal genotype counts for a list of samples.
(i.e. samples that belong to one population)
"""
def count_gts(sample_list):
    # load VCF and choose only samples of interest
    vcf = VCF(sys.argv[1])
    vcf.set_samples(sample_list)
    
    # initialize lists
    ids = []
    hets = []
    refs = []
    alts = []
    missing = []

    # count # of samples of each genotype for every variant
    for variant in vcf:
        if (variant.CHROM != "chrX") and (variant.CHROM != "chrY"): # skip X, Y chromosome variants
            ids.append(variant.ID)
            hets.append(variant.num_het)
            refs.append(variant.num_hom_ref)
            alts.append(variant.num_hom_alt)
            missing.append(variant.num_unknown)

    # create dataframes with gt counts
    df = pd.DataFrame(
        {"SV" : ids,
        "ref" : refs,
        "het" : hets,
        "alt" : alts,
        "missing" : missing
        })
    return(df)

"""
Generates X chromosome genotype counts for a list of samples.
(i.e. samples that belong to one population)
Only outputs ref/alt genotypes for male samples, since they're X chromosome haploids.
"""
def count_gts_xchr(sample_list, sex):
    # load VCF and choose only samples of interest
    vcf = VCF(sys.argv[1])
    vcf.set_samples(sample_list)
    
    # initialize lists
    ids = []
    hets = []
    refs = []
    alts = []
    missing = []
        
    # count # of samples of each genotype for every variant
    for variant in vcf:
        if variant.CHROM == "chrX": # only look at variants on X chromosome
            ids.append(variant.ID)
            hets.append(variant.num_het)
            refs.append(variant.num_hom_ref)
            alts.append(variant.num_hom_alt)
            missing.append(variant.num_unknown)

    # create dataframes with gt counts
    df = pd.DataFrame(
        {"SV" : ids,
        "ref" : refs,
        "het" : hets,
        "alt" : alts,
        "missing" : missing
        })
    if sex == "male":
        # incorporate het samples into alt allele calls
        df["alt"] = df["het"] + df["alt"]
        return(df[["SV","ref","alt","missing"]])
    else:
        return(df[["ref","het","alt","missing"]])


"""
Create a dataframe of samples and their corresponding populations.
"""

samples = pd.read_table(sys.argv[2], header=None, names=["ident","cov","sex","pop"], index_col=4)
# split sample path column to return only the sample ID
def split_col(ident):
    return((ident.split(".")[0]).split("/")[1])
samples["ident"] = samples["ident"].apply(lambda x: split_col(x))

populations = samples["pop"].unique()


"""
Loop through populations, counting autosome and X chromosome genotypes for each.
Output genotype counts as separate files.
"""

output_dir = "gt_counts/"

for pop in populations:
    # get IDs for samples in population
    roi = samples.loc[:,"pop"] == pop
    subset = samples.loc[roi,:]
    pop_ids = subset["ident"].tolist()
    
    # get IDs for male and female samples in population
    m_roi = subset.loc[:,"sex"] == "male"
    m_subset = subset.loc[m_roi,:] # male samples
    m_pop_ids = m_subset["ident"].tolist()
    f_subset = subset.loc[-m_roi,:] # female samples
    f_pop_ids = f_subset["ident"].tolist()
    
    # get genotype counts for autosomal variants
    autosome_counts = count_gts(pop_ids)
    autosome_counts.to_csv(output_dir + pop + "_gtcounts_autosome.txt", sep="\t", index=False, header=False)
    
    # get genotype counts for X chromosome variants
    xchr_m_counts = count_gts_xchr(m_pop_ids, "male")
    xchr_f_counts = count_gts_xchr(f_pop_ids, "female")
    xchr_counts = pd.concat([xchr_m_counts, xchr_f_counts], axis=1)
    # add counts of missing GT calls from male and female samples
    xchr_counts.iloc[:,7] = xchr_counts.iloc[:,3] + xchr_counts.iloc[:,7]
    xchr_counts.iloc[:,[0,1,2,4,5,6,7]].to_csv(output_dir + pop + "_gtcounts_xchr.txt", sep="\t", index=False, header=False)
back to top