geodist.snake
#!/bin/python3
configfile: "config/config.json"
import numpy as np
import pandas as pd
import sys
sys.path.append('src/')
shell.prefix("set -o pipefail; ")
DATA_DIR = config['DATA_DIR']
TMP_DIR = config['TMP_DIR']
## Reading in parameter files for all of the VCF identification
vcf_file_list_new = config['VCF_FILE_LIST']
VCF_DICT_NEW_HG38_FILT = pd.read_csv(vcf_file_list_new, sep='\s+', header=None).set_index(0).to_dict()[1]
PREFIX_FINAL_FILT = config['PREFIX']
VCF_DICT = {}
VCF_DICT[PREFIX_FINAL_FILT] = VCF_DICT_NEW_HG38_FILT
# Bin Dictionary for number of categories
bin_dict = {
'4': '[0.0, 0.01, 0.05]',
'3': '[0.0, 0.01]',
'3x': '[0.0, 0.05]',
'3c': '[0.0, 0.1]'
}
AUTOSOMES = np.arange(1,23)
# -------------- 1. Estimate Allele Frequencies --------------- #
rule estimate_allele_freq:
"""
Estimating the allele frequency for each SNP
"""
input:
vcf = lambda wildcards : VCF_DICT[wildcards.PREFIX][int(wildcards.CHROM)],
pops = 'params/parfiles/poplists/{poplist}.txt'
output:
tmp_vcf = temp(TMP_DIR + 'freq_set/{PREFIX}_chr{CHROM,\d+}.biallelic_snps.{poplist}.frq.vcf.gz'),
freq = TMP_DIR + 'freq_set/{PREFIX}_chr{CHROM,\d+}.biallelic_snps.{poplist}.frq.strat.gz'
run:
prefix = TMP_DIR + 'freq_set/%s_chr%s.biallelic_snps.%s' % (wildcards.PREFIX, wildcards.CHROM, wildcards.poplist)
shell('bcftools annotate --set-id \'%POS\' {input.vcf} | bcftools view -v snps -m2 -M2 | bgzip -@10 > {output.tmp_vcf}')
shell('plink --vcf {output.tmp_vcf} --double-id --freq gz --within {input.pops} --out {prefix}')
rule calc_mac_total:
"""
Generate files with the allele count and the total number of haplotypes
"""
input:
freq = rules.estimate_allele_freq.output.freq,
pops = 'params/parfiles/poplists/{poplist}.txt',
pop_labels = 'params/parfiles/poplists/{poplist}_panel.txt'
output:
mac_temp = temp(TMP_DIR + 'freq_set/{PREFIX}_chr{CHROM, \d+}.biallelic_snps.{poplist}.mac.txt.tmp.gz'),
total_temp = temp(TMP_DIR + 'freq_set/{PREFIX}_chr{CHROM, \d+}.biallelic_snps.{poplist}.total.txt.tmp.gz'),
tmp_header = temp(TMP_DIR + 'freq_set/{PREFIX}_chr{CHROM, \d+}.biallelic_snps.{poplist}.header.txt.tmp.gz'),
mac_temp2 = temp(TMP_DIR + 'freq_set/{PREFIX}_chr{CHROM, \d+}.biallelic_snps.{poplist}.mac.txt.tmp2.gz'),
total_temp2 = temp(TMP_DIR + 'freq_set/{PREFIX}_chr{CHROM, \d+}.biallelic_snps.{poplist}.total.txt.tmp2.gz'),
mac = TMP_DIR + 'freq_set/{PREFIX}_chr{CHROM, \d+}.biallelic_snps.{poplist}.mac.txt.gz',
total = TMP_DIR + 'freq_set/{PREFIX}_chr{CHROM, \d+}.biallelic_snps.{poplist}.total.txt.gz'
shell:
"""
python3 ./src/reformat_plink_freq.py --freq_file {input.freq} --pop_labels {input.pop_labels} --mac True --header True | bgzip -@10 > {output.mac_temp}
python3 ./src/reformat_plink_freq.py --freq_file {input.freq} --pop_labels {input.pop_labels} --mac False --header True | bgzip -@10 > {output.total_temp}
zcat {output.mac_temp} | head -n1 | gzip > {output.tmp_header}
zcat {output.mac_temp} | awk \'NR > 1 {{a[$2]++; b[$2]=$0}} END {{ for (c in a) {{ print b[c] }}}}\' | sort -n -k2 | gzip > {output.mac_temp2}
zcat {output.total_temp} | awk \'NR > 1 {{a[$2]++; b[$2]=$0}} END {{ for (c in a) {{ print b[c] }}}}\' | sort -n -k2 | gzip > {output.total_temp2}
cat {output.tmp_header} {output.mac_temp2} > {output.mac}
cat {output.tmp_header} {output.total_temp2} > {output.total}
rm -f {input.freq}
"""
rule gen_freq_table:
"""
Generate a frequency table with frequencies per population
"""
input:
mac_total = rules.calc_mac_total.output.mac,
total_total = rules.calc_mac_total.output.total
output:
freq_table = TMP_DIR + 'freq_set/{PREFIX}_chr{CHROM, \d+}.biallelic_snps.{poplist}.freq.txt.gz'
shell:
"""
python3 src/gen_freq_table.py --mac {input.mac_total} --total {input.total_total} | bgzip -@10 > {output.freq_table}
rm {input.mac_total}
rm {input.total_total}
"""
rule collapse_freq_tables:
input:
expand(TMP_DIR + 'freq_set/{{PREFIX}}_chr{CHROM}.biallelic_snps.{{poplist}}.freq.txt.gz', CHROM=AUTOSOMES)
output:
tmp_total = temp(TMP_DIR + 'freq_set/{PREFIX}_total.biallelic_snps.{poplist}.freq.total.txt.gz'),
tmp_header = temp(TMP_DIR + 'freq_set/{PREFIX}_total.biallelic_snps.{poplist}.freq.tmp.header.txt.gz'),
total_freq = DATA_DIR + 'freq/{PREFIX}_total.biallelic_snps.{poplist}.freq.total.txt.gz'
shell:
"""
zcat {input} | grep -v ^CHR | gzip > {output.tmp_total}
zgrep CHR {input[0]} | gzip > {output.tmp_header}
cat {output.tmp_header} {output.tmp_total} > {output.total_freq}
"""
# ------------- 2. Running GeoDist algorithm ------------------ #
rule generate_geodist:
"""
Generate a naive version of geodist
"""
input:
rules.collapse_freq_tables.output.total_freq
output:
total_geodist = DATA_DIR + 'geodist/raw/{PREFIX}_total.biallelic_snps.{poplist}.ncat{NCAT}.geodist.total.txt.gz'
wildcard_constraints:
NCAT='(3|3x)',
poplist ='pops|superpops|superpops_amended2'
run:
bins = bin_dict[wildcards.NCAT]
shell('python3 src/geodist.py --freqs {input} --bins \"{bins}\" | bgzip > {output.total_geodist}')
rule count_geodist_categories:
"""
Rule to count number of geodist categories
"""
input: rules.generate_geodist.output.total_geodist
output:
geodist_cnt =
DATA_DIR + 'geodist/counts/total/{PREFIX}_total.biallelic_snps.{poplist}.ncat{NCAT}.filt_{filt,\d+}.geodist_cnt.txt.gz'
wildcard_constraints:
poplist='pops|superpops|superpops_amended2'
shell:
'''
zcat {input} | awk \'NR > 1 && $5 > {wildcards.filt}\' | awk \'{{counts[$7]++}} END{{for(i in counts){{print i,counts[i];}}}}\' | sort -n -k1,1 | gzip > {output.geodist_cnt}
'''
rule count_geodist_all:
input:
expand(DATA_DIR + 'geodist/counts/total/{PREFIX}_total.biallelic_snps.{poplist}.ncat{NCAT}.filt_{filt}.geodist_cnt.txt.gz', PREFIX=PREFIX_FINAL_FILT, filt=[0,1], NCAT=['3x','3'], poplist=['superpops_amended2'])