import os
import scipy as sp
from scipy import linalg
import h5py
import time
import sys
from ldpred import reporting
from ldpred import plinkfiles
from ldpred import util
def get_prs(genotype_file, rs_id_map, phen_map=None, only_score = False, verbose=False):
    """
    Calculate polygenic risk scores for the individuals in a PLINK fileset.

    Parameters
    ----------
    genotype_file : str
        Prefix of the PLINK bed/bim/fam fileset to score.
    rs_id_map : dict
        SNP id -> {'chrom', 'pos', 'nts', 'raw_beta', 'upd_pval_beta'},
        as produced by parse_ldpred_res() / parse_pt_res().
    phen_map : dict or None
        iid -> phenotype info ({'phen'}, optionally 'sex', 'pcs',
        'covariates').  When None (and only_score is False) the FAM
        affection column is used as the phenotype instead.
    only_score : bool
        If True, score every sample and skip all phenotype handling.
    verbose : bool
        Print progress and accuracy information.

    Returns
    -------
    dict
        Always contains 'pval_derived_effects_prs', 'iids', 'num_snps',
        'num_non_matching_nts', 'num_flipped_nts', 'perc_missing', and
        'duplicated_snps'; when only_score is False it additionally has
        'true_phens', 'prs_r2', 'prs_corr' and (if present) 'pcs', 'sex',
        'covariates'.

    Raises
    ------
    Warning
        If more than 1% of the weighted SNPs are missing from the
        validation genotypes.
    """
    plinkf = plinkfiles.plinkfile.PlinkFile(genotype_file)
    samples = plinkf.get_samples()
    if not only_score:
        # 1. Figure out indiv filter and get true phenotypes
        indiv_filter = sp.zeros(len(samples), dtype='bool8')
        true_phens = []
        iids = []
        if phen_map is not None:
            pcs = []
            sex = []
            covariates = []
            phen_iids = set(phen_map.keys())
            # Keep only genotyped samples that also have phenotype info.
            for samp_i, sample in enumerate(samples):
                if sample.iid in phen_iids:
                    indiv_filter[samp_i] = True
                    true_phens.append(phen_map[sample.iid]['phen'])
                    iids.append(sample.iid)
                    if 'pcs' in list(phen_map[sample.iid].keys()):
                        pcs.append(phen_map[sample.iid]['pcs'])
                    if 'sex' in list(phen_map[sample.iid].keys()):
                        sex.append(phen_map[sample.iid]['sex'])
                    if 'covariates' in list(phen_map[sample.iid].keys()):
                        covariates.append(phen_map[sample.iid]['covariates'])
            # Covariate-like information must be all-or-nothing across the
            # phenotyped individuals, otherwise the design matrices built by
            # the caller would be ragged.
            if len(pcs) > 0:
                assert len(pcs) == len(
                    true_phens), 'PC information missing for some individuals with phenotypes'
            if len(sex) > 0:
                assert len(sex) == len(
                    true_phens), 'Sex information missing for some individuals with phenotypes'
            if len(covariates) > 0:
                assert len(covariates) == len(
                    true_phens), 'Covariates missing for some individuals with phenotypes'
        else:
            # No phenotype map: fall back to the FAM affection column.
            # NOTE(review): samples with affection == 2 are excluded here,
            # and pcs/sex/covariates are never defined on this path (see the
            # ret_dict construction below) -- confirm intended.
            for samp_i, sample in enumerate(samples):
                if sample.affection != 2:
                    indiv_filter[samp_i] = True
                    true_phens.append(sample.affection)
                    iids.append(sample.iid)
        num_individs = sp.sum(indiv_filter)
        assert num_individs > 0, 'Issues in parsing the phenotypes and/or PCs?'
        assert not sp.any(sp.isnan(
            true_phens)), 'Phenotypes appear to have some NaNs, or parsing failed.'
        if verbose:
            print('%d individuals have phenotype and genotype information.' % num_individs)
    else:
        iids = [sample.iid for sample in samples]
        num_individs = len(samples)
    num_non_matching_nts = 0
    num_flipped_nts = 0
    pval_derived_effects_prs = sp.zeros(num_individs)
    # If these indices are not in order then we place them in the right place
    # while parsing SNPs.
    if verbose:
        print('Iterating over BED file to calculate risk scores.')
    locus_list = plinkf.get_loci()
    num_loci = len(locus_list)
    loci_i = 0
    snp_i = 0
    duplicated_snps = 0
    found_loci = set()
    for locus, row in zip(locus_list, plinkf):
        loci_i += 1
        if loci_i % 100 == 0:
            # Lightweight progress indicator on a single terminal line.
            sys.stdout.write('\r%0.2f%%' % (100.0 * (float(loci_i) / (num_loci - 1.0))))
            sys.stdout.flush()
        upd_pval_beta = 0
        sid = locus.name
        if sid not in rs_id_map:
            # No weight for this SNP.
            continue
        if sid in found_loci:
            # Score each SNP id at most once.
            duplicated_snps += 1
            continue
        found_loci.add(locus.name)
        rs_info = rs_id_map[sid]
        if rs_info['upd_pval_beta'] == 0:
            # A zero weight contributes nothing to the score.
            continue
        # Check whether the nucleotides are OK, and potentially flip it.
        ss_nt = rs_info['nts']
        g_nt = [locus.allele1, locus.allele2]
        flip_nts = False
        os_g_nt = sp.array(
            [util.opp_strand_dict[g_nt[0]], util.opp_strand_dict[g_nt[1]]])
        if not (sp.all(g_nt == ss_nt) or sp.all(os_g_nt == ss_nt)):
            # Opposite strand nucleotides
            flip_nts = (g_nt[1] == ss_nt[0] and g_nt[0] == ss_nt[1]) or (
                os_g_nt[1] == ss_nt[0] and os_g_nt[0] == ss_nt[1])
            if flip_nts:
                # Alleles are swapped relative to the summary stats:
                # negate the effect so the score sign stays consistent.
                upd_pval_beta = -rs_info['upd_pval_beta']
                num_flipped_nts += 1
            else:
                num_non_matching_nts += 1
                continue
        else:
            upd_pval_beta = rs_info['upd_pval_beta']
        # Parse SNP, and fill in the blanks if necessary.
        if only_score:
            snp = sp.array(row, dtype='int8')
        else:
            snp = sp.array(row, dtype='int8')[indiv_filter]
        bin_counts = row.allele_counts()
        if bin_counts[-1] > 0:
            # Missing genotype calls (coded 3) are imputed with the more
            # common of the first two genotype classes.
            mode_v = sp.argmax(bin_counts[:2])
            snp[snp == 3] = mode_v
        # Update scores and move on.
        # NOTE(review): the -1 factor presumably matches the allele-coding
        # sign convention used in the coordination step -- confirm.
        pval_derived_effects_prs += -1 * snp * upd_pval_beta
        assert not sp.any(sp.isnan(pval_derived_effects_prs)
                          ), 'Some individual weighted effects risk scores are NANs (not a number). They are corrupted.'
        snp_i += 1
    perc_missing = 1 - float(len(found_loci)) / float(len(rs_id_map))
    if perc_missing > 0.01:
        # NOTE(review): this raises an exception (aborting the run) rather
        # than issuing warnings.warn() -- confirm a hard stop is intended.
        raise Warning('More than %0.2f %% of variants for which weights were calculated were not found in '
                      'validation data. This can lead to poor prediction accuracies. Please consider '
                      'using the --vbim flag in the coord step to identify overlapping SNPs.'
                      '\n' % (perc_missing * 100))
    plinkf.close()
    if not verbose:
        sys.stdout.write('\r%0.2f%%\n' % (100.0))
        sys.stdout.flush()
    if verbose:
        print('Number of non-matching NTs: %d' % num_non_matching_nts)
        print('Number of flipped NTs: %d' % num_flipped_nts)
    if not only_score:
        # In-sample (unadjusted) accuracy of the raw score.
        pval_eff_corr = sp.corrcoef(pval_derived_effects_prs, true_phens)[0, 1]
        pval_eff_r2 = pval_eff_corr ** 2
        if verbose:
            print('Current PRS correlation: %0.4f' % pval_eff_corr)
            print('Current PRS r2: %0.4f' % pval_eff_r2)
        ret_dict = {'pval_derived_effects_prs': pval_derived_effects_prs,
                    'true_phens': true_phens[:], 'iids': iids, 'prs_r2': pval_eff_r2, 'prs_corr': pval_eff_corr,
                    'num_snps': snp_i, 'num_non_matching_nts': num_non_matching_nts,
                    'num_flipped_nts': num_flipped_nts, 'perc_missing': perc_missing,
                    'duplicated_snps': duplicated_snps}
        # NOTE(review): pcs/sex/covariates are only bound when phen_map was
        # provided; with phen_map=None and only_score=False these lines
        # raise NameError -- confirm callers never hit that combination.
        if len(pcs) > 0:
            ret_dict['pcs'] = pcs
        if len(sex) > 0:
            ret_dict['sex'] = sex
        if len(covariates) > 0:
            ret_dict['covariates'] = covariates
    else:
        ret_dict = {'pval_derived_effects_prs': pval_derived_effects_prs, 'iids': iids,
                    'num_snps': snp_i, 'num_non_matching_nts': num_non_matching_nts,
                    'num_flipped_nts': num_flipped_nts, 'perc_missing': perc_missing,
                    'duplicated_snps': duplicated_snps
                    }
    return ret_dict
def parse_phen_file(pf, pf_format, verbose=False, summary_dict=None):
    """
    Parse a phenotype file into a map of iid -> phenotype information.

    Fixes over the previous version: removed a leftover debug print of the
    file path, replaced `!= None` with `is not None`, narrowed the bare
    `except:` on the LSTANDARD header probe to (IndexError, ValueError),
    guarded summary_dict writes so the default summary_dict=None no longer
    crashes, and made the format dispatch a consistent if/elif chain.

    Parameters
    ----------
    pf : str or None
        Path to the phenotype file.  When None an empty map is returned.
    pf_format : str
        'FAM' (PLINK .fam columns), 'STANDARD' (IID PHE),
        'LSTANDARD' (FID IID PHE with optional header), or
        'S2' (IID Age Sex Height_Inches).
    verbose : bool
        Print progress information.
    summary_dict : dict or None
        When provided, a description of the parsed file is recorded
        under key 1.

    Returns
    -------
    dict
        iid -> {'phen': float, ...} ('sex' added for FAM, 'age'/'sex'
        for S2).

    Raises
    ------
    Exception
        If an S2 line has a sex value other than 'Male'/'Female'.
    """
    phen_map = {}
    num_phens_found = 0
    if pf is not None:
        if verbose:
            print('Parsing Phenotypes')
        if pf_format == 'FAM':
            # FAM columns:
            #   FID, IID (cannot be '0'), father IID, mother IID,
            #   sex ('1' male, '2' female, '0' unknown),
            #   phenotype ('-9'/'0'/non-numeric = missing for case/control).
            with open(pf, 'r') as f:
                for line in f:
                    l = line.split()
                    iid = l[1]
                    sex = int(l[4])
                    phen = float(l[5])
                    # Skip individuals with unknown sex or missing phenotype.
                    if sex != 0 and phen != -9:
                        phen_map[iid] = {'phen': phen, 'sex': sex}
                        num_phens_found += 1
            if summary_dict is not None:
                summary_dict[1] = {'name': 'Phenotype file (plink format):', 'value': pf}
        elif pf_format == 'STANDARD':
            # Columns: IID PHE
            with open(pf, 'r') as f:
                for line in f:
                    l = line.split()
                    iid = l[0]
                    phen = float(l[1])
                    phen_map[iid] = {'phen': phen}
                    num_phens_found += 1
            if summary_dict is not None:
                summary_dict[1] = {'name': 'Phenotype file (STANDARD format):', 'value': pf}
        elif pf_format == 'LSTANDARD':
            # Columns: FID IID PHE (first line may be a header)
            with open(pf, 'r') as f:
                # Probe the first line: if it does not parse as data it is
                # treated as a header and skipped quietly.
                line = f.readline()
                l = line.split()
                try:
                    iid = l[1]
                    phen = float(l[2])
                    phen_map[iid] = {'phen': phen}
                    num_phens_found += 1
                except (IndexError, ValueError):
                    # Header (or empty) line: ignore it.
                    pass
                for line in f:
                    l = line.split()
                    iid = l[1]
                    phen = float(l[2])
                    phen_map[iid] = {'phen': phen}
                    num_phens_found += 1
            if summary_dict is not None:
                summary_dict[1] = {'name': 'Phenotype file (LSTANDARD format):', 'value': pf}
        elif pf_format == 'S2':
            # Columns: IID Age Sex Height_Inches
            with open(pf, 'r') as f:
                for line in f:
                    l = line.split()
                    iid = l[0]
                    age = float(l[1])
                    if l[2] == 'Male':
                        sex = 1
                    elif l[2] == 'Female':
                        sex = 2
                    else:
                        raise Exception('Sex missing')
                    phen = float(l[3])
                    phen_map[iid] = {'phen': phen, 'age': age, 'sex': sex}
                    num_phens_found += 1
            if summary_dict is not None:
                summary_dict[1] = {'name': 'Phenotype file (S2 format):', 'value': pf}
    print("Parsed %d phenotypes successfully" % num_phens_found)
    return phen_map
def parse_ldpred_res(file_name):
    """
    Load an LDpred SNP-weights file into a dict keyed by SNP id.

    Expected layout (one header line, whitespace-separated):
        chrom    pos    sid    nt1    nt2    raw_beta    ldpred_inf_beta    ldpred_beta
        chrom_1, 798959, rs11240777, C, T, -1.1901e-02, 3.2443e-03, 2.6821e-04

    Returns a dict: sid -> {'chrom', 'pos', 'nts', 'raw_beta',
    'upd_pval_beta'}.
    """
    rs_id_map = {}
    with open(file_name, 'r') as f:
        next(f)  # skip the header line
        for line in f:
            fields = line.split()
            sid = fields[2].strip()
            rs_id_map[sid] = {
                # Chromosome is encoded as e.g. 'chrom_1'; strip the prefix.
                'chrom': int(fields[0][6:]),
                'pos': int(fields[1]),
                'nts': [fields[3].strip(), fields[4].strip()],
                'raw_beta': float(fields[5]),
                'upd_pval_beta': float(fields[6]),
            }
    return rs_id_map
def parse_pt_res(file_name):
    """
    Load a P+T (pruning + thresholding) SNP-weights file.

    Expected layout (one header line, whitespace-separated):
        chrom  pos  sid  nt1  nt2  raw_beta  raw_pval_beta  upd_beta  upd_pval_beta
        chrom_1  798959  rs11240777  C  T  -1.1901e-02  -1.1901e-02  2.6821e-04  2.6821e-04

    SNPs with raw_beta == 0 were pruned out and are skipped.

    Returns (rs_id_map, non_zero_chromosomes) where rs_id_map maps
    sid -> {'chrom', 'pos', 'nts', 'raw_beta', 'upd_pval_beta'} and
    non_zero_chromosomes is the set of chromosomes with retained SNPs.
    """
    rs_id_map = {}
    non_zero_chromosomes = set()
    with open(file_name, 'r') as f:
        next(f)  # skip the header line
        for line in f:
            fields = line.split()
            # Parse every column up-front (as the original format requires),
            # then filter on the raw effect size.
            chrom = int(fields[0][6:])  # column looks like 'chrom_1'
            pos = int(fields[1])
            sid = fields[2].strip()
            nts = [fields[3].strip(), fields[4].strip()]
            raw_beta = float(fields[5])
            upd_pval_beta = float(fields[8])
            if raw_beta != 0:
                rs_id_map[sid] = {
                    'chrom': chrom,
                    'pos': pos,
                    'nts': nts,
                    'raw_beta': raw_beta,
                    'upd_pval_beta': upd_pval_beta,
                }
                non_zero_chromosomes.add(chrom)
    return rs_id_map, non_zero_chromosomes
def write_scores_file(out_file, prs_dict, pval_derived_effects_prs, adj_pred_dict,
                      output_regression_betas=False, weights_dict=None, verbose=False):
    """
    Write per-individual polygenic scores (and optional adjusted scores
    and regression weights) to disk.

    Writes a CSV-style file to out_file with columns IID, true_phens, PRS
    plus optional sex and PC columns; if adj_pred_dict is non-empty an
    additional '<out_file>.adj' file with the adjusted predictions is
    written, and if output_regression_betas is set the regression weights
    are stored in '<out_file>.weights.hdf5'.
    """
    num_individs = len(prs_dict['iids'])
    has_sex = 'sex' in prs_dict
    has_pcs = 'pcs' in prs_dict
    with open(out_file, 'w') as f:
        if verbose:
            print ('Writing polygenic scores to file %s'%out_file)
        # Header row mirrors the optional columns present in prs_dict.
        header = 'IID, true_phens, PRS'
        if has_sex:
            header += ', sex'
        if has_pcs:
            header += ', ' + ', '.join('PC%d' % (pc_i + 1)
                                       for pc_i in range(len(prs_dict['pcs'][0])))
        f.write(header + '\n')
        for i in range(num_individs):
            record = '%s, %0.6e, %0.6e' % (prs_dict['iids'][i], prs_dict['true_phens'][i],
                                           pval_derived_effects_prs[i])
            if has_sex:
                record += ', %d' % prs_dict['sex'][i]
            if has_pcs:
                record += ', ' + ', '.join(map(str, prs_dict['pcs'][i]))
            f.write(record + '\n')
    adj_prs_labels = list(adj_pred_dict.keys())
    if adj_prs_labels:
        # Companion file with one extra column per adjusted prediction.
        with open(out_file + '.adj', 'w') as f:
            f.write('IID, true_phens, PRS, ' + ', '.join(adj_prs_labels) + '\n')
            for i in range(num_individs):
                record = '%s, %0.6e, %0.6e' % (prs_dict['iids'][i], prs_dict['true_phens'][i],
                                               pval_derived_effects_prs[i])
                for adj_prs in adj_prs_labels:
                    record += ', %0.4f' % adj_pred_dict[adj_prs][i]
                f.write(record + '\n')
    if output_regression_betas and weights_dict is not None:
        # One HDF5 group per adjustment scheme, one dataset per weight.
        hdf5file = out_file + '.weights.hdf5'
        if verbose:
            print ('Writing PRS regression weights to file %s'%hdf5file)
        oh5f = h5py.File(hdf5file, 'w')
        for k1 in list(weights_dict.keys()):
            kg = oh5f.create_group(k1)
            for k2 in weights_dict[k1]:
                kg.create_dataset(k2, data=sp.array(weights_dict[k1][k2]))
        oh5f.close()
def write_only_scores_file(out_file, prs_dict, pval_derived_effects_prs):
    """
    Write one 'IID, PRS' line per individual to out_file.

    Used when --only-score is set and no phenotypes are available.
    """
    iids = prs_dict['iids']
    with open(out_file, 'w') as f:
        print ('Writing polygenic scores to file %s'%out_file)
        f.write('IID, PRS \n')
        for i in range(len(iids)):
            f.write('%s, %0.6e\n' % (iids[i], pval_derived_effects_prs[i]))
def calc_risk_scores(bed_file, rs_id_map, phen_map, out_file=None,
                     split_by_chrom=False, adjust_for_sex=False,
                     adjust_for_covariates=False, adjust_for_pcs=False,
                     non_zero_chromosomes=None, only_score = False,
                     verbose=False, summary_dict=None):
    """
    Score the individuals in a PLINK fileset and (optionally) evaluate
    prediction accuracy against the phenotypes, with least-squares
    adjustments for sex, principal components, and covariates.

    Parameters
    ----------
    bed_file : str
        PLINK fileset prefix (per-chromosome suffixes are appended when
        split_by_chrom is True).
    rs_id_map : dict
        SNP weights, as returned by parse_ldpred_res()/parse_pt_res().
    phen_map : dict or None
        iid -> phenotype info; may be None when only_score is True.
    out_file : str or None
        If given, scores are written here via write_scores_file()/
        write_only_scores_file().
    split_by_chrom : bool
        Expect one fileset per chromosome ('<bed_file>_<i>_keep').
    adjust_for_sex, adjust_for_covariates, adjust_for_pcs : bool
        Enable the corresponding regression adjustments.
    non_zero_chromosomes : set or None
        Restrict the per-chromosome scan to these chromosomes.
    only_score : bool
        Skip all accuracy evaluation; just write raw scores.
    verbose : bool
        Print progress and fit details.
    summary_dict : dict or None
        Accepted for interface consistency; not written to here.

    Returns
    -------
    dict with SNP-matching statistics and the (possibly adjusted)
    prediction R2 values ('pred_r2', 'corr_r2', plus one key per
    adjustment performed).
    """
    if verbose:
        print('Parsing PLINK bed file: %s' % bed_file)
    if split_by_chrom:
        # One genotype fileset per chromosome: accumulate the score.
        num_individs = len(phen_map)
        assert num_individs > 0, 'No individuals found. Problems parsing the phenotype file?'
        pval_derived_effects_prs = sp.zeros(num_individs)
        for i in range(1, 23):
            if non_zero_chromosomes is None or i in non_zero_chromosomes:
                genotype_file = bed_file + '_%i_keep' % i
                if os.path.isfile(genotype_file + '.bed'):
                    if verbose:
                        print('Working on chromosome %d' % i)
                    prs_dict = get_prs(genotype_file, rs_id_map, phen_map, only_score=only_score, verbose=verbose)
                    pval_derived_effects_prs += prs_dict['pval_derived_effects_prs']
            elif verbose:
                print('Skipping chromosome')
    else:
        prs_dict = get_prs(bed_file, rs_id_map, phen_map, only_score=only_score, verbose=verbose)
        num_individs = len(prs_dict['iids'])
        pval_derived_effects_prs = prs_dict['pval_derived_effects_prs']
    # NOTE(review): on the split_by_chrom path, the statistics below come
    # from the last chromosome processed only -- confirm intended.
    res_dict = {'num_snps': prs_dict['num_snps'], 'num_non_matching_nts': prs_dict['num_non_matching_nts'],
                'num_flipped_nts': prs_dict['num_flipped_nts'], 'perc_missing': prs_dict['perc_missing'],
                'duplicated_snps': prs_dict['duplicated_snps'], 'pred_r2': 0, 'corr_r2': 0}
    if only_score:
        write_only_scores_file(out_file, prs_dict, pval_derived_effects_prs)
    elif sp.std(prs_dict['true_phens']) == 0:
        # Constant phenotype: correlation/R2 would be undefined.
        if verbose:
            print('No variance left to explain in phenotype.')
    else:
        # Report prediction accuracy
        assert len(phen_map) > 0, 'No individuals found. Problems parsing the phenotype file?'
        # Store covariate weights, slope, etc.
        weights_dict = {}
        # Store Adjusted predictions
        adj_pred_dict = {}
        #If there is no prediction, then output 0s.
        if sp.std(pval_derived_effects_prs) == 0:
            weights_dict['unadjusted'] = {'Intercept': 0, 'ldpred_prs_effect': 0}
        else:
            pval_eff_corr = sp.corrcoef(pval_derived_effects_prs, prs_dict['true_phens'])[0, 1]
            pval_eff_r2 = pval_eff_corr ** 2
            res_dict['pred_r2'] = pval_eff_r2
            res_dict['corr_r2'] = pval_eff_corr
            # Reshape to column vectors for the lstsq design matrices.
            pval_derived_effects_prs.shape = (len(pval_derived_effects_prs), 1)
            true_phens = sp.array(prs_dict['true_phens'])
            true_phens.shape = (len(true_phens), 1)
            # Direct effect
            Xs = sp.hstack([pval_derived_effects_prs, sp.ones((len(true_phens), 1))])
            # rss00: residual sum of squares of the intercept-only model,
            # used as the baseline for all R2 calculations below.
            (betas, rss00, r, s) = linalg.lstsq(
                sp.ones((len(true_phens), 1)), true_phens)
            (betas, rss, r, s) = linalg.lstsq(Xs, true_phens)
            pred_r2 = 1 - rss / rss00
            weights_dict['unadjusted'] = {
                'Intercept': betas[1][0], 'ldpred_prs_effect': betas[0][0]}
            if verbose:
                print('PRS trait correlation: %0.4f R2: %0.4f' % (pval_eff_corr, pred_r2))
            # Adjust for sex
            if adjust_for_sex and 'sex' in prs_dict and len(prs_dict['sex']) > 0:
                sex = sp.array(prs_dict['sex'])
                sex.shape = (len(sex), 1)
                # rss0: baseline model with sex only; rss_pd: full model.
                (betas, rss0, r, s) = linalg.lstsq(
                    sp.hstack([sex, sp.ones((len(true_phens), 1))]), true_phens)
                Xs = sp.hstack([pval_derived_effects_prs, sex,
                                sp.ones((len(true_phens), 1))])
                (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens)
                weights_dict['sex_adj'] = {
                    'Intercept': betas[2][0], 'ldpred_prs_effect': betas[0][0], 'sex': betas[1][0]}
                if verbose:
                    print('Fitted effects (betas) for PRS, sex, and intercept on true phenotype:', betas)
                adj_pred_dict['sex_prs'] = sp.dot(Xs, betas)
                pred_r2 = 1 - rss_pd / rss0
                print('Variance explained (Pearson R2) by PRS adjusted for Sex: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)))
                res_dict['Sex_adj_pred_r2'] = pred_r2
                pred_r2 = 1 - rss_pd / rss00
                print('Variance explained (Pearson R2) by PRS + Sex : %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)))
                res_dict['Sex_adj_pred_r2+Sex'] = pred_r2
            # Adjust for PCs
            if adjust_for_pcs and 'pcs' in prs_dict and len(prs_dict['pcs']) > 0:
                pcs = prs_dict['pcs']
                (betas, rss0, r, s) = linalg.lstsq(
                    sp.hstack([pcs, sp.ones((len(true_phens), 1))]), true_phens)
                Xs = sp.hstack([pval_derived_effects_prs,
                                sp.ones((len(true_phens), 1)), pcs])
                (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens)
                weights_dict['pc_adj'] = {
                    'Intercept': betas[1][0], 'ldpred_prs_effect': betas[0][0], 'pcs': betas[2][0]}
                adj_pred_dict['pc_prs'] = sp.dot(Xs, betas)
                pred_r2 = 1 - rss_pd / rss0
                print('Variance explained (Pearson R2) by PRS adjusted for PCs: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)))
                res_dict['PC_adj_pred_r2'] = pred_r2
                pred_r2 = 1 - rss_pd / rss00
                print('Variance explained (Pearson R2) by PRS + PCs: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)))
                res_dict['PC_adj_pred_r2+PC'] = pred_r2
                # Adjust for both PCs and Sex
                if adjust_for_sex and 'sex' in prs_dict and len(prs_dict['sex']) > 0:
                    sex = sp.array(prs_dict['sex'])
                    sex.shape = (len(sex), 1)
                    (betas, rss0, r, s) = linalg.lstsq(
                        sp.hstack([sex, pcs, sp.ones((len(true_phens), 1))]), true_phens)
                    Xs = sp.hstack([pval_derived_effects_prs, sex,
                                    sp.ones((len(true_phens), 1)), pcs])
                    (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens)
                    weights_dict['sex_pc_adj'] = {
                        'Intercept': betas[2][0], 'ldpred_prs_effect': betas[0][0], 'sex': betas[1][0], 'pcs': betas[3][0]}
                    adj_pred_dict['sex_pc_prs'] = sp.dot(Xs, betas)
                    pred_r2 = 1 - rss_pd / rss0
                    print('Variance explained (Pearson R2) by PRS adjusted for PCs and Sex: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)))
                    res_dict['PC_Sex_adj_pred_r2'] = pred_r2
                    pred_r2 = 1 - rss_pd / rss00
                    print('Variance explained (Pearson R2) by PRS+PCs+Sex: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)))
                    res_dict['PC_Sex_adj_pred_r2+PC_Sex'] = pred_r2
            # Adjust for covariates
            if adjust_for_covariates and 'covariates' in prs_dict and len(prs_dict['covariates']) > 0:
                covariates = prs_dict['covariates']
                (betas, rss0, r, s) = linalg.lstsq(
                    sp.hstack([covariates, sp.ones((len(true_phens), 1))]), true_phens)
                Xs = sp.hstack([pval_derived_effects_prs, covariates,
                                sp.ones((len(true_phens), 1))])
                (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens)
                adj_pred_dict['cov_prs'] = sp.dot(Xs, betas)
                pred_r2 = 1 - rss_pd / rss0
                print('Variance explained (Pearson R2) by PRS adjusted for Covariates: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)))
                res_dict['Cov_adj_pred_r2'] = pred_r2
                pred_r2 = 1 - rss_pd / rss00
                print('Variance explained (Pearson R2) by PRS + Cov: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)))
                res_dict['Cov_adj_pred_r2+Cov'] = pred_r2
                if adjust_for_pcs and 'pcs' in prs_dict and len(prs_dict['pcs']) and 'sex' in prs_dict and len(prs_dict['sex']) > 0:
                    pcs = prs_dict['pcs']
                    sex = sp.array(prs_dict['sex'])
                    sex.shape = (len(sex), 1)
                    (betas, rss0, r, s) = linalg.lstsq(
                        sp.hstack([covariates, sex, pcs, sp.ones((len(true_phens), 1))]), true_phens)
                    Xs = sp.hstack([pval_derived_effects_prs, covariates,
                                    sex, pcs, sp.ones((len(true_phens), 1))])
                    (betas, rss_pd, r, s) = linalg.lstsq(Xs, true_phens)
                    adj_pred_dict['cov_sex_pc_prs'] = sp.dot(Xs, betas)
                    pred_r2 = 1 - rss_pd / rss0
                    print('Variance explained (Pearson R2) by PRS adjusted for Cov+PCs+Sex: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)))
                    res_dict['Cov_PC_Sex_adj_pred_r2'] = pred_r2
                    pred_r2 = 1 - rss_pd / rss00
                    print('Variance explained (Pearson R2) by PRS+Cov+PCs+Sex: %0.4f (%0.6f)' % (pred_r2, (1 - pred_r2) / sp.sqrt(num_individs)))
                    res_dict['Cov_PC_Sex_adj_pred_r2+Cov_PC_Sex'] = pred_r2
            # Now calibration
            # Slope of regressing the normalized phenotype on the raw PRS;
            # a well-calibrated score has slope ~1.
            y_norm = (true_phens - sp.mean(true_phens)) / sp.std(true_phens)
            denominator = sp.dot(pval_derived_effects_prs.T, pval_derived_effects_prs)
            numerator = sp.dot(pval_derived_effects_prs.T, y_norm)
            regression_slope = (numerator / denominator)[0][0]
            if verbose:
                print('The slope for predictions with weighted effects is: %0.4f' % regression_slope)
        num_individs = len(prs_dict['pval_derived_effects_prs'])
        # Write PRS out to file.
        if out_file != None:
            write_scores_file(out_file, prs_dict, pval_derived_effects_prs, adj_pred_dict,
                              weights_dict=weights_dict, verbose=verbose)
    return res_dict
def parse_covariates(p_dict, phen_map, summary_dict, verbose):
    """
    Read the covariates file (p_dict['cov_file']) and attach covariate
    values to the matching individuals in phen_map (in place).

    Each line is whitespace-separated: IID followed by numeric covariate
    values.  Individuals not present in phen_map are counted and reported
    in summary_dict (key 2.1); the parsed file is recorded under key 2.
    """
    num_missing = 0
    with open(p_dict['cov_file'], 'r') as f:
        for line in f:
            fields = line.split()
            iid = fields[0]
            if iid not in phen_map:
                num_missing += 1
                continue
            phen_map[iid]['covariates'] = [float(v) for v in fields[1:]]
    if num_missing > 0:
        summary_dict[2.1] = {'name': 'Individuals w missing covariate information:', 'value': num_missing}
        if verbose:
            print('Unable to find %d iids in phen file!' % num_missing)
    summary_dict[2] = {'name': 'Parsed covariates file:', 'value': p_dict['cov_file']}
def parse_pcs(p_dict, phen_map, summary_dict, verbose):
    """
    Read the principal-components file (p_dict['pcs_file']) and attach PC
    values to the matching individuals in phen_map (in place).

    Each line is whitespace-separated: FID, IID, then numeric PC values.
    Individuals not present in phen_map are counted and reported in
    summary_dict (key 3.1); the parsed file is recorded under key 3.
    """
    num_missing = 0
    with open(p_dict['pcs_file'], 'r') as f:
        for line in f:
            fields = line.split()
            iid = fields[1]
            if iid not in phen_map:
                num_missing += 1
                continue
            phen_map[iid]['pcs'] = [float(v) for v in fields[2:]]
    if num_missing > 0:
        summary_dict[3.1] = {'name': 'Individuals w missing PCs:', 'value': num_missing}
        if verbose:
            print('Unable to find %d iids in phen file!' % num_missing)
    summary_dict[3] = {'name': 'Parsed PCs file:', 'value': p_dict['pcs_file']}
def main(p_dict):
    """
    Entry point for the LDpred scoring step.

    Parses phenotypes (and optional covariates/PCs), then for every SNP
    weight file found under the --rf prefix (LDpred-inf, LDpred, LDpred-fast,
    and/or P+T, depending on p_dict['rf_format']) computes polygenic scores,
    optionally evaluates prediction accuracy, and finally prints a summary
    via reporting.print_summary().

    Parameters
    ----------
    p_dict : dict
        Parsed command-line parameters.  Keys read here include 'gf', 'rf',
        'out', 'pf', 'pf_format', 'cov_file', 'pcs_file', 'rf_format', 'f',
        'p', 'r2', 'split_by_chrom', 'only_score', 'summary_file', 'debug'.
    """
    assert p_dict['summary_file'] is None or not p_dict['only_score'], 'Prediction summary file cannot be produced when the --only-score flag is set.'
    summary_dict = {}
    non_zero_chromosomes = set()
    verbose = p_dict['debug']
    t0 = time.time()
    # Float keys order the lines of the printed summary.
    summary_dict[0] = {'name': 'Validation genotype file (prefix):', 'value': p_dict['gf']}
    summary_dict[0.1] = {'name': 'Input weight file(s) (prefix):', 'value': p_dict['rf']}
    summary_dict[0.2] = {'name': 'Output scores file(s) (prefix):', 'value': p_dict['out']}
    adjust_for_pcs = False
    adjust_for_covs = False
    if not p_dict['only_score']:
        summary_dict[0.9] = {'name': 'dash', 'value': 'Phenotypes'}
        if verbose:
            print('Parsing phenotypes')
        if p_dict['pf'] is None:
            # No phenotype file given: fall back to the FAM file phenotypes.
            if p_dict['gf'] is not None:
                phen_map = parse_phen_file(p_dict['gf'] + '.fam', 'FAM', verbose=verbose, summary_dict=summary_dict)
            else:
                raise Exception('Validation phenotypes were not found.')
        else:
            phen_map = parse_phen_file(p_dict['pf'], p_dict['pf_format'], verbose=verbose, summary_dict=summary_dict)
        t1 = time.time()
        t = (t1 - t0)
        summary_dict[1.1] = {'name': 'Individuals with phenotype information:', 'value': len(phen_map)}
        summary_dict[1.2] = {'name': 'Running time for parsing phenotypes:', 'value': '%d min and %0.2f secs' % (t / 60, t % 60)}
        if p_dict['cov_file'] != None:
            adjust_for_covs = True
            if verbose:
                print('Parsing additional covariates')
            parse_covariates(p_dict, phen_map, summary_dict, verbose)
        if p_dict['pcs_file']:
            adjust_for_pcs = True
            if verbose:
                print('Parsing PCs')
            parse_pcs(p_dict, phen_map, summary_dict, verbose)
        num_individs = len(phen_map)
        assert num_individs > 0, 'No phenotypes were found!'
    else:
        phen_map = None
    t0 = time.time()
    prs_file_is_missing = True
    res_dict = {}
    if p_dict['rf_format'] == 'LDPRED' or p_dict['rf_format'] == 'ANY':
        # LDpred-inf weights (a single file, no fraction parameter).
        weights_file = '%s_LDpred-inf.txt' % (p_dict['rf'])
        if os.path.isfile(weights_file):
            print('')
            print('Calculating LDpred-inf risk scores')
            rs_id_map = parse_ldpred_res(weights_file)
            out_file = '%s_LDpred-inf.txt' % (p_dict['out'])
            res_dict['LDpred_inf'] = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map, out_file=out_file,
                                                      split_by_chrom=p_dict['split_by_chrom'],
                                                      adjust_for_pcs=adjust_for_pcs,
                                                      adjust_for_covariates=adjust_for_covs,
                                                      only_score=p_dict['only_score'],
                                                      verbose=verbose, summary_dict=summary_dict)
            if not p_dict['only_score']:
                summary_dict[5.2] = {'name': 'LDpred_inf (unadjusted) Pearson R2:', 'value': '%0.4f' % res_dict['LDpred_inf']['pred_r2']}
            prs_file_is_missing = False
        # One weight file per LDpred causal fraction p; track the best R2.
        best_ldpred_pred_r2 = 0
        best_p = None
        for p in p_dict['f']:
            weights_file = '%s_LDpred_p%0.4e.txt' % (p_dict['rf'], p)
            if os.path.isfile(weights_file):
                print('')
                print('Calculating LDpred risk scores using f=%0.3e' % p)
                rs_id_map = parse_ldpred_res(weights_file)
                out_file = '%s_LDpred_p%0.4e.txt' % (p_dict['out'], p)
                method_str = 'LDpred_p%0.4e' % (p)
                res_dict[method_str] = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map, out_file=out_file,
                                                        split_by_chrom=p_dict['split_by_chrom'],
                                                        adjust_for_pcs=adjust_for_pcs,
                                                        adjust_for_covariates=adjust_for_covs,
                                                        only_score=p_dict['only_score'],
                                                        verbose=verbose, summary_dict=summary_dict)
                if len(res_dict[method_str]) and (res_dict[method_str]['pred_r2']) > best_ldpred_pred_r2:
                    best_ldpred_pred_r2 = res_dict[method_str]['pred_r2']
                    best_p = p
                prs_file_is_missing = False
        if best_ldpred_pred_r2 > 0 and not p_dict['only_score']:
            summary_dict[5.3] = {'name': 'Best LDpred (f=%0.2e) (unadjusted) R2:' % (best_p), 'value': '%0.4f' % best_ldpred_pred_r2}
        # Same scan for LDpred-fast weight files.
        best_ldpred_fast_pred_r2 = 0
        best_p = None
        for p in p_dict['f']:
            weights_file = '%s_LDpred_fast_p%0.4e.txt' % (p_dict['rf'], p)
            if os.path.isfile(weights_file):
                print('')
                print('Calculating LDpred-fast risk scores using f=%0.3e' % p)
                rs_id_map = parse_ldpred_res(weights_file)
                out_file = '%s_LDpred_fast_p%0.4e.txt' % (p_dict['out'], p)
                method_str = 'LDpred_fast_p%0.4e' % (p)
                res_dict[method_str] = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map, out_file=out_file,
                                                        split_by_chrom=p_dict['split_by_chrom'],
                                                        adjust_for_pcs=adjust_for_pcs,
                                                        adjust_for_covariates=adjust_for_covs,
                                                        only_score=p_dict['only_score'],
                                                        verbose=verbose, summary_dict=summary_dict)
                if len(res_dict[method_str]) and (res_dict[method_str]['pred_r2']) > best_ldpred_fast_pred_r2:
                    best_ldpred_fast_pred_r2 = res_dict[method_str]['pred_r2']
                    best_p = p
                prs_file_is_missing = False
        if best_ldpred_fast_pred_r2 > 0 and not p_dict['only_score']:
            summary_dict[5.4] = {'name': 'Best LDpred-fast (f=%0.2e) (unadjusted) R2:' % (best_p), 'value': '%0.4f' % best_ldpred_fast_pred_r2}
        # Plot results?
    if p_dict['rf_format'] == 'P+T' or p_dict['rf_format'] == 'ANY':
        # P+T weights: one file per (r2 threshold, p-value threshold) pair.
        best_pt_pred_r2 = 0
        best_t = None
        best_r2 = None
        for max_r2 in p_dict['r2']:
            for p_thres in p_dict['p']:
                weights_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (p_dict['rf'], max_r2, p_thres)
                if os.path.isfile(weights_file):
                    print('Calculating P+T risk scores using p-value threshold of %0.3e, and r2 threshold of %0.2f' % (p_thres, max_r2))
                    rs_id_map, non_zero_chromosomes = parse_pt_res(weights_file)
                    if len(rs_id_map) > 0:
                        out_file = '%s_P+T_r%0.2f_p%0.4e.txt' % (p_dict['out'], max_r2, p_thres)
                        method_str = 'P+T_r%0.2f_p%0.4e' % (max_r2, p_thres)
                        res_dict[method_str] = calc_risk_scores(p_dict['gf'], rs_id_map, phen_map, out_file=out_file,
                                                                split_by_chrom=p_dict['split_by_chrom'],
                                                                non_zero_chromosomes=non_zero_chromosomes,
                                                                adjust_for_pcs=adjust_for_pcs,
                                                                adjust_for_covariates=adjust_for_covs,
                                                                only_score=p_dict['only_score'],
                                                                verbose=verbose, summary_dict=summary_dict)
                        if len(res_dict[method_str]) and (res_dict[method_str]['pred_r2']) > best_pt_pred_r2:
                            best_pt_pred_r2 = res_dict[method_str]['pred_r2']
                            best_t = p_thres
                            best_r2 = max_r2
                    else:
                        print('No SNPs found with p-values below the given threshold.')
                    prs_file_is_missing = False
        if best_pt_pred_r2 > 0 and not p_dict['only_score']:
            summary_dict[5.5] = {'name': 'Best P+T (r2=%0.2f, p=%0.2e) (unadjusted) R2:' % (best_r2, best_t), 'value': '%0.4f' % best_pt_pred_r2}
        # Plot results?
    assert not prs_file_is_missing, 'No SNP weights file was found. A prefix to these should be provided via the --rf flag. Note that the prefix should exclude the _LDpred_.. extension or file ending. '
    res_summary_file = p_dict['summary_file']
    if res_summary_file is not None and not p_dict['only_score']:
        # Tabulate per-method accuracy in a plain text summary file.
        with open(res_summary_file, 'w') as f:
            if verbose:
                print ('Writing Results Summary to file %s'%res_summary_file)
            out_str = 'Pred_Method Pred_corr Pred_R2 SNPs_used\n'
            f.write(out_str)
            for method_str in sorted(res_dict):
                out_str = '%s %0.4f %0.4f %i\n' % (method_str, res_dict[method_str]['corr_r2'], res_dict[method_str]['pred_r2'], res_dict[method_str]['num_snps'])
                f.write(out_str)
    #Identifying the best prediction
    if not p_dict['only_score']:
        best_pred_r2 = 0
        best_method_str = None
        for method_str in res_dict:
            if len(res_dict[method_str]) and (res_dict[method_str]['pred_r2']) > best_pred_r2:
                best_pred_r2 = res_dict[method_str]['pred_r2']
                best_method_str = method_str
        if best_method_str is not None:
            print('The highest (unadjusted) Pearson R2 was %0.4f, and provided by %s' % (best_pred_r2, best_method_str))
            summary_dict[5.99] = {'name': 'dash', 'value': 'Optimal polygenic score'}
            summary_dict[6] = {'name': 'Method with highest (unadjusted) Pearson R2:', 'value': best_method_str}
            summary_dict[6.1] = {'name': 'Best (unadjusted) Pearson R2:', 'value': '%0.4f' % best_pred_r2}
            if verbose:
                summary_dict[6.2] = {'name': 'Number of SNPs used', 'value': '%d' % res_dict[best_method_str]['num_snps']}
                summary_dict[6.3] = {'name': 'Number of SNPs flipped', 'value': '%d' % res_dict[best_method_str]['num_flipped_nts']}
                summary_dict[6.4] = {'name': 'Fraction of SNPs not found in validation data', 'value': '%0.4f' % res_dict[best_method_str]['perc_missing']}
                summary_dict[6.5] = {'name': 'Number of duplicated SNPs', 'value': '%d' % res_dict[best_method_str]['duplicated_snps']}
                summary_dict[6.6] = {'name': 'Number of non-matching nucleotides SNPs', 'value': '%d' % res_dict[best_method_str]['num_non_matching_nts']}
    t1 = time.time()
    t = (t1 - t0)
    summary_dict[4.9] = {'name': 'dash', 'value': 'Scoring'}
    summary_dict[5.9] = {'name': 'Running time for calculating scores:', 'value': '%d min and %0.2f secs' % (t / 60, t % 60)}
    if prs_file_is_missing:
        print('SNP weights files were not found. This could be due to a mis-specified --rf flag, or other issues.')
    reporting.print_summary(summary_dict, 'Scoring Summary')