https://github.com/bvilhjal/ldpred
Tip revision: aa6b27c8ba9b88e77731ca3ccceb585f7f2362fc authored by Bjarni J. Vilhjalmsson on 21 November 2019, 07:31:08 UTC
Merge pull request #126 from foresitelabs/master
Merge pull request #126 from foresitelabs/master
Tip revision: aa6b27c
sum_stats_parsers.py
import gzip
import sys
import time

import numpy as np
import scipy as sp
from scipy import isfinite
from scipy import stats

from ldpred import util
def parse_sum_stats(h5f, p_dict, bimfile, summary_dict):
    """Parse summary statistics according to the configured file format.

    Looks up the column-name mapping for ``p_dict['ssf_format']``, forwards
    it together with the shared settings to ``parse_sum_stats_custom``, and
    records the elapsed wall-clock time in ``summary_dict``.
    """
    started = time.time()
    shared_args = {'filename': p_dict['ssf'], 'bimfile': bimfile, 'hdf5_file': h5f,
                   'only_hm3': p_dict['only_hm3'], 'n': p_dict['N'],
                   'debug': p_dict['debug'], 'summary_dict': summary_dict,
                   'match_genomic_pos': p_dict['match_genomic_pos'],
                   'z_from_se': p_dict['z_from_se']}
    ssf_format = p_dict['ssf_format']
    # These formats carry no usable per-SNP sample-size column, so the
    # overall sample size must be supplied on the command line.
    if ssf_format in ('STANDARD', 'PGC_OLD', 'BASIC') and p_dict['N'] is None:
        raise Exception('This summary statistics format requires summary statistics sample size to be given, i.e. set --N flag.')
    # Column-name mappings for the supported fixed file formats.  Example
    # headers:
    #   LDPRED:   CHR POS SNP_ID REF ALT REF_FRQ PVAL BETA SE N
    #   STANDARD: chr pos ref alt reffrq info rs pval effalt
    #   PGC_OLD:  hg19chrc snpid a1 a2 bp info or se p ngt
    #   BASIC:    hg19chrc snpid a1 a2 bp or p
    #   GIANT:    MarkerName Allele1 Allele2 Freq.Allele1.HapMapCEU p N
    #   GIANT2:   MarkerName A1 A2 Freq.Hapmap.Ceu BETA SE.2gc P.2gc N
    fixed_formats = {
        'LDPRED': dict(ch='CHR', pos='POS', A1='REF', A2='ALT', reffreq='REF_FRQ',
                       info='info', rs='SNP_ID', pval='PVAL', eff='BETA', se='SE',
                       ncol='N', eff_type='LINREG'),
        'STANDARD': dict(ch='chr', pos='pos', A1='ref', A2='alt', reffreq='reffrq',
                         info='info', rs='rs', pval='pval', eff='effalt',
                         eff_type='LINREG'),
        'PGC_OLD': dict(ch='hg19chrc', pos='bp', A1='a1', A2='a2', info='info',
                        rs='snpid', pval='p', eff='or', se='se', eff_type='OR'),
        'BASIC': dict(ch='hg19chrc', pos='bp', A1='a1', A2='a2', info='info',
                      rs='snpid', pval='p', eff='or', eff_type='OR'),
        'PGC': dict(ch='CHR', pos='BP', A1='A1', A2='A2', reffreq='Freq.Hapmap.Ceu',
                    case_freq='FRQ_A_34129', control_freq='FRQ_U_45512',
                    case_n='Nca', control_n='Nco', info='INFO', rs='SNP',
                    pval='P', eff='OR', se='SE', eff_type='OR'),
        'GIANT': dict(A1='Allele1', A2='Allele2', reffreq='Freq.Allele1.HapMapCEU',
                      rs='MarkerName', pval='p', eff='b', ncol='N',
                      eff_type='LINREG'),
        'GIANT2': dict(ncol='N', A1='A1', A2='A2', reffreq='Freq.Hapmap.Ceu',
                       rs='MarkerName', pval='P.2gc', eff='BETA', se='SE.2gc',
                       eff_type='LINREG'),
    }
    if ssf_format == 'CUSTOM':
        # For CUSTOM every column name comes straight from the user options;
        # built lazily so other formats never touch these p_dict keys.
        column_map = dict(ch=p_dict['chr'], pos=p_dict['pos'], A1=p_dict['A1'],
                          A2=p_dict['A2'], reffreq=p_dict['reffreq'],
                          info=p_dict['info'], rs=p_dict['rs'],
                          pval=p_dict['pval'], eff=p_dict['eff'],
                          ncol=p_dict['ncol'], se=p_dict['se'],
                          eff_type=p_dict['eff_type'],
                          case_freq=p_dict['case_freq'],
                          control_freq=p_dict['control_freq'],
                          case_n=p_dict['case_n'],
                          control_n=p_dict['control_n'])
    elif ssf_format in fixed_formats:
        column_map = fixed_formats[ssf_format]
    else:
        raise Exception('Unknown Summary Statistics Format.')
    parse_sum_stats_custom(**column_map, **shared_args)
    elapsed = time.time() - started
    summary_dict[14] = {'name': 'Run time for parsing summary stats:',
                        'value': '%d min and %0.2f sec' % (elapsed / 60, elapsed % 60)}
def parse_sample_size(line_dict,n,ncol,case_n,control_n,header_dict,):
    """Return the sample size for one summary-stats line.

    Preference order: the fixed ``n`` supplied on the command line, then
    the per-SNP ``ncol`` column, and finally the sum of the case and
    control count columns.
    """
    if n is not None:
        return n
    if ncol in header_dict:
        return float(line_dict[header_dict[ncol]])
    cases = float(line_dict[header_dict[case_n]])
    controls = float(line_dict[header_dict[control_n]])
    return cases + controls
def parse_nucleotides(line_dict, header_dict, A1, A2):
    """Return the upper-cased [effective, non-effective] allele pair."""
    alleles = []
    for col in (A1, A2):
        alleles.append(line_dict[header_dict[col]].upper())
    return alleles
def parse_freq(line_dict, header_dict, reffreq, control_freq, case_freq, control_n, case_n):
    """Return the effective-allele frequency for one line, or -1 if unavailable.

    Preference order: a reference-frequency column; otherwise case and
    control frequency columns, combined as a sample-size-weighted average
    when case/control counts are present, or a plain average otherwise.
    Missing values ('.'/'NA') yield -1.
    """
    def value(col):
        return line_dict[header_dict[col]]

    def missing(col):
        return value(col) in ('.', 'NA')

    def usable(col):
        return col is not None and col in header_dict

    if usable(reffreq):
        return -1 if missing(reffreq) else float(value(reffreq))
    if usable(case_freq) and usable(control_freq):
        if usable(case_n) and usable(control_n):
            if any(missing(c) for c in (control_n, case_n, control_freq, case_freq)):
                return -1
            n_cases = float(value(case_n))
            n_controls = float(value(control_n))
            total = n_cases + n_controls
            # Weight each group's frequency by its share of the total sample.
            return (float(value(case_freq)) * (n_cases / total)
                    + float(value(control_freq)) * (n_controls / total))
        if missing(case_freq) or missing(control_freq):
            return -1
        return (float(value(case_freq)) + float(value(control_freq))) / 2.0
    return -1
def get_raw_beta(beta_read, eff_type):
    """Convert the effect size as read from file to an additive-scale beta.

    Odds ratios ('OR') are log-transformed; linear-regression betas,
    log-odds ratios and BLUP effects ('LINREG'/'LOGOR'/'BLUP') are already
    on the additive scale and pass through unchanged.

    Raises:
        Exception: if ``eff_type`` is not one of the known types.
    """
    if eff_type == 'OR':
        # Was sp.log: scipy's NumPy aliases (scipy.log etc.) were
        # deprecated in SciPy 1.3 and removed in 1.6, so use numpy.
        return np.log(beta_read)
    if eff_type in ('LINREG', 'LOGOR', 'BLUP'):
        return beta_read
    raise Exception('Unknown effect type')
def get_beta_from_se(beta_read, se_read, eff_type, raw_beta, N):
    """Infer the normalised effect (z / sqrt(N)) from an effect and its SE.

    The |z| statistic is |beta| / SE for additive effects, or |1 - OR| / SE
    for odds ratios (a first-order approximation around OR = 1); the sign
    is taken from ``raw_beta``.

    Returns:
        The signed normalised effect, or None when ``se_read`` is 0.

    Raises:
        Exception: if ``eff_type`` is not one of the known types.
    """
    if se_read == 0:
        # Zero SE would divide by zero; signal "no usable z-score".
        return None
    # Was sp.absolute/sp.sign/sp.sqrt: those scipy NumPy aliases were
    # removed in SciPy 1.6, so use numpy directly.
    if eff_type == 'LINREG' or eff_type == 'LOGOR':
        abs_beta = np.absolute(beta_read) / se_read
    elif eff_type == 'OR':
        abs_beta = np.absolute(1 - beta_read) / se_read
    else:
        raise Exception('Unknown effect type')
    return np.sign(raw_beta) * abs_beta / np.sqrt(N)
def get_beta(pval_read, raw_beta, beta_read, line_dict, header_dict, se,
             z_from_se, N, eff_type, se_inferred_zscores):
    """Compute the normalised per-SNP effect (z-score / sqrt(N)).

    For 'BLUP' effects the raw beta is returned as-is.  With ``z_from_se``
    the z-score is always derived from the effect and its SE; otherwise it
    is derived from the two-sided p-value, falling back to the SE column
    when the p-value is 0 or yields a non-finite quantile.

    Returns:
        The signed normalised effect, or None when neither the p-value nor
        the SE gives a usable z-score.

    NOTE(review): incrementing ``se_inferred_zscores`` here has no effect
    for the caller (ints are passed by value); kept only to preserve the
    original behaviour.
    """
    if eff_type == 'BLUP':
        return raw_beta
    # Was `isfinite`/sp.sign/sp.sqrt from scipy: those NumPy aliases were
    # removed in SciPy 1.6, so use numpy equivalents.
    if z_from_se:
        assert se in header_dict, "SE was not specified in summary statistics provided, which is necessary when the 'z_from_se' flag is used."
        se_read = float(line_dict[header_dict[se]])
        if se_read == 0 or not np.isfinite(se_read):
            return None
        se_inferred_zscores += 1
        return get_beta_from_se(beta_read, se_read, eff_type, raw_beta, N)
    if pval_read == 0 or not np.isfinite(stats.norm.ppf(pval_read)):
        # P-value unusable: attempt to parse the SE to infer the z-score.
        if se not in header_dict:
            return None
        se_read = float(line_dict[header_dict[se]])
        if se_read == 0 or not np.isfinite(se_read):
            return None
        se_inferred_zscores += 1
        return get_beta_from_se(beta_read, se_read, eff_type, raw_beta, N)
    return np.sign(raw_beta) * stats.norm.ppf(pval_read / 2.0) / np.sqrt(N)
def parse_sum_stats_custom(filename=None, bimfile=None, only_hm3=False, hdf5_file=None, n=None, ch=None, pos=None,
                           A1=None, A2=None, reffreq=None, case_freq=None, control_freq=None, case_n=None,
                           control_n=None, info=None, rs=None, pval=None, eff=None, ncol=None, se=None,
                           eff_type='OR', match_genomic_pos=False, debug=False,
                           z_from_se=False, summary_dict = None):
    """Parse a (possibly gzipped) summary-statistics file into an HDF5 file.

    SNPs are filtered against the genotype BIM file (and optionally the
    HapMap3 SNP list), effect sizes are converted to normalised betas via
    get_raw_beta/get_beta, and per-chromosome, position-sorted arrays are
    written to a new 'sum_stats' group in ``hdf5_file``.

    Parameters (the column arguments are header names in the stats file):
        filename: path of the summary-statistics file (plain text or .gz).
        bimfile: PLINK BIM file used to validate SNP IDs and positions.
        only_hm3: if True, restrict to HapMap3 SNPs.
        hdf5_file: open, writable HDF5 file object.
        n / ncol / case_n / control_n: fixed sample size, or columns from
            which a per-SNP sample size is derived (see parse_sample_size).
        ch, pos, A1, A2, reffreq, case_freq, control_freq, info, rs, pval,
            eff, se: column names for the corresponding quantities (None
            when a column is unavailable for the chosen format).
        eff_type: how ``eff`` is coded: 'OR', 'LINREG', 'LOGOR' or 'BLUP'.
        match_genomic_pos: drop SNPs whose position disagrees with the BIM
            file (otherwise mismatches are only counted).
        z_from_se: derive z-scores from effect/SE instead of p-values.
        debug: print extra progress information.
        summary_dict: dict collecting human-readable run statistics; the
            numeric keys (3.09, 3.1, ...) control the report ordering.

    Raises:
        Exception: if the BIM file is missing or unparseable, plus
            AssertionErrors for missing required columns.
    """
    # Check required fields are here
    assert not A2 is None, 'Require header for non-effective allele'
    assert not A1 is None, 'Require header for effective allele'
    assert not rs is None, 'Require header for RS ID'
    assert not eff is None, 'Require header for Statistics'
    assert not pval is None, 'Require header for pval'
    assert not ncol is None or not n is None or (control_n is not None and case_n is not None), 'Require either N or NCOL information'
    if ch is None:
        assert not bimfile is None, 'Require bimfile when chromosome header not provided'
        print("Chromosome Header not provided, will use info from bim file")
    if pos is None:
        assert not bimfile is None, 'Require bimfile when position header not provided'
        print("Position Header not provided, will use info from bim file")
    num_lines = util.count_lines(filename)
    snps_pos_map = {}
    if only_hm3:
        if debug:
            print('Loading HapMap3 SNPs')
        hm3_sids = util.load_hapmap_SNPs()
    # Build the set of SNP IDs present in the genotype data (and, if
    # requested, in HapMap3), remembering each SNP's chromosome/position.
    if bimfile is not None:
        valid_sids = set()
        if debug:
            print('Parsing bim file: %s' % bimfile)
        with open(bimfile) as f:
            for line in f:
                l = line.split()
                chrom = util.get_chrom_num(l[0])
                if chrom not in util.ok_chromosomes:
                    continue
                sid = l[1]
                if only_hm3:
                    if sid in hm3_sids:
                        valid_sids.add(sid)
                        snps_pos_map[sid] = {'pos':int(l[3]), 'chrom':chrom}
                else:
                    valid_sids.add(sid)
                    snps_pos_map[sid] = {'pos':int(l[3]), 'chrom':chrom}
        if len(valid_sids)==0:
            raise Exception('Unable to parse BIM file')
    else:
        raise Exception('BIM file missing. Please check genotype paths provided.')
    # Counters for the summary report.
    invalid_chr = 0
    invalid_pos = 0
    invalid_p = 0
    invalid_beta = 0
    se_inferred_zscores = 0
    chrom_dict = {}
    opener = open
    if util.is_gz(filename):
        opener = gzip.open
    print('Parsing summary statistics file: %s' % filename)
    with opener(filename) as f:
        header = f.readline()
        if util.is_gz(filename):
            # gzip.open yields bytes; decode before string handling.
            header = header.decode('utf-8')
        if debug:
            print('File header:')
            print(header)
        # Map each header column name to its 0-based column index.
        header_dict = {}
        columns = (header.strip()).split()
        index = 0
        for col in columns:
            header_dict[col] = index
            index += 1
        # Validate that every configured column actually exists in the file.
        assert ch is None or ch in header_dict, 'Chromosome header cannot be found in summary statistic file'
        assert A2 in header_dict, 'Non-effective allele column cannot be found in summary statistic file'
        assert A1 in header_dict, 'Effective allele column cannot be found in summary statistic file'
        assert eff in header_dict, 'Effect size column not found in summary statistic file'
        assert rs in header_dict, 'SNP ID column not found in summary statistic file'
        assert pos is None or pos in header_dict, 'Position column not found in summary statistic file'
        assert pval in header_dict, 'P Value column not found in summary statistic file'
        assert not n is None or ncol in header_dict or (control_n in header_dict and case_n in header_dict), 'Sample size column not found in summary statistic ' \
            'file and N not provided'
        if z_from_se:
            assert se is not None, 'SE column must be specified to infer z-scores from SEs'
            assert se in header_dict, 'SE column not found in summary stats file, this is required to infer z-scores from SEs'
        # header_dict now contains the header column name for each corresponding input
        # NOTE(review): nothing below ever adds to bad_chromosomes, so the
        # reporting branch after the loop can never fire.
        bad_chromosomes = set()
        line_i = 1
        for line in f:
            line_i +=1
            # Progress indicator, refreshed every 1000 lines.
            if line_i%1000==0 and num_lines>0:
                sys.stdout.write('\r%0.2f%%' % (100.0 * (float(line_i) / (num_lines))))
                sys.stdout.flush()
            if util.is_gz(filename):
                line = line.decode('utf-8')
            l = (line.strip()).split()
            # get the SNP ID first
            sid = l[header_dict[rs]]
            # check the SNP ID
            if sid in valid_sids:
                # Get the chromosome information
                chrom = 0
                if not ch is None and ch in header_dict:
                    chrom = util.get_chrom_num(l[header_dict[ch]])
                    # Check if the chromosome of the SNP is correct
                    if not chrom == snps_pos_map[sid]['chrom']:
                        invalid_chr += 1
                        continue
                else:
                    # No chromosome column: trust the BIM file.
                    chrom = snps_pos_map[sid]['chrom']
                #Parse position
                pos_read = 0
                if not pos is None and pos in header_dict:
                    pos_read = int(l[header_dict[pos]])
                    if not pos_read == snps_pos_map[sid]['pos']:
                        invalid_pos += 1
                        # Mismatches are dropped only with match_genomic_pos;
                        # otherwise just counted and the file's position kept.
                        if match_genomic_pos:
                            continue
                else:
                    pos_read = snps_pos_map[sid]['pos']
                #Get the sample size
                N = parse_sample_size(l,n,ncol,case_n,control_n,header_dict)
                #Parse raw beta
                beta_read = float(l[header_dict[eff]])
                # NOTE(review): `isfinite` is `from scipy import isfinite`,
                # a NumPy alias removed in SciPy >= 1.6 — verify the pinned
                # SciPy version supports it.
                if not isfinite(beta_read):
                    invalid_beta += 1
                    continue
                raw_beta = get_raw_beta(beta_read, eff_type)
                #Parse p-value and effect size
                pval_read = float(l[header_dict[pval]])
                # An unusable p-value is only counted here; get_beta may
                # still recover a z-score from the SE column.
                if pval_read==0 or not isfinite(stats.norm.ppf(pval_read)):
                    invalid_p += 1
                # NOTE(review): se_inferred_zscores is passed by value, so
                # get_beta cannot actually update this counter.
                beta = get_beta(pval_read, raw_beta, beta_read, l, header_dict, se,
                                z_from_se, N, eff_type, se_inferred_zscores)
                if beta==None:
                    continue
                #All necessary information was found, so we should store things
                if not chrom in chrom_dict:
                    chrom_dict[chrom] = {'ps':[], 'infos':[], 'freqs':[],
                        'nts': [], 'sids': [], 'positions': [], 'ns':[],
                        'log_odds':[], 'betas':[]}
                chrom_dict[chrom]['sids'].append(sid)
                chrom_dict[chrom]['positions'].append(pos_read)
                chrom_dict[chrom]['ps'].append(pval_read)
                chrom_dict[chrom]['ns'].append(N)
                chrom_dict[chrom]['log_odds'].append(raw_beta)
                chrom_dict[chrom]['betas'].append(beta)
                # Get the INFO score
                info_sc = -1
                if info is not None and info in header_dict:
                    info_sc = float(l[header_dict[info]])
                chrom_dict[chrom]['infos'].append(info_sc)
                #Parse nucleotides
                nt = parse_nucleotides(l, header_dict, A1, A2)
                chrom_dict[chrom]['nts'].append(nt)
                # Parse the frequency
                freq = parse_freq(l, header_dict, reffreq, control_freq, case_freq, control_n, case_n)
                chrom_dict[chrom]['freqs'].append(freq)
        if len(bad_chromosomes) > 0:
            if debug:
                print('Ignored chromosomes: %s' % (','.join(list(bad_chromosomes))))
                print('Please note that only data on chromosomes 1-23, and X are parsed.')
    if num_lines>0:
        sys.stdout.write('\r%0.2f%%\n' % (100.0))
        sys.stdout.flush()
    print('SS file loaded, now sorting and storing in HDF5 file.')
    assert not 'sum_stats' in hdf5_file, 'Something is wrong with HDF5 file?'
    ssg = hdf5_file.create_group('sum_stats')
    num_snps = 0
    num_non_finite = 0
    for chrom in chrom_dict:
        if debug:
            print ('%d SNPs on chromosome %s' % (len(chrom_dict[chrom]['positions']), chrom))
        assert len(chrom_dict[chrom]['positions'])==len(chrom_dict[chrom]['betas'])==len(chrom_dict[chrom]['ps'])==len(chrom_dict[chrom]['nts']), 'Problems with parsing summary stats'
        # Bundle the parallel per-SNP lists into tuples and sort by position
        # (ties fall through to the remaining tuple fields).
        sl = list(zip(chrom_dict[chrom]['positions'], chrom_dict[chrom]['sids'], chrom_dict[chrom]['nts'],
                      chrom_dict[chrom]['betas'], chrom_dict[chrom]['log_odds'], chrom_dict[chrom]['infos'],
                      chrom_dict[chrom]['freqs'], chrom_dict[chrom]['ps'], chrom_dict[chrom]['ns']))
        sl.sort()
        ps = []
        betas = []
        nts = []
        sids = []
        positions = []
        log_odds = []
        infos = []
        freqs = []
        ns = []
        prev_pos = -1
        # NOTE(review): the loop variables `pos` and `info` shadow the
        # function parameters of the same names from here on.
        for pos, sid, nt, beta, lo, info, frq, p, num_ind in sl:
            # Skip duplicated genomic positions (first occurrence wins).
            if pos == prev_pos:
                if debug:
                    print('duplicated position %d' % pos)
                continue
            else:
                prev_pos = pos
            if not sp.isfinite(beta):
                num_non_finite += 1
                continue
            ps.append(p)
            betas.append(beta)
            nts.append(nt)
            sids.append(sid)
            positions.append(pos)
            log_odds.append(lo)
            infos.append(info)
            freqs.append(frq)
            ns.append(num_ind)
        nts = sp.array(nts, dtype=util.nts_dtype)
        sids = sp.array(sids, dtype=util.sids_dtype)
        if debug:
            if not num_non_finite == 0:
                print('%d SNPs have non-finite statistics on chromosome %s' % (num_non_finite, chrom))
            print ('Still %d SNPs on chromosome %s' % (len(ps), chrom))
        # One HDF5 subgroup per chromosome, with parallel datasets.
        g = ssg.create_group('chrom_%s' % chrom)
        g.create_dataset('ps', data=sp.array(ps))
        g.create_dataset('freqs', data=freqs)
        g.create_dataset('betas', data=betas)
        g.create_dataset('log_odds', data=log_odds)
        num_snps += len(log_odds)
        g.create_dataset('infos', data=infos)
        g.create_dataset('nts', data=nts)
        g.create_dataset('sids', data=sids)
        g.create_dataset('positions', data=positions)
        g.create_dataset('ns', data=ns)
        hdf5_file.flush()
    # Record parse statistics for the human-readable summary report.
    summary_dict[3.09]={'name':'dash', 'value':'Summary statistics'}
    summary_dict[3.1]={'name':'Num SNPs parsed from sum stats file','value':num_snps}
    if invalid_p>0 or debug:
        summary_dict[3.2]={'name':'Num invalid P-values in sum stats','value':invalid_p}
    if invalid_beta>0 or debug:
        summary_dict[3.21]={'name':'Num invalid betas in sum stats','value':invalid_beta}
    if se_inferred_zscores>0 or debug:
        summary_dict[3.22]={'name':'Num z-scores inferred from SEs and effects','value':se_inferred_zscores}
    if invalid_chr>0 or debug:
        summary_dict[3.4]={'name':'SNPs w non-matching chromosomes excluded','value':invalid_chr}
    if invalid_pos>0 or debug:
        if match_genomic_pos:
            summary_dict[3.3]={'name':'SNPs w non-matching positions excluded','value':invalid_pos}
        else:
            summary_dict[3.3]={'name':'SNPs w non-matching positions (not excluded)','value':invalid_pos}