Revision 49cb69c5a18fdb262964fbfeb47ab2099eb32c5c authored by Wesley Tansey on 03 May 2018, 19:46:59 UTC, committed by Wesley Tansey on 03 May 2018, 19:46:59 UTC
utils.py
import numpy as np
import csv
from collections import defaultdict
from scipy.sparse import csc_matrix, lil_matrix
import scipy.stats as st

class ProxyDistribution:
    '''Simple proxy distribution to enable specifying signal distributions from the command-line'''
    def __init__(self, name, pdf_method, sample_method):
        self.name = name
        self.pdf_method = pdf_method
        self.sample_method = sample_method

    def pdf(self, x):
        return self.pdf_method(x)

    def sample(self, count=1):
        if count == 1:
            return self.sample_method()
        return np.array([self.sample_method() for _ in range(count)])

    def __repr__(self):
        return self.name
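
# Illustrative usage sketch: wrapping a scipy.stats normal as a signal
# distribution (the N(3, 1) parameters are arbitrary example values).
#
#   signal_dist = ProxyDistribution('N(3,1)',
#                                   st.norm(loc=3., scale=1.).pdf,
#                                   lambda: np.random.normal(loc=3., scale=1.))
#   signal_dist.pdf(2.5)    # density of the signal distribution at 2.5
#   signal_dist.sample(10)  # array of 10 draws from the signal distribution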

def generate_data_helper(flips, null_mean, null_stdev, signal_dist):
    '''Recursively builds multi-dimensional datasets.'''
    if len(flips.shape) > 1:
        return np.array([generate_data_helper(row, null_mean, null_stdev, signal_dist) for row in flips])
    # If we're on the last dimension, return the vector
    return np.array([signal_dist.sample() if flip else 0 for flip in flips]) + np.random.normal(loc=null_mean, scale=null_stdev, size=len(flips))

def generate_data(null_mean, null_stdev, signal_dist, signal_weights):
    '''Create a synthetic dataset.'''
    # Flip biased coins to decide which distribution to draw each sample from
    flips = np.random.random(size=signal_weights.shape) < signal_weights
    # Recursively generate the dataset
    samples = generate_data_helper(flips, null_mean, null_stdev, signal_dist)
    # Observed z-scores
    z = (samples - null_mean) / null_stdev
    return (z, flips)
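
# Illustrative usage sketch: a 1-d dataset of 100 points where each point has
# a 30% prior probability of carrying signal drawn from the (hypothetical)
# signal_dist defined above.
#
#   weights = np.full(100, 0.3)
#   z, signals = generate_data(0., 1., signal_dist, weights)
#   # z are the observed z-scores; signals flags the points that carry signal.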

def calc_fdr(probs, fdr_level):
    '''Calculates the detected signals at a specific false discovery rate given the posterior probabilities of each point.'''
    pshape = probs.shape
    if len(probs.shape) > 1:
        probs = probs.flatten()
    post_orders = np.argsort(probs)[::-1]
    avg_fdr = 0.0
    end_fdr = 0
    for idx in post_orders:
        test_fdr = (avg_fdr * end_fdr + (1.0 - probs[idx])) / (end_fdr + 1.0)
        if test_fdr > fdr_level:
            break
        avg_fdr = test_fdr
        end_fdr += 1
    is_finding = np.zeros(probs.shape, dtype=int)
    is_finding[post_orders[0:end_fdr]] = 1
    if len(pshape) > 1:
        is_finding = is_finding.reshape(pshape)
    return is_finding
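
# Illustrative usage sketch: thresholding posterior signal probabilities at a
# 10% false discovery rate (the probabilities below are made-up values).
#
#   posteriors = np.array([0.99, 0.95, 0.60, 0.20, 0.05])
#   calc_fdr(posteriors, 0.1)  # binary vector marking the detected signals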

def filter_nonrectangular_data(data, filter_value=0):
    '''Convert the square matrix to a vector containing only the values different from the filter value.'''
    x = data != filter_value
    nonrect_vals = np.arange(x.sum())
    nonrect_to_data = np.zeros(data.shape, dtype=int) - 1
    data_to_nonrect = np.where(x.T)
    data_to_nonrect = (data_to_nonrect[1], data_to_nonrect[0])
    nonrect_to_data[data_to_nonrect] = nonrect_vals
    nonrect_data = data[x]
    return (nonrect_data, nonrect_to_data, data_to_nonrect)
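
# Illustrative usage sketch: flattening a zero-padded matrix so only the
# non-filtered entries remain.
#
#   padded = np.array([[1., 2., 0.],
#                      [3., 0., 0.]])
#   vals, to_data, from_data = filter_nonrectangular_data(padded)
#   # vals holds the kept entries, to_data maps each grid cell to a flattened
#   # index (-1 for filtered cells), and from_data gives their coordinates.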

def sparse_2d_penalty_matrix(data_shape, nonrect_to_data=None):
    '''Create a sparse 2-d penalty matrix. Optionally takes a map to corrected indices, useful when dealing with non-rectangular data.'''
    row_counter = 0
    data = []
    row = []
    col = []
    if nonrect_to_data is not None:
        for j in range(data_shape[1]):
            for i in range(data_shape[0]-1):
                idx1 = nonrect_to_data[i, j]
                idx2 = nonrect_to_data[i+1, j]
                if idx1 < 0 or idx2 < 0:
                    continue
                row.append(row_counter)
                col.append(idx1)
                data.append(-1.)
                row.append(row_counter)
                col.append(idx2)
                data.append(1.)
                row_counter += 1
        for j in range(data_shape[1]-1):
            for i in range(data_shape[0]):
                idx1 = nonrect_to_data[i, j]
                idx2 = nonrect_to_data[i, j+1]
                if idx1 < 0 or idx2 < 0:
                    continue
                row.append(row_counter)
                col.append(idx1)
                data.append(-1.)
                row.append(row_counter)
                col.append(idx2)
                data.append(1.)
                row_counter += 1
    else:
        for j in range(data_shape[1]):
            for i in range(data_shape[0] - 1):
                idx1 = i + j * data_shape[0]
                idx2 = i + j * data_shape[0] + 1
                row.append(row_counter)
                col.append(idx1)
                data.append(-1.)
                row.append(row_counter)
                col.append(idx2)
                data.append(1.)
                row_counter += 1
        col_counter = 0
        for i in range(data_shape[0]):
            for j in range(data_shape[1] - 1):
                idx1 = col_counter
                idx2 = col_counter + data_shape[0]
                row.append(row_counter)
                col.append(idx1)
                data.append(-1.)
                row.append(row_counter)
                col.append(idx2)
                data.append(1.)
                row_counter += 1
                col_counter += 1
    num_rows = row_counter
    num_cols = max(col) + 1
    return csc_matrix((data, (row, col)), shape=(num_rows, num_cols))
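
# Illustrative usage sketch: first-differences penalty matrix for a 3x4 grid
# with no missing entries (so no index map is needed).
#
#   D = sparse_2d_penalty_matrix((3, 4))
#   # Each row of D encodes one horizontally or vertically adjacent pair, so
#   # D.dot(beta) gives the differences across every edge of the grid graph,
#   # with the grid vectorized in column-major (i + j * nrows) order.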

def sparse_1d_penalty_matrix(data_len):
    penalties = np.eye(data_len, dtype=float)[0:-1] * -1
    for i in range(len(penalties)):
        penalties[i, i+1] = 1
    return csc_matrix(penalties)

def cube_trails(xmax, ymax, zmax):
    '''Produces a list of trails following a simple row/col/aisle split strategy for a cube.'''
    trails = []
    for x in range(xmax):
        for y in range(ymax):
            trails.append([x * ymax * zmax + y * zmax + z for z in range(zmax)])
    for y in range(ymax):
        for z in range(zmax):
            trails.append([x * ymax * zmax + y * zmax + z for x in range(xmax)])
    for z in range(zmax):
        for x in range(xmax):
            trails.append([x * ymax * zmax + y * zmax + z for y in range(ymax)])
    return trails
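
# Illustrative usage sketch: trail decomposition of a 2x2x2 cube.
#
#   trails = cube_trails(2, 2, 2)
#   # 12 trails of flattened (C-order) indices, one per axis-aligned pair of
#   # adjacent cells.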

def val_present(data, x, missing_val):
    '''Returns True if the entry of data at index x is not the missing-value sentinel.'''
    return missing_val is None or data[x] != missing_val

def cube_edges(data, missing_val=None):
    '''Produces a list of edges for a cube with potentially missing data.
    If missing_val is specified, entries with that value will be considered
    missing and no edges will be connected to them.'''
    edges = []
    xmax, ymax, zmax = data.shape
    for y in range(ymax):
        for z in range(zmax):
            edges.extend([((x1, y, z), (x2, y, z))
                          for x1, x2 in zip(range(data.shape[0]-1), range(1, data.shape[0]))
                          if missing_val is None or (data[x1, y, z] != missing_val and data[x2, y, z] != missing_val)])
    for x in range(xmax):
        for z in range(zmax):
            edges.extend([((x, y1, z), (x, y2, z))
                          for y1, y2 in zip(range(data.shape[1]-1), range(1, data.shape[1]))
                          if missing_val is None or (data[x, y1, z] != missing_val and data[x, y2, z] != missing_val)])
    for x in range(xmax):
        for y in range(ymax):
            edges.extend([((x, y, z1), (x, y, z2))
                          for z1, z2 in zip(range(data.shape[2]-1), range(1, data.shape[2]))
                          if missing_val is None or (data[x, y, z1] != missing_val and data[x, y, z2] != missing_val)])
    return edges
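
# Illustrative usage sketch: edges of a 2x2x2 cube where one cell is marked
# missing with the sentinel value -1.
#
#   cube = np.arange(8, dtype=float).reshape(2, 2, 2)
#   cube[0, 0, 0] = -1.
#   edges = cube_edges(cube, missing_val=-1.)
#   # Each edge is a pair of (x, y, z) tuples; no edge touches (0, 0, 0).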

def cube_trails_missing_helper(data, trails, cur_trail, v1, v2, missing_val):
    if data[v1] == missing_val or data[v2] == missing_val:
        if len(cur_trail) > 1:
            trails.append(cur_trail)
        cur_trail = []
    else:
        if len(cur_trail) == 0:
            cur_trail.append(v1)
        cur_trail.append(v2)
    return cur_trail

def cube_trails_missing(data, missing_val=None):
    '''Generates row/col/aisle trails for a cube when there may be missing data.'''
    trails = []
    xmax, ymax, zmax = data.shape
    for y in range(ymax):
        for z in range(zmax):
            cur_trail = []
            for x1, x2 in zip(range(data.shape[0]-1), range(1, data.shape[0])):
                v1 = (x1, y, z)
                v2 = (x2, y, z)
                cur_trail = cube_trails_missing_helper(data, trails, cur_trail, v1, v2, missing_val)
            if len(cur_trail) > 1:
                trails.append(cur_trail)
    for x in range(xmax):
        for z in range(zmax):
            cur_trail = []
            for y1, y2 in zip(range(data.shape[1]-1), range(1, data.shape[1])):
                v1 = (x, y1, z)
                v2 = (x, y2, z)
                cur_trail = cube_trails_missing_helper(data, trails, cur_trail, v1, v2, missing_val)
            if len(cur_trail) > 1:
                trails.append(cur_trail)
    for x in range(xmax):
        for y in range(ymax):
            cur_trail = []
            for z1, z2 in zip(range(data.shape[2]-1), range(1, data.shape[2])):
                v1 = (x, y, z1)
                v2 = (x, y, z2)
                cur_trail = cube_trails_missing_helper(data, trails, cur_trail, v1, v2, missing_val)
            if len(cur_trail) > 1:
                trails.append(cur_trail)
    return trails
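
# Illustrative usage sketch: trails for the same cube as in the cube_edges
# example above; any trail is cut wherever it would pass through the missing
# cell.
#
#   trails = cube_trails_missing(cube, missing_val=-1.)
#   # Each trail is a list of (x, y, z) tuples, none of which is (0, 0, 0).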

def load_trails(filename):
    # Open in text mode so the csv module works under Python 3
    with open(filename, 'r') as f:
        reader = csv.reader(f)
        return load_trails_from_reader(reader)

def load_trails_from_reader(reader):
    trails = []
    breakpoints = []
    edges = defaultdict(list)
    for line in reader:
        if len(trails) > 0:
            breakpoints.append(len(trails))
        nodes = [int(x) for x in line]
        trails.extend(nodes)
        for n1, n2 in zip(nodes[:-1], nodes[1:]):
            edges[n1].append(n2)
            edges[n2].append(n1)
    if len(trails) > 0:
        breakpoints.append(len(trails))
    return (len(breakpoints), np.array(trails, dtype="int32"), np.array(breakpoints, dtype="int32"), edges)
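
# Illustrative usage sketch: each CSV line is one trail of node ids. For a
# hypothetical file trails.csv containing the lines "0,1,2" and "2,3":
#
#   ntrails, trails, breakpoints, edges = load_trails('trails.csv')
#   # ntrails = 2, trails = [0, 1, 2, 2, 3], breakpoints = [3, 5], and edges
#   # maps each node to the list of its neighbors along the trails.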

def save_trails(filename, trails):
    # Open in text mode with newline='' so the csv module works under Python 3
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(trails)

def pretty_str(p, decimal_places=2):
    '''Pretty-print a matrix or vector.'''
    if len(p.shape) == 1:
        return vector_str(p, decimal_places)
    if len(p.shape) == 2:
        return matrix_str(p, decimal_places)
    raise Exception('Invalid array with shape {0}'.format(p.shape))

def matrix_str(p, decimal_places=2):
    '''Pretty-print the matrix.'''
    return '[{0}]'.format("\n ".join([vector_str(a, decimal_places) for a in p]))

def vector_str(p, decimal_places=2):
    '''Pretty-print the vector values.'''
    style = '{0:.' + str(decimal_places) + 'f}'
    return '[{0}]'.format(", ".join([style.format(a) for a in p]))
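
# Illustrative usage sketch:
#
#   pretty_str(np.array([1.234, 5.678]))     # -> '[1.23, 5.68]'
#   pretty_str(np.eye(2), decimal_places=1)  # 2-d input prints one row per line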

def mean_filter(pvals, edges, rescale=True):
    '''Given a list of p-values and their neighbors, applies a mean filter
    that replaces each p_i with p*_i = mean(p_i and its neighbors).
    If rescale is True, the filtered values are rescaled to have variance 1.'''
    return np.array([np.mean(pvals[edges[i] + [i]]) * (np.sqrt(len(edges[i]) + 1) if rescale else 1) for i, p in enumerate(pvals)])

def median_filter(pvals, edges):
    '''Given a list of p-values and their neighbors, applies a median filter
    that replaces each p_i with p*_i = median(p_i and its neighbors).'''
    return np.array([np.median(pvals[edges[i] + [i]]) for i, p in enumerate(pvals)])
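
# Illustrative usage sketch: smoothing p-values over a 4-node chain graph
# 0 - 1 - 2 - 3, with edges given as a neighbor list per node (the p-values
# are made-up).
#
#   pvals = np.array([0.01, 0.20, 0.03, 0.90])
#   chain = {0: [1], 1: [0, 2], 2: [1, 3], 3: [2]}
#   median_filter(pvals, chain)               # median of each point and its neighbors
#   mean_filter(pvals, chain, rescale=False)  # local means without rescaling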

def _local_agg_fdr_helper(fdr_level, p_star, ghat, ghat_lambda, wstar_lambda, tmin, tmax, tmin_fdr, tmax_fdr, rel_tol=1e-4):
    '''Finds the t-level via binary search.'''
    if np.isclose(tmin, tmax, atol=rel_tol) or np.isclose(tmin_fdr, tmax_fdr, atol=rel_tol) or tmax_fdr <= fdr_level:
        return (tmax, tmax_fdr) if tmax_fdr <= fdr_level else (tmin, tmin_fdr)
    tmid = (tmax + tmin) / 2.
    tmid_fdr = wstar_lambda * ghat(p_star, tmid) / (max(1, (p_star < tmid).sum()) * (1 - ghat_lambda))
    print('t: [{0}, {1}, {2}] => fdr: [{3}, {4}, {5}]'.format(tmin, tmid, tmax, tmin_fdr, tmid_fdr, tmax_fdr))
    if tmid_fdr <= fdr_level:
        return _local_agg_fdr_helper(fdr_level, p_star, ghat, ghat_lambda, wstar_lambda, tmid, tmax, tmid_fdr, tmax_fdr)
    return _local_agg_fdr_helper(fdr_level, p_star, ghat, ghat_lambda, wstar_lambda, tmin, tmid, tmin_fdr, tmid_fdr)

def local_agg_fdr(pvals, edges, fdr_level, lmbda=0.1):
    '''Given a list of p-values and the graph connecting them, applies a median
    filter to locally aggregate them and then performs a corrected FDR procedure
    from Zhang, Fan, and Yu (Annals of Statistics, 2011). lmbda is a tuning
    constant typically set to 0.1.'''
    p_star = median_filter(pvals, edges)  # aggregate p-values
    ghat = lambda p, t: (p >= (1 - t)).sum() / max(1., (2.0 * (p > 0.5).sum() + (p == 0.5).sum()))  # empirical null CDF
    wstar_lambda = (p_star > lmbda).sum()  # number of nonrejects at the level lambda
    ghat_lambda = ghat(p_star, lmbda)  # empirical null CDF at rejection level lambda
    # Use binary search to find the highest t value that satisfies the FDR level
    tmin = 0.
    tmax = 1.
    tmin_fdr = wstar_lambda * ghat(p_star, tmin) / (max(1, (p_star < tmin).sum()) * (1 - ghat_lambda))
    tmax_fdr = wstar_lambda * ghat(p_star, tmax) / (max(1, (p_star < tmax).sum()) * (1 - ghat_lambda))
    t, tfdr = _local_agg_fdr_helper(fdr_level, p_star, ghat, ghat_lambda, wstar_lambda, tmin, tmax, tmin_fdr, tmax_fdr)
    print('t: {0} tfdr: {1}'.format(t, tfdr))
    # Return the indices of all discoveries
    return np.where(p_star < t)[0]
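
# Illustrative usage sketch: running the full procedure on the chain graph
# from the filter example above at a 10% FDR level.
#
#   discoveries = local_agg_fdr(pvals, chain, 0.1)
#   # discoveries holds the indices of the nodes declared to be signals.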

def p_value(z, mu0=0., sigma0=1.):
    '''Two-sided p-value for z under a N(mu0, sigma0^2) null.'''
    return 2 * (1.0 - st.norm.cdf(np.abs((z - mu0) / sigma0)))

def benjamini_hochberg(z, fdr, mu0=0., sigma0=1.):
    '''Performs Benjamini-Hochberg multiple hypothesis testing on z at the given false discovery rate threshold.'''
    z_shape = z.shape if len(z.shape) > 1 else None
    if z_shape is not None:
        z = z.flatten()
    p = p_value(z, mu0=mu0, sigma0=sigma0)
    p_orders = np.argsort(p)
    discoveries = []
    m = float(len(p_orders))
    for k, s in enumerate(p_orders):
        if p[s] <= (k + 1) / m * fdr:
            discoveries.append(s)
        else:
            break
    # dtype=int keeps the empty-discovery case usable as an index array below
    discoveries = np.array(discoveries, dtype=int)
    if z_shape is not None:
        x = np.zeros(z.shape)
        x[discoveries] = 1
        discoveries = np.where(x.reshape(z_shape) == 1)
    return discoveries
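
# Illustrative usage sketch: BH on a vector of z-scores at a 5% FDR threshold,
# with the last five entries drawn from a shifted (signal) distribution.
#
#   z = np.concatenate([np.random.normal(size=95),
#                       np.random.normal(loc=5., size=5)])
#   benjamini_hochberg(z, 0.05)  # indices of the rejected (discovered) tests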