https://github.com/tansey/smoothfdr
Revision c5b693d0a66e83c9387433b33c0eab481bd4a763, authored and committed by Wesley Tansey on 08 May 2020, 15:42:20 UTC
Fixed bug in easy that created too large a support for the alternative distribution
utils.py
import numpy as np
import csv
from collections import defaultdict
from scipy.sparse import csc_matrix, lil_matrix
import scipy.stats as st

class ProxyDistribution:
    '''Simple proxy distribution to enable specifying signal distributions from the command-line'''
    def __init__(self, name, pdf_method, sample_method):
        self.name = name
        self.pdf_method = pdf_method
        self.sample_method = sample_method

    def pdf(self, x):
        return self.pdf_method(x)

    def sample(self, count=1):
        if count == 1:
            return self.sample_method()
        return np.array([self.sample_method() for _ in range(count)])

    def __repr__(self):
        return self.name
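
# Illustrative usage of ProxyDistribution -- a minimal sketch; the Laplace signal
# below is an arbitrary choice, not something this module prescribes:
#   signal_dist = ProxyDistribution('Laplace(0,1)',
#                                   lambda x: st.laplace.pdf(x),
#                                   lambda: np.random.laplace())
#   signal_dist.pdf(0.5)    # signal density at 0.5
#   signal_dist.sample(10)  # array of 10 draws from the signal distribution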

def generate_data_helper(flips, null_mean, null_stdev, signal_dist):
    '''Recursively builds multi-dimensional datasets.'''
    if len(flips.shape) > 1:
        return np.array([generate_data_helper(row, null_mean, null_stdev, signal_dist) for row in flips])

    # If we're on the last dimension, return the vector
    return np.array([signal_dist.sample() if flip else 0 for flip in flips]) + np.random.normal(loc=null_mean, scale=null_stdev, size=len(flips))

def generate_data(null_mean, null_stdev, signal_dist, signal_weights):
    '''Create a synthetic dataset.'''
    # Flip biased coins to decide which distribution to draw each sample from
    flips = np.random.random(size=signal_weights.shape) < signal_weights

    # Recursively generate the dataset
    samples = generate_data_helper(flips, null_mean, null_stdev, signal_dist)

    # Observed z-scores
    z = (samples - null_mean) / null_stdev

    return (z, flips)
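
# Illustrative usage (sketch, reusing the hypothetical signal_dist from above):
#   signal_weights = np.full((20, 20), 0.1)  # 10% prior signal probability at every site
#   z, flips = generate_data(0., 1., signal_dist, signal_weights)
#   # z holds the observed z-scores; flips marks which sites carry true signal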

def calc_fdr(probs, fdr_level):
    '''Calculates the detected signals at a specific false discovery rate given the posterior probabilities of each point.'''
    pshape = probs.shape
    if len(probs.shape) > 1:
        probs = probs.flatten()
    post_orders = np.argsort(probs)[::-1]
    avg_fdr = 0.0
    end_fdr = 0
    
    for idx in post_orders:
        test_fdr = (avg_fdr * end_fdr + (1.0 - probs[idx])) / (end_fdr + 1.0)
        if test_fdr > fdr_level:
            break
        avg_fdr = test_fdr
        end_fdr += 1

    is_finding = np.zeros(probs.shape, dtype=int)
    is_finding[post_orders[0:end_fdr]] = 1
    if len(pshape) > 1:
        is_finding = is_finding.reshape(pshape)
    return is_finding
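
# Illustrative usage (sketch; `posteriors` is a hypothetical array of posterior
# signal probabilities, e.g. as produced by the smoothed FDR model):
#   posteriors = np.random.uniform(size=(20, 20))
#   signals = calc_fdr(posteriors, 0.10)  # 0/1 mask of discoveries at a 10% FDR target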

def filter_nonrectangular_data(data, filter_value=0):
    '''Converts the 2-d data matrix to a vector containing only the values different from the filter value.'''
    x = data != filter_value
    nonrect_vals = np.arange(x.sum())
    nonrect_to_data = np.zeros(data.shape, dtype=int) - 1
    data_to_nonrect = np.where(x.T)
    data_to_nonrect = (data_to_nonrect[1],data_to_nonrect[0])
    nonrect_to_data[data_to_nonrect] = nonrect_vals
    nonrect_data = data[x]
    return (nonrect_data, nonrect_to_data, data_to_nonrect)
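
# Illustrative usage (sketch; zeros mark cells that lie outside the region of interest):
#   grid = np.array([[0., 1.2], [0.7, 0.]])
#   vals, nonrect_to_data, data_to_nonrect = filter_nonrectangular_data(grid)
#   # vals holds the retained entries; nonrect_to_data is -1 at filtered cells and
#   # otherwise indexes into the flattened vector; data_to_nonrect gives the
#   # (row, column) coordinates of the retained cells in column-major order.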

def sparse_2d_penalty_matrix(data_shape, nonrect_to_data=None):
    '''Create a sparse 2-d penalty matrix. Optionally takes a map to corrected indices, useful when dealing with non-rectangular data.'''
    row_counter = 0
    data = []
    row = []
    col = []

    if nonrect_to_data is not None:
        for j in range(data_shape[1]):
            for i in range(data_shape[0]-1):            
                idx1 = nonrect_to_data[i,j]
                idx2 = nonrect_to_data[i+1,j]
                if idx1 < 0 or idx2 < 0:
                    continue
                row.append(row_counter)
                col.append(idx1)
                data.append(-1.)

                row.append(row_counter)
                col.append(idx2)
                data.append(1.)

                row_counter += 1
        for j in range(data_shape[1]-1):
            for i in range(data_shape[0]):
                idx1 = nonrect_to_data[i,j]
                idx2 = nonrect_to_data[i,j+1]
                if idx1 < 0 or idx2 < 0:
                    continue
                row.append(row_counter)
                col.append(idx1)
                data.append(-1.)

                row.append(row_counter)
                col.append(idx2)
                data.append(1.)

                row_counter += 1
    else:
        for j in range(data_shape[1]):
            for i in range(data_shape[0] - 1):
                idx1 = i+j*data_shape[0]
                idx2 = i+j*data_shape[0]+1

                row.append(row_counter)
                col.append(idx1)
                data.append(-1.)

                row.append(row_counter)
                col.append(idx2)
                data.append(1.)

                row_counter += 1

        col_counter = 0
        for i in range(data_shape[0]):
            for j in range(data_shape[1] - 1):
                idx1 = col_counter
                idx2 = col_counter+data_shape[0]

                row.append(row_counter)
                col.append(idx1)
                data.append(-1.)

                row.append(row_counter)
                col.append(idx2)
                data.append(1.)

                row_counter += 1
                col_counter += 1

    num_rows = row_counter
    num_cols = max(col) + 1
    return csc_matrix((data, (row, col)), shape=(num_rows, num_cols))
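
# Illustrative usage (sketch): for a 3x3 grid this builds the 12x9 graph-difference
# operator with one (-1, +1) pair per vertically or horizontally adjacent pair of
# cells, assuming the grid is flattened in column-major (Fortran) order:
#   D = sparse_2d_penalty_matrix((3, 3))
#   D.shape  # (12, 9)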
    
def sparse_1d_penalty_matrix(data_len):
    penalties = np.eye(data_len, dtype=float)[0:-1] * -1
    for i in range(len(penalties)):
        penalties[i,i+1] = 1
    return csc_matrix(penalties)
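
# Illustrative usage (sketch): the 1-d penalty matrix is the (n-1) x n first-difference operator.
#   sparse_1d_penalty_matrix(4).toarray()
#   # [[-1.,  1.,  0.,  0.],
#   #  [ 0., -1.,  1.,  0.],
#   #  [ 0.,  0., -1.,  1.]]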

def cube_trails(xmax, ymax, zmax):
    '''Produces a list of trails following a simple row/col/aisle split strategy for a cube.'''
    trails = []
    for x in range(xmax):
        for y in range(ymax):
            trails.append([x * ymax * zmax + y * zmax + z for z in range(zmax)])
    for y in range(ymax):
        for z in range(zmax):
            trails.append([x * ymax * zmax + y * zmax + z for x in range(xmax)])
    for z in range(zmax):
        for x in range(xmax):
            trails.append([x * ymax * zmax + y * zmax + z for y in range(ymax)])
    return trails
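
# Illustrative usage (sketch): for a 2x2x2 cube this yields 12 trails of length 2,
# one per row, column, and aisle, with voxels indexed in C (row-major) order:
#   trails = cube_trails(2, 2, 2)
#   len(trails)  # 12
#   trails[0]    # [0, 1]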

def val_present(data, x, missing_val):
    '''Returns True if the entry of data at index x is present, i.e. not equal to missing_val.'''
    return missing_val is None or data[x] != missing_val

def cube_edges(data, missing_val=None):
    '''Produces a list of edges for a cube with potentially missing data.
    If missing_val is specified, entries with that value will be considered
    missing and no edges will be connected to them.'''
    edges = []
    xmax, ymax, zmax = data.shape
    for y in range(ymax):
        for z in range(zmax):
            edges.extend([((x1, y, z), (x2, y, z)) 
                            for x1, x2 in zip(range(data.shape[0]-1), range(1,data.shape[0]))
                            if missing_val is None or (data[x1,y,z] != missing_val and data[x2,y,z] != missing_val)])
    for x in range(xmax):
        for z in range(zmax):
            edges.extend([((x, y1, z), (x, y2, z))
                            for y1, y2 in zip(range(data.shape[1]-1), range(1,data.shape[1]))
                            if missing_val is None or (data[x,y1,z] != missing_val and data[x,y2,z] != missing_val)])
    for x in range(xmax):
        for y in range(ymax):
            edges.extend([((x, y, z1), (x, y, z2)) 
                            for z1, z2 in zip(range(data.shape[2]-1), range(1,data.shape[2]))
                            if missing_val is None or (data[x,y,z1] != missing_val and data[x,y,z2] != missing_val)])
    return edges
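
# Illustrative usage (sketch; -1 is a hypothetical missing-value marker):
#   vol = np.zeros((2, 2, 2))
#   vol[0, 0, 0] = -1
#   edges = cube_edges(vol, missing_val=-1)
#   # edges is a list of ((x1,y1,z1), (x2,y2,z2)) pairs; none of them touch (0, 0, 0)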

def cube_trails_missing_helper(data, trails, cur_trail, v1, v2, missing_val):
    if data[v1] == missing_val or data[v2] == missing_val:
        if len(cur_trail) > 1:
            trails.append(cur_trail)
            cur_trail = []
    else:
        if len(cur_trail) == 0:
            cur_trail.append(v1)
        cur_trail.append(v2)
    return cur_trail

def cube_trails_missing(data, missing_val=None):
    '''Generates row/col/aisle trails for a cube when there may be missing data.'''
    trails = []
    xmax, ymax, zmax = data.shape
    for y in range(ymax):
        for z in range(zmax):
            cur_trail = []
            for x1, x2 in zip(range(data.shape[0]-1), range(1,data.shape[0])):
                v1 = (x1,y,z)
                v2 = (x2,y,z)
                cur_trail = cube_trails_missing_helper(data, trails, cur_trail, v1, v2, missing_val)
            if len(cur_trail) > 1:
                trails.append(cur_trail)
                
    for x in range(xmax):
        for z in range(zmax):
            cur_trail = []
            for y1, y2 in zip(range(data.shape[1]-1), range(1,data.shape[1])):
                v1 = (x,y1,z)
                v2 = (x,y2,z)
                cur_trail = cube_trails_missing_helper(data, trails, cur_trail, v1, v2, missing_val)
            if len(cur_trail) > 1:
                trails.append(cur_trail)

    for x in range(xmax):
        for y in range(ymax):
            cur_trail = []
            for z1, z2 in zip(range(data.shape[2]-1), range(1,data.shape[2])):
                v1 = (x, y, z1)
                v2 = (x, y, z2)
                cur_trail = cube_trails_missing_helper(data, trails, cur_trail, v1, v2, missing_val)
            if len(cur_trail) > 1:
                trails.append(cur_trail)
                            
    return trails
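
# Illustrative usage (sketch): like cube_trails, but trails are broken wherever a voxel
# equals missing_val, and vertices are returned as (x, y, z) tuples:
#   vol = np.zeros((4, 1, 1))
#   vol[1, 0, 0] = -1
#   cube_trails_missing(vol, missing_val=-1)
#   # [[(2, 0, 0), (3, 0, 0)]] -- the missing voxel cuts the single row into pieces,
#   # and only pieces with at least two vertices are kept as trails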


def load_trails(filename):
    # Open in text mode with newline='' so the csv module parses correctly on Python 3
    with open(filename, 'r', newline='') as f:
        reader = csv.reader(f)
        return load_trails_from_reader(reader)

def load_trails_from_reader(reader):
    trails = []
    breakpoints = []
    edges = defaultdict(list)
    for line in reader:
        if len(trails) > 0:
            breakpoints.append(len(trails))
        nodes = [int(x) for x in line]
        trails.extend(nodes)
        for n1,n2 in zip(nodes[:-1], nodes[1:]):
            edges[n1].append(n2)
            edges[n2].append(n1)
    if len(trails) > 0:
        breakpoints.append(len(trails))
    return (len(breakpoints), np.array(trails, dtype="int32"), np.array(breakpoints, dtype="int32"), edges)

def save_trails(filename, trails):
    # Open in text mode with newline='' so the csv module writes correctly on Python 3
    with open(filename, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerows(trails)
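
# Illustrative round trip (sketch; 'trails.csv' is a hypothetical path). Each CSV line
# holds one trail of node indices:
#   save_trails('trails.csv', [[0, 1, 2, 3], [4, 5, 6]])
#   ntrails, trails, breakpoints, edges = load_trails('trails.csv')
#   # ntrails = 2, trails = [0 1 2 3 4 5 6], breakpoints = [4 7], edges[1] = [0, 2]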

def pretty_str(p, decimal_places=2):
    '''Pretty-print a matrix or vector.'''
    if len(p.shape) == 1:
        return vector_str(p, decimal_places)
    if len(p.shape) == 2:
        return matrix_str(p, decimal_places)
    raise Exception('Invalid array with shape {0}'.format(p.shape))

def matrix_str(p, decimal_places=2):
    '''Pretty-print the matrix.'''
    return '[{0}]'.format("\n  ".join([vector_str(a, decimal_places) for a in p]))

def vector_str(p, decimal_places=2):
    '''Pretty-print the vector values.'''
    style = '{0:.' + str(decimal_places) + 'f}'
    return '[{0}]'.format(", ".join([style.format(a) for a in p]))

def mean_filter(pvals, edges, rescale=True):
    '''Given a list of p-values and their neighbors, applies a mean filter that
    replaces each p_i with p*_i, the mean of p_i and its neighbors.
    If rescale is true, each filtered value is multiplied by sqrt(#neighbors + 1)
    to restore unit variance.'''
    return np.array([np.mean(pvals[edges[i] + [i]]) * (np.sqrt(len(edges[i]) + 1) if rescale else 1) for i,p in enumerate(pvals)])

def median_filter(pvals, edges):
    '''Given a list of p-values and their neighbors, applies a median filter that
    replaces each p_i with p*_i, the median of p_i and its neighbors.'''
    return np.array([np.median(pvals[edges[i] + [i]]) for i,p in enumerate(pvals)])
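
# Illustrative usage (sketch): `edges` maps each index to a list of neighbor indices,
# e.g. the adjacency dict returned by load_trails:
#   pvals = np.array([0.01, 0.20, 0.90])
#   edges = {0: [1], 1: [0, 2], 2: [1]}
#   median_filter(pvals, edges)  # array([0.105, 0.2, 0.55])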

def _local_agg_fdr_helper(fdr_level, p_star, ghat, ghat_lambda, wstar_lambda, tmin, tmax, tmin_fdr, tmax_fdr, tol=1e-4):
    '''Finds the t-level via binary search. tol is an absolute tolerance on both t and its estimated FDR.'''
    if np.isclose(tmin, tmax, atol=tol) or np.isclose(tmin_fdr, tmax_fdr, atol=tol) or tmax_fdr <= fdr_level:
        return (tmax, tmax_fdr) if tmax_fdr <= fdr_level else (tmin, tmin_fdr)
    tmid = (tmax + tmin) / 2.
    tmid_fdr = wstar_lambda * ghat(p_star, tmid) / (max(1,(p_star < tmid).sum()) * (1-ghat_lambda))
    print('t: [{0}, {1}, {2}] => fdr: [{3}, {4}, {5}]'.format(tmin, tmid, tmax, tmin_fdr, tmid_fdr, tmax_fdr))
    if tmid_fdr <= fdr_level:
        return _local_agg_fdr_helper(fdr_level, p_star, ghat, ghat_lambda, wstar_lambda, tmid, tmax, tmid_fdr, tmax_fdr)
    return _local_agg_fdr_helper(fdr_level, p_star, ghat, ghat_lambda, wstar_lambda, tmin, tmid, tmin_fdr, tmid_fdr)

def local_agg_fdr(pvals, edges, fdr_level, lmbda = 0.1):
    '''Given a list of p-values and the graph connecting them, applies a median
    filter to locally aggregate them and then performs a corrected FDR procedure
    from Zhang, Fan, and Yu (Annals of Statistics, 2011). lmbda is a tuning
    constant typically set to 0.1.'''
    p_star = median_filter(pvals, edges) # aggregate p-values
    ghat = lambda p, t: (p >= (1-t)).sum() / max(1., (2.0 * (p > 0.5).sum() + (p==0.5).sum())) # empirical null CDF
    wstar_lambda = (p_star > lmbda).sum() # number of nonrejects at the level lambda
    ghat_lambda = ghat(p_star, lmbda) # empirical null CDF at rejection level lambda    
    # Use binary search to find the highest t value that satisfies the fdr level
    tmin = 0.
    tmax = 1.
    tmin_fdr = wstar_lambda * ghat(p_star, tmin) / (max(1,(p_star < tmin).sum()) * (1-ghat_lambda))
    tmax_fdr = wstar_lambda * ghat(p_star, tmax) / (max(1,(p_star < tmax).sum()) * (1-ghat_lambda))
    t, tfdr = _local_agg_fdr_helper(fdr_level, p_star, ghat, ghat_lambda, wstar_lambda, tmin, tmax, tmin_fdr, tmax_fdr)
    print('t: {0} tfdr: {1}'.format(t, tfdr))
    # Returns the indices of all discoveries
    return np.where(p_star < t)[0]
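
# Illustrative usage (sketch, reusing the pvals/edges example above):
#   discoveries = local_agg_fdr(pvals, edges, fdr_level=0.1)
#   # indices whose locally aggregated p-value falls below the data-driven threshold t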

def p_value(z, mu0=0., sigma0=1.):
    return 2*(1.0 - st.norm.cdf(np.abs((z - mu0) / sigma0)))
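
# e.g. p_value(np.array([1.96, 3.0])) is approximately [0.05, 0.0027] (two-sided normal p-values)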

def benjamini_hochberg(z, fdr, mu0=0., sigma0=1.):
    '''Performs Benjamini-Hochberg multiple hypothesis testing on z at the given false discovery rate threshold.'''
    z_shape = z.shape if len(z.shape) > 1 else None
    if z_shape is not None:
        z = z.flatten()
    p = p_value(z, mu0=mu0, sigma0=sigma0)
    p_orders = np.argsort(p)
    m = float(len(p_orders))
    # Find the largest k such that p_(k) <= (k / m) * fdr, then reject the k smallest p-values
    k_max = 0
    for k, s in enumerate(p_orders):
        if p[s] <= (k+1) / m * fdr:
            k_max = k + 1
    discoveries = p_orders[0:k_max]
    if z_shape is not None:
        x = np.zeros(z.shape)
        x[discoveries] = 1
        discoveries = np.where(x.reshape(z_shape) == 1)
    return discoveries
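
# Illustrative usage (sketch):
#   z = np.random.normal(size=100)
#   z[:5] += 4.  # hypothetical strong signals
#   benjamini_hochberg(z, fdr=0.1)  # indices of the rejected (discovered) hypotheses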













