import numpy as np
from networkx import Graph

from pygfl.solver import TrailSolver
from pygfl.trails import decompose_graph, save_chains
from pygfl.utils import chains_to_trails, calc_plateaus, hypercube_edges

from smoothfdr.smoothed_fdr import GaussianKnown
from smoothfdr.normix import *
from smoothfdr.utils import calc_fdr


def smooth_fdr(data, fdr_level, edges=None, initial_values=None, verbose=0, null_dist=None, signal_dist=None, num_sweeps=10, missing_val=None):
    '''FDR smoothing where the null and signal distributions are estimated from the data
    (empirical null and predictive recursion, respectively) unless they are supplied.
    Returns the smoothed priors, posteriors, and discoveries at the given FDR level.'''
    flat_data = data.flatten()
    nonmissing_flat_data = flat_data

    if edges is None:
        if verbose:
            print('Using default edge set of a grid of same shape as the data: {0}'.format(data.shape))
        edges = hypercube_edges(data.shape)
        if missing_val is not None:
            if verbose:
                print('Removing all data points whose data value is {0}'.format(missing_val))
            edges = [(e1, e2) for (e1, e2) in edges if flat_data[e1] != missing_val and flat_data[e2] != missing_val]
            nonmissing_flat_data = flat_data[flat_data != missing_val]

    # Decompose the graph into trails
    g = Graph()
    g.add_edges_from(edges)
    chains = decompose_graph(g, heuristic='greedy')
    ntrails, trails, breakpoints, edges = chains_to_trails(chains)

    # Estimate the null distribution (empirical null unless one was supplied)
    if null_dist is None:
        mu0, sigma0 = empirical_null(nonmissing_flat_data, verbose=max(0, verbose - 1))
    elif isinstance(null_dist, GaussianKnown):
        mu0, sigma0 = null_dist.mean, null_dist.stdev
    else:
        mu0, sigma0 = null_dist
    null_dist = GaussianKnown(mu0, sigma0)

    if verbose:
        print('Empirical null: {0}'.format(null_dist))

    # Estimate the signal distribution via predictive recursion
    if verbose:
        print('Running predictive recursion for {0} sweeps'.format(num_sweeps))
    if signal_dist is None:
        grid_x = np.linspace(min(-20, nonmissing_flat_data.min() - 1), max(nonmissing_flat_data.max() + 1, 20), 220)
        pr_results = predictive_recursion(nonmissing_flat_data, num_sweeps, grid_x, mu0=mu0, sig0=sigma0)
        signal_dist = GridDistribution(pr_results['grid_x'], pr_results['y_signal'])

    if verbose:
        print('Smoothing priors via solution path algorithm')

    solver = TrailSolver()
    solver.set_data(flat_data, edges, ntrails, trails, breakpoints)

    results = solution_path_smooth_fdr(flat_data, solver, null_dist, signal_dist, verbose=max(0, verbose - 1))

    results['discoveries'] = calc_fdr(results['posteriors'], fdr_level)
    results['null_dist'] = null_dist
    results['signal_dist'] = signal_dist

    # Reshape everything back to the original data shape
    results['betas'] = results['betas'].reshape(data.shape)
    results['priors'] = results['priors'].reshape(data.shape)
    results['posteriors'] = results['posteriors'].reshape(data.shape)
    results['discoveries'] = results['discoveries'].reshape(data.shape)
    results['beta_iters'] = np.array([x.reshape(data.shape) for x in results['beta_iters']])
    results['prior_iters'] = np.array([x.reshape(data.shape) for x in results['prior_iters']])
    results['posterior_iters'] = np.array([x.reshape(data.shape) for x in results['posterior_iters']])

    return results
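
# Editorial sketch (not part of the original module): `edges` may be any iterable of
# (i, j) pairs that index into the *flattened* data array, so irregular graphs work as
# well as the default hypercube grid. The helper below is purely illustrative; the chain
# graph, effect size, and parameter values are arbitrary choices.
def _example_chain_graph(n=100, fdr_level=0.1):
    '''Illustrative only: run FDR smoothing over a 1-D chain graph of synthetic z-scores.'''
    z = np.random.normal(size=n)
    z[40:60] += 2.0  # a contiguous block of elevated signal
    chain_edges = [(i, i + 1) for i in range(n - 1)]
    return smooth_fdr(z, fdr_level, edges=chain_edges)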

def smooth_fdr_known_dists(data, fdr_level, null_dist, signal_dist, edges=None, initial_values=None, verbose=0, missing_val=None):
    '''FDR smoothing where the null and alternative distributions are known (and not
    necessarily Gaussian). Both distributions must expose a `pdf` method.'''
    flat_data = data.flatten()
    nonmissing_flat_data = flat_data

    if edges is None:
        if verbose:
            print('Using default edge set of a grid of same shape as the data: {0}'.format(data.shape))
        edges = hypercube_edges(data.shape)
        if missing_val is not None:
            if verbose:
                print('Removing all data points whose data value is {0}'.format(missing_val))
            edges = [(e1, e2) for (e1, e2) in edges if flat_data[e1] != missing_val and flat_data[e2] != missing_val]
            nonmissing_flat_data = flat_data[flat_data != missing_val]

    # Decompose the graph into trails
    g = Graph()
    g.add_edges_from(edges)
    chains = decompose_graph(g, heuristic='greedy')
    ntrails, trails, breakpoints, edges = chains_to_trails(chains)

    if verbose:
        print('Smoothing priors via solution path algorithm')

    solver = TrailSolver()
    solver.set_data(flat_data, edges, ntrails, trails, breakpoints)

    results = solution_path_smooth_fdr(flat_data, solver, null_dist, signal_dist, verbose=max(0, verbose - 1))

    results['discoveries'] = calc_fdr(results['posteriors'], fdr_level)
    results['null_dist'] = null_dist
    results['signal_dist'] = signal_dist

    # Reshape everything back to the original data shape
    results['betas'] = results['betas'].reshape(data.shape)
    results['priors'] = results['priors'].reshape(data.shape)
    results['posteriors'] = results['posteriors'].reshape(data.shape)
    results['discoveries'] = results['discoveries'].reshape(data.shape)
    results['beta_iters'] = np.array([x.reshape(data.shape) for x in results['beta_iters']])
    results['prior_iters'] = np.array([x.reshape(data.shape) for x in results['prior_iters']])
    results['posterior_iters'] = np.array([x.reshape(data.shape) for x in results['posterior_iters']])

    return results
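
# Editorial sketch (not part of the original module): any object with a vectorized
# `pdf(data)` method can be passed as `null_dist` or `signal_dist` above. The helper
# below is purely illustrative and assumes scipy is available; the Laplace alternative
# is an arbitrary choice, not part of the smoothfdr API.
def _example_known_dists(z, fdr_level=0.1):
    '''Illustrative only: standard-normal null, heavy-tailed Laplace alternative.'''
    from scipy.stats import laplace

    class LaplaceSignal(object):
        '''Toy alternative distribution exposing the required pdf(data) interface.'''
        def __init__(self, loc=0., scale=2.):
            self._dist = laplace(loc=loc, scale=scale)

        def pdf(self, data):
            return self._dist.pdf(data)

    return smooth_fdr_known_dists(z, fdr_level, GaussianKnown(0., 1.), LaplaceSignal())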

def solution_path_smooth_fdr(data, solver, null_dist, signal_dist, min_lambda=0.20, max_lambda=1.5, lambda_bins=30, verbose=0, initial_values=None):
    '''Follows the solution path of the generalized lasso over a grid of lambda values
    and selects the best fit by BIC.'''
    lambda_grid = np.exp(np.linspace(np.log(max_lambda), np.log(min_lambda), lambda_bins))
    aic_trace = np.zeros(lambda_grid.shape)  # The AIC score for each lambda value
    aicc_trace = np.zeros(lambda_grid.shape)  # The AICc score for each lambda value (correcting for finite sample size)
    bic_trace = np.zeros(lambda_grid.shape)  # The BIC score for each lambda value
    dof_trace = np.zeros(lambda_grid.shape)  # The degrees of freedom of each final solution
    log_likelihood_trace = np.zeros(lambda_grid.shape)
    beta_trace = []
    u_trace = []
    w_trace = []
    c_trace = []
    results_trace = []
    best_idx = None
    best_plateaus = None

    for i, _lambda in enumerate(lambda_grid):
        if verbose:
            print('#{0} Lambda = {1}'.format(i, _lambda))

        # Fit the model at this lambda, warm-starting from the previous solution
        results = fixed_penalty_smooth_fdr(data, solver, _lambda, null_dist, signal_dist, verbose=max(0, verbose - 1), initial_values=initial_values)

        if verbose:
            print('Calculating degrees of freedom')

        plateaus = calc_plateaus(results['beta'], solver.edges)
        dof_trace[i] = len(plateaus)

        if verbose:
            print('Calculating AIC')

        # Get the negative log-likelihood
        log_likelihood_trace[i] = -_data_negative_log_likelihood(data, results['c'], null_dist, signal_dist)

        # Calculate AIC = 2k - 2ln(L)
        aic_trace[i] = 2. * dof_trace[i] - 2. * log_likelihood_trace[i]

        # Calculate AICc = AIC + 2k * (k+1) / (n - k - 1)
        aicc_trace[i] = aic_trace[i] + 2 * dof_trace[i] * (dof_trace[i] + 1) / (data.shape[0] - dof_trace[i] - 1.)

        # Calculate BIC = -2ln(L) + k * (ln(n) - ln(2pi))
        bic_trace[i] = -2 * log_likelihood_trace[i] + dof_trace[i] * (np.log(len(data)) - np.log(2 * np.pi))

        # Track the best model thus far
        if best_idx is None or bic_trace[i] < bic_trace[best_idx]:
            best_idx = i
            best_plateaus = plateaus

        # Save the final run parameters to use for warm-starting the next iteration
        initial_values = results

        # Save the trace of all the resulting parameters
        beta_trace.append(results['beta'])
        w_trace.append(results['w'])
        c_trace.append(results['c'])

        if verbose:
            print('DoF: {0} AIC: {1} AICc: {2} BIC: {3}'.format(dof_trace[i], aic_trace[i], aicc_trace[i], bic_trace[i]))

    if verbose:
        print('Best setting (by BIC): lambda={0} [DoF: {1}, AIC: {2}, AICc: {3}, BIC: {4}]'.format(lambda_grid[best_idx], dof_trace[best_idx], aic_trace[best_idx], aicc_trace[best_idx], bic_trace[best_idx]))

    return {'aic': aic_trace,
            'aicc': aicc_trace,
            'bic': bic_trace,
            'dof': dof_trace,
            'loglikelihood': log_likelihood_trace,
            'beta_iters': np.array(beta_trace),
            'posterior_iters': np.array(w_trace),
            'prior_iters': np.array(c_trace),
            'lambda_iters': lambda_grid,
            'best': best_idx,
            'betas': beta_trace[best_idx],
            'priors': c_trace[best_idx],
            'posteriors': w_trace[best_idx],
            'lambda': lambda_grid[best_idx],
            'plateaus': best_plateaus}


def fixed_penalty_smooth_fdr(data, solver, _lambda, null_dist, signal_dist, initial_values=None, verbose=0):
    '''Run the EM algorithm for a single, fixed value of the fused-lasso penalty `_lambda`.'''
    converge = 1e-6
    max_steps = 30
    m_steps = 1
    m_converge = 1e-6

    w_iters = []
    beta_iters = []
    c_iters = []
    delta_iters = []

    delta = converge + 1

    if initial_values is None:
        beta = np.zeros(data.shape)
        prior_prob = np.exp(beta) / (1 + np.exp(beta))
    else:
        beta = initial_values['beta']
        prior_prob = initial_values['c']

    prev_nll = 0
    cur_step = 0

    while delta > converge and cur_step < max_steps:
        if verbose:
            print('Step #{0}'.format(cur_step))
            print('\tE-step...')

        # Get the likelihood weights vector (E-step)
        post_prob = _e_step(data, prior_prob, null_dist, signal_dist)

        if verbose:
            print('\tM-step...')

        # Find beta using an alternating Taylor approximation and convex optimization (M-step)
        beta, initial_values = _m_step(beta, prior_prob, post_prob, _lambda, solver, m_converge, m_steps, max(0, verbose - 1), initial_values)

        # Get the signal probabilities
        prior_prob = ilogit(beta)

        cur_nll = _data_negative_log_likelihood(data, prior_prob, null_dist, signal_dist)

        # Track the change in log-likelihood to see if we've converged
        delta = np.abs(cur_nll - prev_nll) / (prev_nll + converge)

        if verbose:
            print('\tDelta: {0}'.format(delta))

        # Track the step
        w_iters.append(post_prob)
        beta_iters.append(beta)
        c_iters.append(prior_prob)
        delta_iters.append(delta)

        # Increment the step counter
        cur_step += 1

        # Update the negative log-likelihood tracker
        prev_nll = cur_nll

        # DEBUGGING
        if verbose:
            print('\tbeta: [{0:.4f}, {1:.4f}]'.format(beta.min(), beta.max()))
            print('\tprior_prob: [{0:.4f}, {1:.4f}]'.format(prior_prob.min(), prior_prob.max()))
            print('\tpost_prob: [{0:.4f}, {1:.4f}]'.format(post_prob.min(), post_prob.max()))

    w_iters = np.array(w_iters)
    beta_iters = np.array(beta_iters)
    c_iters = np.array(c_iters)
    delta_iters = np.array(delta_iters)

    # Return the results of the run
    return {'beta': beta,
            'w': post_prob,
            'c': prior_prob,
            'z': initial_values['z'],
            'u': initial_values['u'],
            'w_iters': w_iters,
            'beta_iters': beta_iters,
            'c_iters': c_iters,
            'delta_iters': delta_iters}
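
# Editorial note (not part of the original module), summarizing the two-groups model
# implemented by the helpers below: each site i has prior signal probability
# c_i = ilogit(beta_i). Its statistic z_i is drawn from signal_dist with probability c_i
# and from null_dist otherwise, so the E-step posterior is
#     w_i = c_i * f1(z_i) / (c_i * f1(z_i) + (1 - c_i) * f0(z_i)),
# and the M-step refits beta under the graph-fused-lasso penalty by solving a weighted
# least-squares surrogate obtained from a second-order Taylor expansion of the logistic
# log-likelihood.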

def _data_negative_log_likelihood(data, prior_prob, null_dist, signal_dist):
    '''Calculate the negative log-likelihood of the data given the weights.'''
    signal_weight = prior_prob * signal_dist.pdf(data)
    null_weight = (1 - prior_prob) * null_dist.pdf(data)
    return -np.log(signal_weight + null_weight).sum()


def _e_step(data, prior_prob, null_dist, signal_dist):
    '''Calculate the complete-data sufficient statistics (weights vector).'''
    signal_weight = prior_prob * signal_dist.pdf(data)
    null_weight = (1 - prior_prob) * null_dist.pdf(data)
    post_prob = signal_weight / (signal_weight + null_weight)
    return post_prob


def _m_step(beta, prior_prob, post_prob, _lambda, solver, converge, max_steps, verbose, initial_values):
    '''Alternating second-order Taylor-series expansion about the current iterate.'''
    prev_nll = _m_log_likelihood(post_prob, beta)
    delta = converge + 1
    cur_step = 0

    while delta > converge and cur_step < max_steps:
        if verbose:
            print('\t\tM-Step iteration #{0}'.format(cur_step))
            print('\t\tTaylor approximation...')

        # Cache the exponentiated beta
        exp_beta = np.exp(beta)

        # Form the parameters for our weighted least squares
        weights = (prior_prob * (1 - prior_prob))
        y = beta - (prior_prob - post_prob) / weights

        solver.set_values_only(y, weights=weights)
        if initial_values is None:
            initial_values = {'beta': solver.beta, 'z': solver.z, 'u': solver.u}
        else:
            solver.beta = initial_values['beta']
            solver.z = initial_values['z']
            solver.u = initial_values['u']
        solver.solve(_lambda)
        # if np.abs(beta).max() > 20:
        #     beta = np.clip(beta, -20, 20)
        #     u = None
        beta = initial_values['beta']

        # Get the current log-likelihood
        cur_nll = _m_log_likelihood(post_prob, beta)

        # Track the convergence
        delta = np.abs(prev_nll - cur_nll) / (prev_nll + converge)

        if verbose:
            print('\t\tM-step delta: {0}'.format(delta))

        # Increment the step counter
        cur_step += 1

        # Update the negative log-likelihood tracker
        prev_nll = cur_nll

    return beta, initial_values


def _m_log_likelihood(post_prob, beta):
    '''Calculate the log-likelihood of the betas given the weights and data.'''
    return (np.log(1 + np.exp(beta)) - post_prob * beta).sum()


def ilogit(x):
    '''Inverse logit (sigmoid) function.'''
    return 1. / (1. + np.exp(-x))
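
# Editorial demo (not part of the original module): run FDR smoothing on a synthetic
# 2-D grid of z-scores with a square of elevated signal. The grid size, effect size,
# and FDR level below are arbitrary illustrative choices.
if __name__ == '__main__':
    np.random.seed(0)
    z = np.random.normal(size=(30, 30))
    z[10:20, 10:20] += 2.0  # a block of true signal in the middle of the grid
    out = smooth_fdr(z, fdr_level=0.1, verbose=1)
    print('Selected lambda: {0}'.format(out['lambda']))
    print('Discoveries: {0} sites'.format(int(out['discoveries'].sum())))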