Software Heritage Archive
Revision 8598e9e321cd47edfecd7e5e83482ee0e4486e94 authored by Wesley Tansey on 23 June 2016, 01:57:28 UTC, committed by Wesley Tansey on 23 June 2016, 01:57:28 UTC
Merge branch 'master' of github.com:tansey/smoothfdr
2 parents: 907bf5c + a7ab0a6
File: smoothfdr/easy.py
Permalinks

To reference or cite objects in the Software Heritage archive, use permalinks based on SoftWare Hash IDentifiers (SWHIDs):

  • revision: swh:1:rev:8598e9e321cd47edfecd7e5e83482ee0e4486e94
  • directory: swh:1:dir:a6c0fd806e00b75984345b63925b36633898d701
  • content: swh:1:cnt:a77ba2ea2a0208dd9409c6ec3dbd38a0e127af36
easy.py
import numpy as np
from networkx import Graph
from pygfl.solver import TrailSolver
from pygfl.trails import decompose_graph
from pygfl.utils import chains_to_trails, calc_plateaus
from smoothfdr.smoothed_fdr import GaussianKnown
from smoothfdr.normix import *  # expected to provide empirical_null, predictive_recursion, GridDistribution
from smoothfdr.utils import calc_fdr

def smooth_fdr(data, edges, fdr_level, initial_values=None, verbose=0):
    # Decompose the graph into trails
    g = Graph()
    g.add_edges_from(edges)
    chains = decompose_graph(g, heuristic='greedy')
    ntrails, trails, breakpoints, edges = chains_to_trails(chains)

    # empirical null estimation
    mu0, sigma0 = empirical_null(data)
    null_dist = GaussianKnown(mu0, sigma0)

    # signal distribution estimation
    num_sweeps = 10
    grid_x = np.linspace(min(-20, data.min() - 1), max(data.max() + 1, 20), 220)
    pr_results = predictive_recursion(data, num_sweeps, grid_x, mu0=mu0, sig0=sigma0)
    signal_dist = GridDistribution(pr_results['grid_x'], pr_results['y_signal'])
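    # At this point the two-groups model is fully specified: f0 (the empirical
    # null) and f1 (the predictive-recursion estimate of the signal density).
    # What remains is estimating the spatially smooth prior c over the graph.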

    solver = TrailSolver()
    solver.set_data(data, edges, ntrails, trails, breakpoints)

    results = solution_path_smooth_fdr(data, solver, null_dist, signal_dist, initial_values=initial_values, verbose=max(0, verbose - 1))

    results['discoveries'] = calc_fdr(results['posteriors'], fdr_level)

    return results

def solution_path_smooth_fdr(data, solver, null_dist, signal_dist, min_lambda=0.20, max_lambda=1.5, lambda_bins=30, verbose=0, initial_values=None):
    '''Follows the solution path of the generalized lasso to find the best lambda value.'''
    lambda_grid = np.exp(np.linspace(np.log(max_lambda), np.log(min_lambda), lambda_bins))
    aic_trace = np.zeros(lambda_grid.shape) # The AIC score for each lambda value
    aicc_trace = np.zeros(lambda_grid.shape) # The AICc score for each lambda value (correcting for finite sample size)
    bic_trace = np.zeros(lambda_grid.shape) # The BIC score for each lambda value
    dof_trace = np.zeros(lambda_grid.shape) # The degrees of freedom of each final solution
    log_likelihood_trace = np.zeros(lambda_grid.shape)
    beta_trace = []
    w_trace = []
    c_trace = []
    best_idx = None
    best_plateaus = None
    for i, _lambda in enumerate(lambda_grid):
        if verbose:
            print('#{0} Lambda = {1}'.format(i, _lambda))

        # Fit to the final values
        results = fixed_penalty_smooth_fdr(data, solver, _lambda, null_dist, signal_dist,
                                           verbose=max(0, verbose - 1),
                                           initial_values=initial_values)

        if verbose:
            print('Calculating degrees of freedom')

        # Use the number of fused plateaus as the effective degrees of freedom
        plateaus = calc_plateaus(results['beta'], solver.edges)
        dof_trace[i] = len(plateaus)

        if verbose:
            print('Calculating AIC')

        # Get the log-likelihood
        log_likelihood_trace[i] = -_data_negative_log_likelihood(data, results['c'], null_dist, signal_dist)

        # Calculate AIC = 2k - 2ln(L)
        aic_trace[i] = 2. * dof_trace[i] - 2. * log_likelihood_trace[i]

        # Calculate AICc = AIC + 2k(k+1) / (n - k - 1)
        aicc_trace[i] = aic_trace[i] + 2 * dof_trace[i] * (dof_trace[i] + 1) / (data.shape[0] - dof_trace[i] - 1.)

        # Calculate BIC = -2ln(L) + k * (ln(n) - ln(2pi))
        bic_trace[i] = -2 * log_likelihood_trace[i] + dof_trace[i] * (np.log(len(data)) - np.log(2 * np.pi))

        # Track the best model thus far
        if best_idx is None or bic_trace[i] < bic_trace[best_idx]:
            best_idx = i
            best_plateaus = plateaus

        # Save the final run parameters to use for warm-starting the next iteration
        initial_values = results

        # Save the trace of all the resulting parameters
        beta_trace.append(results['beta'])
        w_trace.append(results['w'])
        c_trace.append(results['c'])

        if verbose:
            print('DoF: {0} AIC: {1} AICc: {2} BIC: {3}'.format(dof_trace[i], aic_trace[i], aicc_trace[i], bic_trace[i]))

    if verbose:
        print('Best setting (by BIC): lambda={0} [DoF: {1}, AIC: {2}, AICc: {3}, BIC: {4}]'.format(lambda_grid[best_idx], dof_trace[best_idx], aic_trace[best_idx], aicc_trace[best_idx], bic_trace[best_idx]))

    return {'aic': aic_trace,
            'aicc': aicc_trace,
            'bic': bic_trace,
            'dof': dof_trace,
            'loglikelihood': log_likelihood_trace,
            'beta_iters': np.array(beta_trace),
            'posterior_iters': np.array(w_trace),
            'prior_iters': np.array(c_trace),
            'lambda_iters': lambda_grid,
            'best': best_idx,
            'betas': beta_trace[best_idx],
            'priors': c_trace[best_idx],
            'posteriors': w_trace[best_idx],
            'lambda': lambda_grid[best_idx],
            'plateaus': best_plateaus}

def fixed_penalty_smooth_fdr(data, solver, _lambda, null_dist, signal_dist, initial_values=None, verbose=0):
    converge = 1e-6
    max_steps = 30
    m_steps = 1
    m_converge = 1e-6

    w_iters = []
    beta_iters = []
    c_iters = []
    delta_iters = []    

    delta = converge + 1
        
    if initial_values is None:
        beta = np.zeros(data.shape)
        prior_prob = ilogit(beta)
    else:
        beta = initial_values['beta']
        prior_prob = initial_values['c']

    prev_nll = 0
    cur_step = 0
    
    while delta > converge and cur_step < max_steps:
        if verbose:
            print('Step #{0}'.format(cur_step))
            print('\tE-step...')

        # Get the likelihood weights vector (E-step)
        post_prob = _e_step(data, prior_prob, null_dist, signal_dist)

        if verbose:
            print('\tM-step...')

        # Find beta using an alternating Taylor approximation and convex optimization (M-step)
        beta, initial_values = _m_step(beta, prior_prob, post_prob, _lambda,
                                       solver, m_converge, m_steps,
                                       max(0,verbose-1), initial_values)

        # Get the signal probabilities
        prior_prob = ilogit(beta)
        cur_nll = _data_negative_log_likelihood(data, prior_prob, null_dist, signal_dist)
        
        # Track the change in log-likelihood to see if we've converged
        delta = np.abs(cur_nll - prev_nll) / (prev_nll + converge)

        if verbose:
            print('\tDelta: {0}'.format(delta))

        # Track the step
        w_iters.append(post_prob)
        beta_iters.append(beta)
        c_iters.append(prior_prob)
        delta_iters.append(delta)

        # Increment the step counter
        cur_step += 1

        # Update the negative log-likelihood tracker
        prev_nll = cur_nll

        # Debugging output
        if verbose:
            print('\tbeta:       [{0:.4f}, {1:.4f}]'.format(beta.min(), beta.max()))
            print('\tprior_prob: [{0:.4f}, {1:.4f}]'.format(prior_prob.min(), prior_prob.max()))
            print('\tpost_prob:  [{0:.4f}, {1:.4f}]'.format(post_prob.min(), post_prob.max()))
            
    w_iters = np.array(w_iters)
    beta_iters = np.array(beta_iters)
    c_iters = np.array(c_iters)
    delta_iters = np.array(delta_iters)

    # Return the results of the run
    return {'beta': beta, 'w': post_prob, 'c': prior_prob,
            'z': initial_values['z'], 'u': initial_values['u'],
            'w_iters': w_iters, 'beta_iters': beta_iters,
            'c_iters': c_iters, 'delta_iters': delta_iters}

def _data_negative_log_likelihood(data, prior_prob, null_dist, signal_dist):
    '''Calculate the negative log-likelihood of the data given the weights.'''
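    # In the two-groups model, the marginal density of each z_i is
    #   c_i * f1(z_i) + (1 - c_i) * f0(z_i),
    # with c_i = prior_prob[i], f1 = signal_dist.pdf, f0 = null_dist.pdf.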
    signal_weight = prior_prob * signal_dist.pdf(data)
    null_weight = (1-prior_prob) * null_dist.pdf(data)
    return -np.log(signal_weight + null_weight).sum()

def _e_step(data, prior_prob, null_dist, signal_dist):
    '''Calculate the complete-data sufficient statistics (weights vector).'''
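    # Bayes' rule gives the posterior signal probability at each site:
    #   w_i = c_i * f1(z_i) / (c_i * f1(z_i) + (1 - c_i) * f0(z_i)).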
    signal_weight = prior_prob * signal_dist.pdf(data)
    null_weight = (1-prior_prob) * null_dist.pdf(data)
    post_prob = signal_weight / (signal_weight + null_weight)
    return post_prob

def _m_step(beta, prior_prob, post_prob, _lambda,
                solver, converge, max_steps,
                verbose, initial_values):
    '''
    Alternating Second-order Taylor-series expansion about the current iterate
    and coordinate descent to optimize Beta.
    '''
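    # Each pass is an IRLS-style update: linearizing the logistic prior around
    # the current beta with c = ilogit(beta) gives curvature weights c(1 - c)
    # and working response y = beta - (c - w) / (c(1 - c)); the graph-fused-
    # lasso solver then minimizes the penalized weighted least squares surrogate.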
    prev_nll = _m_log_likelihood(post_prob, beta)
    delta = converge + 1
    cur_step = 0
    while delta > converge and cur_step < max_steps:
        if verbose:
            print('\t\tM-Step iteration #{0}'.format(cur_step))
            print('\t\tTaylor approximation...')

        # Form the parameters for our weighted least squares; clip the
        # logistic curvature weights away from zero so the working response
        # stays finite when the prior saturates at 0 or 1
        weights = np.clip(prior_prob * (1 - prior_prob), 1e-12, None)
        y = beta - (prior_prob - post_prob) / weights

        solver.set_values_only(y, weights=weights)
        if initial_values is None:
            initial_values = {'beta': solver.beta, 'z': solver.z, 'u': solver.u}
        else:
            solver.beta = initial_values['beta']
            solver.z = initial_values['z']
            solver.u = initial_values['u']
        solver.solve(_lambda)

        # The solver mutates its beta/z/u arrays in place, so initial_values
        # now references the updated iterate
        beta = initial_values['beta']

        # Get the current log-likelihood
        cur_nll = _m_log_likelihood(post_prob, beta)

        # Track the convergence
        delta = np.abs(prev_nll - cur_nll) / (prev_nll + converge)

        if verbose:
            print('\t\tM-step delta: {0}'.format(delta))

        # Increment the step counter
        cur_step += 1

        # Update the negative log-likelihood tracker
        prev_nll = cur_nll

    return beta, initial_values

def _m_log_likelihood(post_prob, beta):
    '''Calculate the negative log-likelihood of the betas given the weights:
    sum_i [ log(1 + exp(beta_i)) - w_i * beta_i ], i.e. a logistic loss with
    the E-step posteriors w_i as soft labels.'''
    # np.logaddexp(0, beta) computes log(1 + exp(beta)) without overflow
    return (np.logaddexp(0, beta) - post_prob * beta).sum()

def ilogit(x):
    return 1. / (1. + np.exp(-x))
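
# ----------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original module): builds a small
# chain graph with one elevated segment and runs the full pipeline end to end.
# The synthetic data and parameter choices here are illustrative assumptions.
# ----------------------------------------------------------------------------
if __name__ == '__main__':
    np.random.seed(42)

    # 100 z-scores on a chain graph; sites 40-59 carry a true signal
    n = 100
    edges = [(i, i + 1) for i in range(n - 1)]
    z = np.random.normal(0., 1., size=n)
    z[40:60] += 3.

    results = smooth_fdr(z, edges, fdr_level=0.10, verbose=1)
    print('Best lambda (by BIC): {0}'.format(results['lambda']))
    print('Discoveries: {0}'.format(np.where(results['discoveries'])[0]))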

