https://github.com/tansey/smoothfdr
Revision 49cb69c5a18fdb262964fbfeb47ab2099eb32c5c authored by Wesley Tansey on 03 May 2018, 19:46:59 UTC, committed by Wesley Tansey on 03 May 2018, 19:46:59 UTC
2 parents b713c52 + 63bb329
Raw File
Tip revision: 49cb69c5a18fdb262964fbfeb47ab2099eb32c5c authored by Wesley Tansey on 03 May 2018, 19:46:59 UTC
Merged
Tip revision: 49cb69c
gendata.py
import numpy as np
import os
import argparse
import csv
from subprocess import Popen

def make_directory(base, subdir):
    """Ensure that ``base/subdir`` exists and return its path with a trailing slash.

    The directory is created (including intermediate directories) if it does
    not already exist. Both the base and the returned path are normalized to
    end with '/' so callers can build file paths by plain concatenation.
    """
    prefix = base if base.endswith('/') else base + '/'
    directory = prefix + subdir
    if not os.path.exists(directory):
        os.makedirs(directory)
    return directory if directory.endswith('/') else directory + '/'

if __name__ == '__main__':
    # Command-line driver. Builds a directory tree of synthetic-data trials
    # under <scratch>/<experiment_name>/ and emits the shell scripts and Condor
    # submit files that (1) generate the data, (2) run each benchmark method
    # (smoothed FDR, hidden MRF, FDR-L, Benjamini-Hochberg, oracle), and
    # (3) score/aggregate the results. Nothing is executed here except copying
    # helper scripts into the experiment directory at the very end.
    # NOTE(review): this is Python 2 code (xrange; str written to 'wb' files).
    parser = argparse.ArgumentParser(description='Create batch synthetic data experiments.')
    parser.add_argument('experiment_name', help='The name of the experiment')
    parser.add_argument('--trials', type=int, default=30, help='The number of independent trials to run.')
    parser.add_argument('--scratch', default='/scratch/cluster/tansey/')
    
    parser.set_defaults()

    # Get the arguments from the command line
    args = parser.parse_args()

    exp_dir = make_directory(args.scratch, args.experiment_name)

    # Experiment grid: every combination of region x distribution x densities
    # gets its own sub-experiment directory with `--trials` independent trials.
    dims = (128, 128)
    # Each region is (name, (min_x, min_y), (max_x, max_y)) -- passed to gen2d
    # as --region_min_x/--region_max_x/--region_min_y/--region_max_y below.
    signal_regions = [('big', (30, 30), (90, 90)), ('small', (50, 50), (80, 80))]
    # Each entry is (gen2d signal_dist_name, L) where L is forwarded as the
    # second argument of the HMRF Condor job.
    signal_dists = [('well_separated', 2), ('flat_unimodal', 1)]
    # Signal density inside the region / noise density outside it.
    signal_densities = [1., 0.5]
    noise_densities = [0., 0.05]

    # One output script per pipeline stage. Files named *_jobs / *_run_jobs are
    # Condor submit files; the *.sh files are plain shell scripts.
    gen_script = open(exp_dir + 'gendata.sh', 'wb')
    sfdr_script = open(exp_dir + 'sfdr_jobs', 'wb')
    hmrf_prep1_script = open(exp_dir + 'hmrf_prep1.sh', 'wb')
    hmrf_prep2_script = open(exp_dir + 'hmrf_prep2.sh', 'wb')
    hmrf_run_script = open(exp_dir + 'hmrf_run_jobs', 'wb')
    hmrf_post_script = open(exp_dir + 'hmrf_post.sh', 'wb')
    fdrl_script = open(exp_dir + 'fdrl_jobs.sh', 'wb')
    bh_script = open(exp_dir + 'bh_jobs.sh', 'wb')
    oracle_script = open(exp_dir + 'oracle_jobs.sh', 'wb')
    score_script = open(exp_dir + 'score.sh', 'wb')

    # Condor submit-file header for the smoothed-FDR jobs.
    # NOTE(review): the 4-space indentation inside these triple-quoted strings
    # is written verbatim into the submit files -- presumably tolerated by
    # condor_submit; verify if jobs fail to parse.
    sfdr_script.write("""universe = vanilla
    Executable=/lusr/bin/python
    Requirements = InMastodon
    getenv = True
    +Group   = "GRAD"
    +Project = "AI_ROBOTICS"
    +ProjectDescription = "{0} sfdr benchmarks"
    """.format(args.experiment_name))

    # Per-trial Condor job stanza; {0} is filled with the trial directory.
    sfdr_job = """Log = {0}sfdr_job.log
    Arguments = sfdr_run.py --data_file {0}data.csv --no_data_header --save_weights {0}sfdr_weights.csv --save_posteriors {0}sfdr_posteriors.csv --save_plateaus {0}sfdr_plateaus.csv --save_signal {0}sfdr_estimated_signal.csv --save_discoveries {0}sfdr_discoveries.csv --empirical_null --estimate_signal --solution_path --dual_solver graph graph --trails {0}trails.csv
    Output = {0}sfdr_job.out
    Error = {0}sfdr_job.error
    Queue 1
    """

    # Condor submit-file header for the HMRF benchmark jobs.
    hmrf_run_script.write("""universe = vanilla
    Executable=run_hmrf.sh
    Requirements = InMastodon
    getenv = True
    +Group   = "GRAD"
    +Project = "AI_ROBOTICS"
    +ProjectDescription = "{0} hmrf benchmarks"
    """.format(args.experiment_name))

    # Per-trial HMRF job stanza; {0} = trial directory, {1} = the distribution's L.
    hmrf_job = """Log = {0}hmrf_job.log
    Arguments = {0} {1}
    Output = {0}hmrf_job.out
    Error = {0}hmrf_job.error
    Queue 1
    """

    # Enumerate the full experiment grid and append one entry per trial to
    # every stage script.
    for region_name, region_start, region_end in signal_regions:
        for signal_dist, L in signal_dists:
            for signal_density in signal_densities:
                for noise_density in noise_densities:
                    subexp_dir = make_directory(exp_dir, '{0}_{1}_{2}_{3}'.format(region_name, signal_dist, signal_density, noise_density))
                    for trial in xrange(args.trials):
                        # Make the directories for the trial
                        trial_dir = make_directory(subexp_dir, str(trial))
                        plot_dir = make_directory(trial_dir, 'plots')

                        # Write the script that generates the data
                        gen_script.write('echo {0}\n'.format(trial_dir))
                        gen_script.write('gen2d {0}data.csv {0}true_weights.csv {0}true_signals.csv {0}oracle_posteriors.csv {0}edges.csv {0}trails.csv --width {1} --height {2} --region_min_x {3} --region_max_x {4} --region_min_y {5} --region_max_y {6} --region_weights {7} --default_weight {8} --signal_dist_name {9} --plot {10}data.pdf'.format(trial_dir, dims[0], dims[1], region_start[0], region_end[0], region_start[1], region_end[1], signal_density, noise_density, signal_dist, plot_dir))
                        gen_script.write('\n\n')

                        # Write the script that runs the smoothed fdr algorithm
                        sfdr_script.write(sfdr_job.format(trial_dir))
                        sfdr_script.write('\n\n')

                        # Write the script that preps the data for the HMRF benchmark
                        hmrf_prep1_script.write('matlab -r "hmrfPrep1 {0}";\n'.format(trial_dir))
                        hmrf_prep2_script.write('python hmrfPrep2.py {0}\n\n'.format(trial_dir))

                        # Write the script that runs the HMRF routine
                        hmrf_run_script.write(hmrf_job.format(trial_dir, L))
                        hmrf_run_script.write('\n\n')

                        # Write the script that processes the results of the HMRF routine and converts it to discoveries
                        hmrf_post_script.write('python hmrfPost.py {0}\n\n'.format(trial_dir))

                        # Write the script that runs the FDR-L routine
                        fdrl_script.write('python fdrl_run.py {0}\n\n'.format(trial_dir))

                        # Write the script that runs the Benjamini-Hochberg routine
                        bh_script.write('Rscript run_bh.r {0}data.csv {0}bh_discoveries.csv\n\n'.format(trial_dir))

                        # Write the script that processes the results of the oracle and converts it to discoveries
                        oracle_script.write('python oracle_run.py {0}\n\n'.format(trial_dir))

                        # Write the script that tallies the results and calculates TPR and FDR for each trial
                        score_script.write('python score.py {0} {1}scores.csv\n\n'.format(trial_dir, subexp_dir))

                    # Aggregate all the independent trials
                    score_script.write('python aggregate_scores.py {0}scores.csv {0}aggregate_scores.csv\n\n'.format(subexp_dir))

    # Flush and close every generated script so the contents hit disk before
    # the cp commands below copy helpers into the same directory.
    gen_script.flush()
    gen_script.close()

    sfdr_script.flush()
    sfdr_script.close()

    hmrf_prep1_script.flush()
    hmrf_prep1_script.close()

    hmrf_prep2_script.flush()
    hmrf_prep2_script.close()

    hmrf_run_script.flush()
    hmrf_run_script.close()

    hmrf_post_script.flush()
    hmrf_post_script.close()

    fdrl_script.flush()
    fdrl_script.close()

    bh_script.flush()
    bh_script.close()

    oracle_script.flush()
    oracle_script.close()

    score_script.flush()
    score_script.close()


    # Create the surrogate script that condor uses to run the fdr smoothing algorithm on each job
    with open(exp_dir + 'sfdr_run.py', 'wb') as f:
        f.write('import sys; from smoothfdr import main; main()')

    # Copy the benchmark helper scripts from the sibling test/ directory into
    # the experiment directory. NOTE(review): Popen is fire-and-forget here --
    # the copies run asynchronously with shell globbing and their exit status
    # is never checked.
    test_dir = os.path.join(os.path.abspath(os.path.join(os.getcwd(), '..')), 'test/')
    Popen('cp {0} {1}'.format(test_dir + 'hmrf*.py', exp_dir), shell=True)
    Popen('cp {0} {1}'.format(test_dir + '*.m', exp_dir), shell=True)
    Popen('cp {0} {1}'.format(test_dir + 'fdrl*.py', exp_dir), shell=True)
    Popen('cp {0} {1}'.format(test_dir + 'score.py', exp_dir), shell=True)
    Popen('cp {0} {1}'.format(test_dir + 'oracle_run.py', exp_dir), shell=True)
    Popen('cp {0} {1}'.format(test_dir + 'aggregate_scores.py', exp_dir), shell=True)
    #Popen('cp {0} {1}'.format(test_dir + '*.mexa64', exp_dir), shell=True)


    # Human-readable run instructions dropped alongside the generated scripts.
    with open(exp_dir + 'README.txt', 'wb') as f:
        f.write("""Instructions for running benchmarks on Condor are as follows:
            -1) Make sure both pygfl and smoothfdr are properly installed and in your PATH. (local install: "pip install --user -e .")
            0) Go to the test directory, run matlab, and execute mex_setup()
            1) Run gendata.py [expname]
            2) Go to the [scratch]/[expname] directory
            3) Submit the sfdr jobs: condor_submit sfdr_jobs
            4) Make all the shell scripts runnable: chmod 777 *.sh
            5) Run the first hidden MRF prep script: ./hmrf_prep1.sh
            6) Run the second hidden MRF prep script: ./hmrf_prep2.sh
            7) Submit the hidden MRF jobs: condor_submit hmrf_run_jobs
            8) Run the hidden MRF post-processsing script: ./hmrf_post.sh
            9) Run the scoring script: ./score.sh
        """)
    

# python gen2d.py {0}data.csv {0}weights.csv {0}truth.csv {0}oracle_posteriors.csv {0}edges.csv --width {1} --height {2} --region_min_x {3} --region_max_x {4} --region_min_y {5} --region_max_y {6} --region_weights {7} --default_weight {8} --signal_dist_name {9} --plot {10}data.pdf

# trial_dir, dims[0], dims[1], region_start[0], region_end[0], region_start[1], region_end[1], signal_density, noise_density, signal_dist, plot_dir

#     # Plot results
#     parser.add_argument('--plot', help='Plot the resulting data and save to the specified file.')


# python __init__.py --data_file {0}data.csv --generate_data --signals_file {0}truth.csv --save_weights {0}weights.csv --save_posteriors {0}posteriors.csv --save_plateaus {0}plateaus.csv --save_signal {0}fdrs_discoveries.csv --save_oracle_posteriors {0}oracle_posteriors.csv --empirical_null --estimate_signal --solution_path 2d --width {1} --height
back to top