https://github.com/estherjulien/HybridML
Raw File
Tip revision: 9985e6d930e8b98eb03330a964c2c3fc8788630c authored by estherjulien on 01 August 2022, 11:54:59 UTC
HybridCode deleted from test_data_gen.py
Tip revision: 9985e6d
test_data_gen.py
from NetworkGen.NetworkToTree import *
from NetworkGen.LGT_network import *
from NetworkGen.tree_to_newick import *

from datetime import datetime
import pandas as pd
import numpy as np
import pickle
import time
import sys

'''
Code used for generating test instances
'''


def make_data_fun(net_num, l=20, exact=False, ret=None, num_trees=None):
    """Generate one test instance: a network, a tree set drawn from it, and metadata.

    A network is sampled repeatedly with ``simulation`` until it matches the
    requested size, then ``net_to_tree`` extracts a set of trees from it.  The
    trees are written out via ``tree_to_newick_fun`` and the network, tree set
    and a metadata series are pickled to
    ``Data/Test/inst_results/tree_data{tree_info}_{net_num}.pickle``.

    Parameters
    ----------
    net_num : int
        Job / instance identifier; used in log messages and output file names.
    l : int
        Required number of leaves of the generated network.
    exact : bool
        If True ("In-Sample" in the logs), the network must have exactly ``l``
        leaves and exactly ``ret`` reticulations, and ``num_trees`` defaults to
        ``2 ** ret_num`` after tree extraction.  If False ("Out-of-Sample"),
        the reticulation target is drawn at random from a range derived from
        ``num_trees`` and only the leaf count has to match.
    ret : int or None
        Required reticulation number; mandatory when ``exact`` is True.
    num_trees : int or None
        Number of trees to extract; mandatory (>= 1) when ``exact`` is False.

    Returns
    -------
    dict or None
        ``{"net": ..., "forest": ..., "metadata": pandas.Series}`` on success,
        or ``None`` if no matching network was found within 60 seconds.

    Raises
    ------
    ValueError
        If the argument required by the chosen mode is missing or out of range.
    """
    # Fail fast with a clear message instead of an obscure TypeError /
    # OverflowError deep inside the arithmetic below.
    if exact:
        if ret is None or ret < 0:
            raise ValueError("exact=True requires a non-negative reticulation number 'ret'")
    elif num_trees is None or num_trees < 1:
        raise ValueError("exact=False requires 'num_trees' to be an integer >= 1")

    # PARAMS OF LGT GENERATOR
    beta = 1
    distances = True

    # suffix shared by all output files of this instance
    if exact:
        tree_info = f"_L{l}_R{ret}_exact"
    else:
        tree_info = f"_L{l}_T{num_trees}"

    now = datetime.now().time()
    st = time.time()

    # make network
    network_gen_st = time.time()
    if exact:
        n = l - 2 + ret
        trials_per_n = 50
        print(f"JOB {net_num} ({now}): Start creating NETWORK (In-Sample, L = {l}, R = {ret}, n = {n})")
        while True:
            # smaller alpha range for larger leaf counts
            if l <= 20:
                alpha = np.random.uniform(0.1, 0.5)
            elif l <= 50:
                alpha = np.random.uniform(0.1, 0.3)
            else:
                alpha = np.random.uniform(0.1, 0.2)
            net, ret_num = simulation(n, alpha, 1, beta, ret)
            num_leaves = len(leaves(net))
            if num_leaves == l and ret_num == ret:
                break
            else:
                # after the trial budget for this n is used up, increase n
                if trials_per_n:
                    trials_per_n -= 1
                else:
                    trials_per_n = 20
                    n += 1
                    print(f"JOB {net_num} ({now}): Start creating NETWORK (In-Sample, L = {l}, R = {ret}, n = {n})")

            # give up after one minute of unsuccessful attempts
            if time.time() - network_gen_st > 60*1:
                print(f"JOB {net_num} ({now}): FAILED (In-Sample, L = {l}, R = {ret}, n = {n})")
                return None

    else:
        # randomize reticulation!
        min_ret = int(np.ceil(np.log2(num_trees)))
        max_ret = int(min([5*np.ceil(np.log2(num_trees)), 60]))
        # randint's upper bound is exclusive and must exceed the lower bound;
        # without the clamp num_trees == 1 gives randint(0, 0) and crashes.
        ret = np.random.randint(min_ret, max(max_ret, min_ret + 1))
        n = l - 2 + ret     # preferably a reticulation number of at least 3 + minimum
        print(min_ret, ret, max_ret)
        trials_per_n = 20
        print(f"JOB {net_num} ({now}): Start creating NETWORK (Out-of-Sample, L = {l}, T = {num_trees}, n = {n})")
        while True:
            alpha = np.random.uniform(0.3, 0.5)
            net, ret_num = simulation(n, alpha, 1, beta, ret)
            num_leaves = len(leaves(net))
            print(ret, ret_num, num_leaves, alpha)
            # only the leaf count has to match in this mode
            if num_leaves == l:
                break
            else:
                # after the trial budget for this n is used up, decrease n
                if trials_per_n:
                    trials_per_n -= 1
                else:
                    trials_per_n = 10
                    n -= 1
                    print(f"JOB {net_num} ({now}): Start creating NETWORK (Out-of-Sample, L = {l}, T = {num_trees}, n = {n})")

            # give up after one minute of unsuccessful attempts
            if time.time() - network_gen_st > 60*1:
                print(f"JOB {net_num} ({now}): FAILED (Out-of-Sample, L = {l}, T = {num_trees})")
                return None

    net_nodes = int(len(net.nodes))
    now = datetime.now().time()
    if exact:
        print(f"JOB {net_num} ({now}): Start creating TREE SET (L = {num_leaves}, T = {2**ret_num}, R = {ret_num})")
    else:
        print(f"JOB {net_num} ({now}): Start creating TREE SET (L = {num_leaves}, T = {num_trees}, R = {ret_num})")

    # NOTE(review): in exact mode num_trees is usually None here; presumably
    # net_to_tree then extracts every displayed tree -- confirm in NetworkToTree.
    tree_set, tree_lvs = net_to_tree(net, num_trees, distances=distances, net_lvs=num_leaves)

    if num_trees is None:
        num_trees = 2 ** ret_num

    tree_to_newick_fun(tree_set, net_num, tree_info=tree_info)

    tree_child = is_tree_child(net)
    metadata_index = ["exact", "rets", "nodes", "net_leaves", "tree_child", "chers", "ret_chers", "trees", "n", "alpha",
                      "beta", "runtime"]

    net_cher, net_ret_cher = network_cherries(net)
    # booleans are stored as 0.0 / 1.0 because the series is forced to float
    metadata = pd.Series([exact, ret_num, net_nodes, num_leaves, tree_child, len(net_cher)/2, len(net_ret_cher),
                          len(tree_set), n, alpha, beta, time.time() - st],
                         index=metadata_index,
                         dtype=float)
    output = {"net": net, "forest": tree_set, "metadata": metadata}
    with open(
            f"Data/Test/inst_results/tree_data{tree_info}_{net_num}.pickle", "wb") as handle:
        pickle.dump(output, handle)
    now = datetime.now().time()
    if exact:
        print(f"JOB {net_num} ({now}): FINISHED in {np.round(time.time() - st, 3)}s (In-Sample, L = {num_leaves}, "
              f"R = {ret_num}, n = {n})")
    else:
        print(f"JOB {net_num} ({now}): FINISHED in {np.round(time.time() - st, 3)}s (Out-of-Sample, L = {num_leaves}, "
              f"T = {num_trees}, n = {n})")
    return output


def is_tree_child(net):
    """Return False if some node's children all have out-degree 1, else True.

    Two situations make the check fail:
      * a node with out-degree 2 whose successors all have out-degree 1;
      * a node with out-degree 1 that has a successor with out-degree 1.

    NOTE(review): out-degree-1 nodes are presumably the reticulations of the
    network, which would make this the usual tree-child test -- confirm
    against the network generator.

    ``net`` must provide ``nodes``, ``out_degree(node)`` and
    ``successors(node)`` (e.g. a networkx ``DiGraph``).
    """
    def single_child(node):
        return net.out_degree(node) == 1

    for parent in net.nodes:
        fan_out = net.out_degree(parent)
        if fan_out == 2:
            # both children have out-degree 1 -> violation
            if all(single_child(child) for child in net.successors(parent)):
                return False
        elif fan_out == 1:
            # an out-degree-1 node directly above another one -> violation
            if any(single_child(child) for child in net.successors(parent)):
                return False
    return True


if __name__ == "__main__":
    # Command line: <net_num> <l> <exact flag (0/1)> <ret if exact else num_trees>
    net_num = int(sys.argv[1])
    l = int(sys.argv[2])
    exact = bool(int(sys.argv[3]))

    # The fourth argument is the reticulation number in exact mode and the
    # number of trees otherwise; the other of the two stays None.
    last_arg = int(sys.argv[4])
    ret, num_trees = (last_arg, None) if exact else (None, last_arg)

    make_data_fun(net_num, l, exact, ret, num_trees)
back to top