test_data_gen.py
from NetworkGen.NetworkToTree import *
from NetworkGen.LGT_network import *
from NetworkGen.tree_to_newick import *
from datetime import datetime
import pandas as pd
import numpy as np
import pickle
import time
import sys
'''
Code used for generating test instances
'''
def make_data_fun(net_num, l=20, exact=False, ret=None, num_trees=None):
# PARAMS OF LGT GENERATOR
beta = 1
distances = True
if exact:
tree_info = f"_L{l}_R{ret}_exact"
else:
tree_info = f"_L{l}_T{num_trees}"
now = datetime.now().time()
st = time.time()
# make network
network_gen_st = time.time()
if exact:
n = l - 2 + ret
trials_per_n = 50
print(f"JOB {net_num} ({now}): Start creating NETWORK (In-Sample, L = {l}, R = {ret}, n = {n})")
while True:
if l <= 20:
alpha = np.random.uniform(0.1, 0.5)
elif l <= 50:
alpha = np.random.uniform(0.1, 0.3)
else:
alpha = np.random.uniform(0.1, 0.2)
net, ret_num = simulation(n, alpha, 1, beta, ret)
num_leaves = len(leaves(net))
if num_leaves == l and ret_num == ret:
break
else:
if trials_per_n:
trials_per_n -= 1
else:
trials_per_n = 20
n += 1
print(f"JOB {net_num} ({now}): Start creating NETWORK (In-Sample, L = {l}, R = {ret}, n = {n})")
if time.time() - network_gen_st > 60*1:
print(f"JOB {net_num} ({now}): FAILED (In-Sample, L = {l}, R = {ret}, n = {n})")
return None
else:
# randomize reticulation!
min_ret = int(np.ceil(np.log2(num_trees)))
max_ret = int(min([5*np.ceil(np.log2(num_trees)), 60]))
ret = np.random.randint(min_ret, max_ret)
n = l - 2 + ret # preferably a reticulation number of at least 3 + minimum
print(min_ret, ret, max_ret)
trials_per_n = 20
print(f"JOB {net_num} ({now}): Start creating NETWORK (Out-of-Sample, L = {l}, T = {num_trees}, n = {n})")
while True:
alpha = np.random.uniform(0.3, 0.5)
net, ret_num = simulation(n, alpha, 1, beta, ret)
num_leaves = len(leaves(net))
print(ret, ret_num, num_leaves, alpha)
if num_leaves == l:
break
else:
if trials_per_n:
trials_per_n -= 1
else:
trials_per_n = 10
n -= 1
print(f"JOB {net_num} ({now}): Start creating NETWORK (Out-of-Sample, L = {l}, T = {num_trees}, n = {n})")
if time.time() - network_gen_st > 60*1:
print(f"JOB {net_num} ({now}): FAILED (Out-of-Sample, L = {l}, T = {num_trees})")
return None
net_nodes = int(len(net.nodes))
now = datetime.now().time()
if exact:
print(f"JOB {net_num} ({now}): Start creating TREE SET (L = {num_leaves}, T = {2**ret_num}, R = {ret_num})")
else:
print(f"JOB {net_num} ({now}): Start creating TREE SET (L = {num_leaves}, T = {num_trees}, R = {ret_num})")
tree_set, tree_lvs = net_to_tree(net, num_trees, distances=distances, net_lvs=num_leaves)
if num_trees is None:
num_trees = 2 ** ret_num
tree_to_newick_fun(tree_set, net_num, tree_info=tree_info)
tree_child = is_tree_child(net)
metadata_index = ["exact", "rets", "nodes", "net_leaves", "tree_child", "chers", "ret_chers", "trees", "n", "alpha",
"beta", "runtime"]
net_cher, net_ret_cher = network_cherries(net)
metadata = pd.Series([exact, ret_num, net_nodes, num_leaves, tree_child, len(net_cher)/2, len(net_ret_cher),
len(tree_set), n, alpha, beta, time.time() - st],
index=metadata_index,
dtype=float)
output = {"net": net, "forest": tree_set, "metadata": metadata}
with open(
f"Data/Test/inst_results/tree_data{tree_info}_{net_num}.pickle", "wb") as handle:
pickle.dump(output, handle)
now = datetime.now().time()
if exact:
print(f"JOB {net_num} ({now}): FINISHED in {np.round(time.time() - st, 3)}s (In-Sample, L = {num_leaves}, "
f"R = {ret_num}, n = {n})")
else:
print(f"JOB {net_num} ({now}): FINISHED in {np.round(time.time() - st, 3)}s (Out-of-Sample, L = {num_leaves}, "
f"T = {num_trees}, n = {n})")
return output
def is_tree_child(net):
for n in net.nodes:
if net.out_degree(n) == 2:
two_rets = []
for c in net.successors(n):
if net.out_degree(c) == 1:
two_rets.append(True)
else:
two_rets.append(False)
if all(two_rets):
return False
elif net.out_degree(n) == 1:
for c in net.successors(n):
if net.out_degree(c) == 1:
return False
return True
if __name__ == "__main__":
net_num = int(sys.argv[1])
l = int(sys.argv[2])
exact_input = int(sys.argv[3])
if exact_input:
exact = True
ret = int(sys.argv[4])
num_trees = None
else:
exact = False
ret = None
num_trees = int(sys.argv[4])
make_data_fun(net_num, l, exact, ret, num_trees)