https://github.com/bxshi/ProjE
SimpleNN.py
import tensorflow as tf
import numpy as np
import math
import timeit

import DataReader

flags = tf.flags
logging = tf.logging

flags.DEFINE_string("dataset", 'fb15k', "Dataset, [fb15k|wn18]")
flags.DEFINE_integer("topk", 1, "relation Hits@topk, default 1.")
flags.DEFINE_integer("ent_topk", 10, "entity Hits@topk, default 10.")
flags.DEFINE_integer("batch", 500, "mini batch size, default 500.")
flags.DEFINE_integer('embed', 100, "embedding size, default 100.")
flags.DEFINE_integer('max_iter', 1000, 'max iteration, default 1000.')
flags.DEFINE_string("load", "", "load data from disk")
flags.DEFINE_float("e", 1e-8, "epsilon, default 1e-8.")
flags.DEFINE_float("beta1", 0.9, "beta1, default 0.9.")
flags.DEFINE_float("beta2", 0.999, "beta2, default 0.999.")
flags.DEFINE_float("lr", 0.001, "learning rate, default 0.001.")

# these features are ignored, but you still could try if you want.
flags.DEFINE_string("amie", "./fb15k_amie_rules.csv", "AMIE rule file, only contains Rule,Confidence,PCA.Confidence.")
flags.DEFINE_float("pca", 1.0, "PCA confidence threshold, default 1.0.")
flags.DEFINE_float("confidence", 0.7, "confidence threshold, default 0.8.")
flags.DEFINE_boolean("association", False, "use amie")

# following are the settings for different designs
flags.DEFINE_boolean("simple", True, "Use simple projection (weighted plus) or matrix projection.")
flags.DEFINE_float("entropy_weight", 1.0, "wrong class entropy weight")
flags.DEFINE_string("activation", "softmax", "activation of output layer, default is softmax. It can be sigmoid.")
flags.DEFINE_float("sampling", -1.0, "probability that a false class will not be selected.")

FLAGS = flags.FLAGS
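
# Example invocation (a sketch; assumes the FB15k files are under ./data/FB15k/ as main() expects):
#   python SimpleNN.py --dataset fb15k --embed 100 --batch 500 --max_iter 1000 --activation softmax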


class SimpleNN:
    """ Basic ProjE model

    This model combines two entities using a compositional matrix or weights and then project the combined embedding
    onto each relation space.
    """
    __initialized = False
    __simple_projection = False
    __trainable = []

    def __init__(self, k_embeddings, n_rel, n_ent, prefix="", simple=False):
        """ Initialize neural network with given parameters.
        :param k_embeddings: The size of embeddings (vector representations).
        :param n_rel: Number of relations.
        :param n_ent: Number of entities.
        :param prefix: Prefix prepended to variable names so that multiple models do not share variables.
        :param simple: Use simple weighted combination or matrix combination.
        :return: N/A
        """
        self.__k_embeddings = k_embeddings
        self.__n_rel = n_rel
        self.__n_ent = n_ent
        self.__simple_projection = simple

        bound = math.sqrt(6) / math.sqrt(k_embeddings)
        bound_proj = math.sqrt(6) / math.sqrt(k_embeddings * 2 + k_embeddings)
        bound_simple_proj = math.sqrt(6) / math.sqrt(k_embeddings * 2)
        bound_h3 = math.sqrt(6) / math.sqrt(k_embeddings)
        bound_bias = math.sqrt(6) / math.sqrt(n_rel)

        # Create embeddings
        with tf.device("/cpu:0"):
            self.__ent_embeddings = tf.get_variable(prefix + "ent_embeddings", [n_ent, k_embeddings],
                                                    initializer=tf.random_uniform_initializer(minval=-bound,
                                                                                              maxval=bound,
                                                                                              seed=250))
            # relation embeddings; these also serve as the relation projection layer
            self.__rel_embeddings = tf.get_variable(prefix + "rel_embeddings", [n_rel, k_embeddings],
                                                    initializer=tf.random_uniform_initializer(minval=-bound_h3,
                                                                                              maxval=bound_h3,
                                                                                              seed=255))

            if self.__simple_projection:
                # combination layer. This is a simple, weighted combination.
                self.__combination_layer = tf.get_variable(prefix + "nn_ent_combination_layer", [1, k_embeddings * 2],
                                                           initializer=tf.random_uniform_initializer(
                                                               minval=-bound_simple_proj, maxval=bound_simple_proj,
                                                               seed=233))
            else:
                # combination layer: combines the two entity embeddings through an
                # `unknown operator` defined by this learned matrix.
                self.__combination_layer = tf.get_variable(prefix + "nn_ent_combination_layer",
                                                           [k_embeddings * 2, k_embeddings],
                                                           initializer=tf.random_uniform_initializer(minval=-bound_proj,
                                                                                                     maxval=bound_proj,
                                                                                                     seed=283))
            # bias of combination layer
            self.__comb_bias = tf.get_variable(prefix + "comb_bias", [k_embeddings],
                                               initializer=tf.random_uniform_initializer(minval=-bound, maxval=bound,
                                                                                         seed=863))

            # bias of relation projection layer
            self.__bias = tf.get_variable(prefix + "nn_bias", [n_rel],
                                          initializer=tf.random_uniform_initializer(minval=-bound_bias,
                                                                                    maxval=bound_bias, seed=9876))

            self.__trainable.append(self.__ent_embeddings)
            self.__trainable.append(self.__rel_embeddings)
            self.__trainable.append(self.__combination_layer)
            self.__trainable.append(self.__bias)
            self.__trainable.append(self.__comb_bias)

    @property
    def ent_embedding(self):
        return self.__ent_embeddings

    @property
    def rel_embedding(self):
        return self.__rel_embeddings

    def __call__(self, inputs, scope=None):
        """Run NN with given inputs. This function will only return the result of this NN,
         it will not modify any parameters.
        :param inputs: a tensor with shape [BATCH_SIZE, 2], BATCH_SIZE can be any positive integer.
                        A row [1, 2] in `inputs` equals to the id of two entities. This NN will convert
                        them into a concatenation of two entity embeddings,
                        [1, (HEAD_NODE_EMBEDDING, TAIL_NODE_EMBEDDING)].
        :param scope: If there is only one NN in the program then this could be omitted. Otherwise each NN should have
                        a unique scope to make sure they do not share the same hidden layers.
        :return: a [1, n_rel] tensor, before use the output one should use `tf.softmax` to squash the
                  output to a [1, n_rel] tensor which has a sum of 1.0 and the value of each cell lies in [0,1].
        """
        with tf.variable_scope(scope or type(self).__name__) as scp:
            # After first execution in which all hidden layer variables are created, we reuse all variables.
            if self.__initialized:
                scp.reuse_variables()

            # convert entity id into embeddings
            x = tf.reshape(tf.nn.embedding_lookup(self.__ent_embeddings, inputs), [-1, self.__k_embeddings * 2])

            # relation projection layer; this is also the output layer, which transforms the
            # tensor to shape [BATCH_SIZE, n_rel].
            rel_layer = tf.transpose(self.__rel_embeddings)

            if self.__simple_projection:  # weighted combination
                weighted_embedding = x * self.__combination_layer
                head_embedding, tail_embedding = tf.split(1, 2, weighted_embedding)
                y = tf.nn.bias_add(tf.matmul(tf.tanh(head_embedding + tail_embedding), rel_layer), self.__bias)
            else:  # matrix combination
                tmp1 = tf.nn.bias_add(tf.matmul(x, self.__combination_layer), self.__comb_bias)
                y = tf.nn.bias_add(tf.matmul(tf.tanh(tmp1), rel_layer), self.__bias)

        return y
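

# Minimal usage sketch (hypothetical sizes; mirrors how run() wires the model below):
#   model = SimpleNN(k_embeddings=100, n_rel=1345, n_ent=14951, simple=True)
#   pairs = tf.placeholder(tf.int32, [None, 2])   # (head_id, tail_id) rows
#   scores = model(pairs)                         # [BATCH_SIZE, n_rel] raw scores
#   probs = tf.nn.softmax(scores)                 # relation distribution per pair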


def load_amie_rules(raw_data):
    """ Load association rules from the result of AMIE
    """
    rule_map = dict()

    with open(FLAGS.amie) as f:
        _ = f.readline()  # skip column titles
        for line in f:
            rule, confidence, pca_confidence = line.rstrip().split(',')
            confidence = float(confidence)
            pca_confidence = float(pca_confidence)

            # skip if this rule has a score lower than threshold
            if confidence < FLAGS.confidence or pca_confidence < FLAGS.pca:
                continue

            rules = [x.strip().split() for x in rule.split('=>')]
            assert len(rules) == 2

            # skip if this is not a simple len-1 rule
            if len(rules[0]) != 3 or len(rules[1]) != 3:
                continue

            # both are a->b relation
            if rules[0][0] == rules[1][0] and rules[0][2] == rules[1][2]:
                rule_id = raw_data.rel2id[rules[0][1]]
                if rule_id not in rule_map:
                    rule_map[rule_id] = set()
                rule_map[rule_id].add(raw_data.rel2id[rules[1][1]])
            else:  # this is a->b and b->a relation
                rule_id = raw_data.rel2id[rules[0][1]]
                if rule_id not in rule_map:
                    rule_map[rule_id] = set()
                rule_map[rule_id].add(-raw_data.rel2id[rules[1][1]])

    return rule_map


def gen_inputs(raw_data):
    """ Generate [[head, tail], ...] from raw input data
    """
    inputs = []
    for (head, tail, rel) in raw_data.train['path']:
        inputs.append([head, tail])
    return np.asarray(inputs)


def gen_targets(inputs, n_rel, raw_data, rule_map):
    """ Generate [[rel], ...] w.r.t. generated inputs
    """
    targets = np.zeros([len(inputs), n_rel], dtype=np.float32)
    weights = np.zeros([len(inputs), n_rel], dtype=np.float32)
    weights[:, :] = FLAGS.entropy_weight
    raw_nominator = np.zeros([len(inputs), n_rel], dtype=np.float32)
    raw_denominator = np.zeros([len(inputs)], dtype=np.float32)

    ht_input_map = dict()  # find all edge ids with head and tail info
    association_rule_tasks = list()
    for i in range(0, len(inputs)):
        head, tail = inputs[i]
        if head not in ht_input_map:
            ht_input_map[head] = dict()
        if tail not in ht_input_map[head]:
            ht_input_map[head][tail] = list()

        ht_input_map[head][tail].append(i)

        for rel in raw_data.train['adj'][head][tail]:
            raw_nominator[i][rel] = len(raw_data.hlmap[head][rel].union(raw_data.tlmap[tail][rel]))
            raw_denominator[i] += raw_nominator[i][rel]
            weights[i][rel] = 1.0

            if rel in rule_map:
                associated_rules = rule_map[rel]
                for associated_rule in associated_rules:
                    if associated_rule < 0:
                        association_rule_tasks.append([tail, head, rel, associated_rule])
                    else:
                        association_rule_tasks.append([head, tail, rel, associated_rule])

    print "find", len(association_rule_tasks), "potential association rule tasks"
    task_completed = 0

    for task in association_rule_tasks:
        head, tail, orig, rule = task
        try:
            for edge_id in ht_input_map[head][tail]:
                if raw_nominator[edge_id][abs(rule)] == 0:
                    try:
                        raw_nominator[edge_id][abs(rule)] += len(
                            raw_data.hlmap[head][abs(rule)].union(raw_data.tlmap[tail][abs(rule)]))
                    except KeyError:
                        if rule < 0:
                            raw_nominator[edge_id][abs(rule)] += len(
                                raw_data.hlmap[tail][abs(orig)].union(raw_data.tlmap[head][abs(orig)]))
                        else:
                            raw_nominator[edge_id][abs(rule)] += len(
                                raw_data.hlmap[head][abs(orig)].union(raw_data.tlmap[tail][abs(orig)]))

                    raw_denominator[edge_id] += raw_nominator[edge_id][abs(rule)]
                    task_completed += 1
                    weights[edge_id][abs(rule)] = 1.0
        except KeyError:
            continue

    print task_completed, "tasks are completed."

    if FLAGS.activation == 'softmax':
        for i in range(len(inputs)):
            targets[i] = raw_nominator[i] / raw_denominator[i]
    elif FLAGS.activation == 'sigmoid':
        for i in range(len(inputs)):
            targets[i] = np.minimum(raw_nominator[i], 1)

    else:
        print "activation is not a valid value, expected softmax|sigmoid, actual", FLAGS.activation
        exit(-1)

    # for i in range(0, len(inputs)):
    #     head, tail = inputs[i]
    #     nrel = sum(
    #         [len(raw_data.hlmap[head][l].union(raw_data.tlmap[tail][l])) for l in raw_data.train['adj'][head][tail]])
    #     test_sum = 0.
    #     for rel in raw_data.train['adj'][head][tail]:
    #         targets[i][rel] = float(len(raw_data.hlmap[head][rel].union(raw_data.tlmap[tail][rel]))) / float(nrel)
    #         test_sum += targets[i][rel]
    #     try:
    #         assert abs(test_sum - 1.) <= 1e-5
    #     except AssertionError:
    #         raise AssertionError("expect " + str(1.) + " actual " + str(test_sum))

    return np.asarray(targets), np.asarray(weights)


def gen_weights(targets):
    weights = np.zeros([len(targets), len(targets[0])], dtype=np.float32)

    for i in range(len(targets)):
        for j in range(len(targets[i])):
            weights[i][j] = 0.5 if targets[i][j] < 0.01 else 1.0

    return weights


def gen_filtered_rels(raw_data):
    """ Generate [[all rels], ...] w.r.t. generated test inputs, i-th element contains all the relations
    connect i-th entity pair in test inputs.
    """
    filtered_rels = []
    max_rel_offset = 0
    for p in raw_data.test['path']:
        head, tail, rel = p
        filtered_rel = set()
        try:
            for r in raw_data.test['adj'][head][tail]:
                filtered_rel.add(r)
        except KeyError:
            pass
        try:
            for r in raw_data.train['adj'][head][tail]:
                filtered_rel.add(r)
        except KeyError:
            pass
        try:
            for r in raw_data.valid['adj'][head][tail]:
                filtered_rel.add(r)
        except KeyError:
            pass
        max_rel_offset = max(max_rel_offset, len(filtered_rel))
        filtered_rels.append(filtered_rel)

    print "max rel offset", max_rel_offset
    return np.asarray(filtered_rels), max_rel_offset


def gen_filtered_tails(raw_data):
    """ Generate [[all tails], ] w.r.t generated test inputs, i-th element contains all the tails
    connect i-th head and rel in test inputs.
    """

    filtered_tails = []
    max_tail_offset = 0
    for p in raw_data.test['path']:
        head, tail, rel = p
        filtered_tail = set()
        try:
            for t in raw_data.hl_test_map[head][rel]:
                filtered_tail.add(t)
        except KeyError:
            pass
        try:
            for t in raw_data.hlmap[head][rel]:
                filtered_tail.add(t)
        except KeyError:
            pass
        try:
            for t in raw_data.hl_valid_map[head][rel]:
                filtered_tail.add(t)
        except KeyError:
            pass

        max_tail_offset = max(max_tail_offset, len(filtered_tail))
        filtered_tails.append(filtered_tail)

    print "max tail offset", max_tail_offset
    return np.asarray(filtered_tails), max_tail_offset


def run(raw_data):
    """ Construct operators for training and evaluating SimpleNN model.
    """
    model = SimpleNN(FLAGS.embed, raw_data.rel_id_max + 1, raw_data.entity_id_max + 1, simple=FLAGS.simple)

    with tf.device("/cpu:0"):
        ph_input = tf.placeholder(tf.int32, [None, 2])
        ph_target = tf.placeholder(tf.float32, [None, raw_data.rel_id_max + 1])
        ph_weight = tf.placeholder(tf.float32, [None, raw_data.rel_id_max + 1])
        y = model(ph_input)
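
        # The loss below is a weighted cross-entropy over relations: ph_target holds a soft label
        # distribution (softmax mode) or a multi-hot vector (sigmoid mode), and ph_weight scales
        # each class's contribution (negative classes carry FLAGS.entropy_weight).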

        if FLAGS.activation == 'softmax':
            loss = -tf.reduce_sum(ph_target * tf.log(tf.nn.softmax(y)) * ph_weight)
        elif FLAGS.activation == 'sigmoid':
            loss = -tf.reduce_sum(ph_target * tf.log(tf.sigmoid(y)) * ph_weight)
        else:
            print "activation is not a valid value, expected softmax|sigmoid, actual", FLAGS.activation
            exit(-1)

        optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.lr, beta1=FLAGS.beta1, beta2=FLAGS.beta2,
                                           epsilon=FLAGS.e)

        grads = optimizer.compute_gradients(loss, tf.trainable_variables())
        op_train = optimizer.apply_gradients(grads)

        if FLAGS.activation == 'softmax':
            op_test = tf.nn.softmax(y)
        elif FLAGS.activation == 'sigmoid':
            op_test = tf.sigmoid(y)
        else:
            print "activation is not a valid value, expected softmax|sigmoid, actual", FLAGS.activation
            exit(-1)

    return model, ph_input, ph_target, ph_weight, loss, op_train, op_test


def run_entity_eval(session, ph_input, op_test, raw_data, filtered_tails, k=10, max_offset=10):
    """ Evaluate tail (entity) prediction; returns (raw Hits@k, filtered Hits@k).
    """
    total = 0
    raw_hits = 0
    filtered_hits = 0

    inputs = np.zeros([raw_data.entity_id_max + 1, 2], dtype=np.int32)

    inputs[:, 1] = range(0, raw_data.entity_id_max + 1)  # put all tail candidates into inputs

    test_data = raw_data.test['path']

    with tf.device("/cpu:0"):
        ph_rel = tf.placeholder(tf.int32, shape=[1])
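        # op_test has shape [n_ent, n_rel] here (one score row per candidate tail); transposing it
        # and looking up ph_rel selects the column for the queried relation, i.e. a [1, n_ent]
        # score vector over all tail candidates for a fixed head.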
        op_entity_test = tf.reshape(tf.nn.embedding_lookup(tf.transpose(op_test), ph_rel),
                                    [1, raw_data.entity_id_max + 1])

    for (i, path) in enumerate(test_data):
        head, tail, rel = path
        inputs[:, 0] = head

        _, op_top_tails = tf.nn.top_k(op_entity_test, len(filtered_tails[i]) + k)
        top_tails = session.run(op_top_tails, {ph_input: inputs, ph_rel: [rel]})[0]

        total += 1
        idx = 0
        filtered_idx = 0
        while filtered_idx < k and idx < len(top_tails):
            if top_tails[idx] == tail:
                filtered_hits += 1
                break
            elif top_tails[idx] not in filtered_tails[i]:
                filtered_idx += 1
            idx += 1
        raw_hits += tail in top_tails[:k]

    return float(raw_hits) / float(total), float(filtered_hits) / float(total)


def run_relation_eval(session, ph_input, op_test, raw_data, filtered_rels, k=1, max_offset=11):
    """ Executes evaluation and returns accuracy score.
    """
    total = 0
    raw_hits = 0
    filtered_hits = 0
    rank = 0
    filtered_rank = 0

    inputs = raw_data.test['path'][:, 0:2]
    targets = raw_data.test['path'][:, 2]

    with tf.device("/cpu:0"):
        # _, top_rels = tf.nn.top_k(op_test, max_offset + k)
        _, top_rels = tf.nn.top_k(op_test, raw_data.rel_id_max + 1)
        top_rels = session.run(top_rels, {ph_input: inputs})

    for i in range(len(top_rels)):
        total += 1
        idx = 0
        filtered_idx = 0

        while idx < len(top_rels[i]):
            if top_rels[i][idx] == targets[i]:
                rank += idx + 1
                filtered_rank += filtered_idx + 1
                break
            elif top_rels[i][idx] not in filtered_rels[i]:
                filtered_idx += 1
            idx += 1

        idx = 0
        filtered_idx = 0

        while filtered_idx < k and idx < len(top_rels[i]):
            if top_rels[i][idx] == targets[i]:
                filtered_hits += 1
                break
            elif top_rels[i][idx] not in filtered_rels[i]:
                filtered_idx += 1
            idx += 1

        raw_hits += targets[i] in top_rels[i][:k]

    return float(raw_hits) / float(total), float(filtered_hits) / float(total), float(rank) / float(total), float(
        filtered_rank) / float(total)


def main(_):
    raw_data = DataReader.MetaPathData()

    if FLAGS.dataset == 'fb15k':
        raw_data.load_data('./data/FB15k/')
    else:
        print "unknown dataset"
        exit(-1)

    model, ph_input, ph_target, ph_weight, loss, op_train, op_test = run(raw_data)

    inputs = gen_inputs(raw_data)
    rule_map = load_amie_rules(raw_data) if FLAGS.association else dict()
    targets, weights = gen_targets(inputs, raw_data.rel_id_max + 1, raw_data, rule_map)
    # weights = gen_weights(targets)
    print "start filtered rel"
    filtered_rels, max_rel_offset = gen_filtered_rels(raw_data)
    print "end filtered rel"
    # filtered_tails, max_ent_offset = gen_filtered_tails(raw_data)

    best_raw_acc = 0.
    best_raw_iter = -1

    best_filtered_acc = 0.
    best_filtered_iter = -1

    best_mean_rank = 99999
    best_mean_rank_iter = -1
    best_filtered_rank = 99999
    best_filtered_rank_iter = -1

    if FLAGS.batch <= 0:
        FLAGS.batch = len(raw_data.train['path'])

    with tf.Session() as session:

        tf.initialize_all_variables().run()

        for it in range(FLAGS.max_iter):
            print "--- Iteration", it, "---"

            start_time = timeit.default_timer()

            new_order = range(0, len(inputs))
            np.random.shuffle(new_order)
            inputs = inputs[new_order, :]
            targets = targets[new_order, :]
            weights = weights[new_order, :]

            accu_loss = 0.

            start = 0
            while start < len(inputs):
                end = min(start + FLAGS.batch, len(inputs))
                tmp_weight = weights[start:end, :]
                if FLAGS.sampling > 0.:
                    tmp_weight = np.minimum(
                        tmp_weight + np.random.choice([0, 1], size=[len(tmp_weight), len(tmp_weight[0])], replace=True,
                                                    p=[1.0 - FLAGS.sampling, FLAGS.sampling]), 1)
                l, _ = session.run([loss, op_train], {ph_input: inputs[start:end, :],
                                                      ph_target: targets[start:end, :],
                                                      ph_weight: tmp_weight})
                accu_loss += l
                start = end

            print "\n\tloss:", accu_loss, "cost:", timeit.default_timer() - start_time, "seconds.\n"

            print "--- relation prediction ---"

            raw_rel_acc, filtered_rel_acc, raw_rank, filtered_rank = run_relation_eval(session, ph_input, op_test,
                                                                                       raw_data, filtered_rels,
                                                                                       k=FLAGS.topk,
                                                                                       max_offset=max_rel_offset)

            print "\n\traw", raw_rel_acc, "\t\tfiltered", filtered_rel_acc
            print "\n\traw", raw_rank, "\t\tfiltered", filtered_rank, "\n"

            if raw_rel_acc > best_raw_acc:
                best_raw_acc = raw_rel_acc
                best_raw_iter = it

            if filtered_rel_acc > best_filtered_acc:
                best_filtered_acc = filtered_rel_acc
                best_filtered_iter = it

                # rel_embeding = session.run(model.rel_embedding)
                # print "start writing relation embedding to disk..."
                # with open("./rel_embedding.csv", 'w+') as f_embed:
                #     for (i, rel_embed) in enumerate(rel_embeding):
                #         f_embed.write(str(raw_data.id2rel[i]) + ',')
                #         f_embed.write(",".join([str(x) for x in rel_embed]))
                #         f_embed.write("\n")
                # print "write done"

            if raw_rank < best_mean_rank:
                best_mean_rank = raw_rank
                best_mean_rank_iter = it

            if filtered_rank < best_filtered_rank:
                best_filtered_rank = filtered_rank
                best_filtered_rank_iter = it

            print "\tbest\n\traw", best_raw_acc, "(", best_raw_iter, ")", \
                "\tfiltered", best_filtered_acc, "(", best_filtered_iter, ")"
            print "\tbest\n\traw", best_mean_rank, "(", best_mean_rank_iter, ")", \
                "\tfiltered", best_filtered_rank, "(", best_filtered_rank_iter, ")", "\n"

            # print "--- entity (tail) prediction ---"
            #
            # raw_ent_acc, filtered_ent_acc = run_entity_eval(session, ph_input, op_test, raw_data, filtered_tails,
            #                                                 k=FLAGS.ent_topk, max_offset=max_ent_offset)
            #
            # print "\n\traw", raw_ent_acc, "\t\tfiltered", filtered_ent_acc, "\n"

            print "--------------------------"


if __name__ == '__main__':
    tf.app.run()