Content - 7fe33cc639d5a86a8aa4d0a829bfb5dfbe71a4ba - c1af252/proNE.py

visit type:
https://github.com/THUDM/ProNE

05 April 2024, 19:07:15 UTC
Tip revision: 5c8a0eb804641307d1316672abf4d80b7bc53055 authored by lykeven on 04 August 2020, 14:33:04 UTC
add inner C++ function call
Tip revision: 5c8a0eb
proNE.py
# encoding=utf8
import numpy as np
import networkx as nx

import scipy.sparse
import scipy.sparse as sp
from scipy import linalg
from scipy.special import iv

from sklearn import preprocessing
from sklearn.utils.extmath import randomized_svd

import argparse
import time

from ctypes import *

class ProNE():
    def __init__(self, graph_file, emb_file1, emb_file2, dimension):
        self.graph = graph_file
        self.emb1 = emb_file1
        self.emb2 = emb_file2
        self.dimension = dimension

        self.G = nx.read_edgelist(self.graph, nodetype=int, create_using=nx.DiGraph())
        self.G = self.G.to_undirected()
        self.node_number = self.G.number_of_nodes()
        matrix0 = scipy.sparse.lil_matrix((self.node_number, self.node_number))

        for e in self.G.edges():
            if e[0] != e[1]:
                matrix0[e[0], e[1]] = 1
                matrix0[e[1], e[0]] = 1
        self.matrix0 = scipy.sparse.csr_matrix(matrix0)
        print(matrix0.shape)

    def get_embedding_rand(self, matrix):
        # Sparse randomized tSVD for fast embedding
        t1 = time.time()
        l = matrix.shape[0]
        smat = scipy.sparse.csc_matrix(matrix)  # convert to sparse CSC format
        print('svd sparse', smat.data.shape[0] * 1.0 / l ** 2)
        U, Sigma, VT = randomized_svd(smat, n_components=self.dimension, n_iter=5, random_state=None)
        U = U * np.sqrt(Sigma)
        U = preprocessing.normalize(U, "l2")
        print('sparsesvd time', time.time() - t1)
        return U

    def get_embedding_dense(self, matrix, dimension):
        # get dense embedding via SVD
        t1 = time.time()
        U, s, Vh = linalg.svd(matrix, full_matrices=False, check_finite=False, overwrite_a=True)
        U = np.array(U)
        U = U[:, :dimension]
        s = s[:dimension]
        s = np.sqrt(s)
        U = U * s
        U = preprocessing.normalize(U, "l2")
        print('densesvd time', time.time() - t1)
        return U

    def pre_factorization(self, tran, mask):
        # Network Embedding as Sparse Matrix Factorization
        t1 = time.time()
        l1 = 0.75
        C1 = preprocessing.normalize(tran, "l1")
        neg = np.array(C1.sum(axis=0))[0] ** l1

        neg = neg / neg.sum()

        neg = scipy.sparse.diags(neg, format="csr")
        neg = mask.dot(neg)
        print("neg", time.time() - t1)

        C1.data[C1.data <= 0] = 1
        neg.data[neg.data <= 0] = 1

        C1.data = np.log(C1.data)
        neg.data = np.log(neg.data)

        C1 -= neg
        F = C1
        features_matrix = self.get_embedding_rand(F)
        return features_matrix

    def chebyshev_gaussian(self, A, a, order=10, mu=0.5, s=0.5):
        # NE Enhancement via Spectral Propagation
        print('Chebyshev Series -----------------')
        t1 = time.time()

        if order == 1:
            return a

        A = sp.eye(self.node_number) + A
        DA = preprocessing.normalize(A, norm='l1')
        L = sp.eye(self.node_number) - DA

        M = L - mu * sp.eye(self.node_number)

        Lx0 = a
        Lx1 = M.dot(a)
        Lx1 = 0.5 * M.dot(Lx1) - a

        conv = iv(0, s) * Lx0
        conv -= 2 * iv(1, s) * Lx1
        for i in range(2, order):
            Lx2 = M.dot(Lx1)
            Lx2 = (M.dot(Lx2) - 2 * Lx1) - Lx0
            #         Lx2 = 2*L.dot(Lx1) - Lx0
            if i % 2 == 0:
                conv += 2 * iv(i, s) * Lx2
            else:
                conv -= 2 * iv(i, s) * Lx2
            Lx0 = Lx1
            Lx1 = Lx2
            del Lx2
            print('Bessell time', i, time.time() - t1)
        mm = A.dot(a - conv)
        emb = self.get_embedding_dense(mm, self.dimension)
        return emb


def parse_args():
    parser = argparse.ArgumentParser(description="Run ProNE.")
    parser.add_argument('-graph', nargs='?', default='data/blogcatalog.ungraph',
                        help='Graph path')
    parser.add_argument('-emb1', nargs='?', default='emb/blogcatalog.emb',
                        help='Output path of sparse embeddings')
    parser.add_argument('-emb2', nargs='?', default='emb/blogcatalog_enhanced.emb',
                        help='Output path of enhanced embeddings')
    parser.add_argument('-dimension', type=int, default=128,
                        help='Number of dimensions. Default is 128.')
    parser.add_argument('-step', type=int, default=10,
                        help='Step of recursion. Default is 10.')
    parser.add_argument('-theta', type=float, default=0.5,
                        help='Parameter of ProNE. Default is 0.5.')
    parser.add_argument('-mu', type=float, default=0.2,
                        help='Parameter of ProNE. Default is 0.2')
    parser.add_argument("-fast", action="store_false", help='Use C++ implementation to speed up')
    return parser.parse_args()


def main(args):
    t_0 = time.time()
    model = ProNE(args.graph, args.emb1, args.emb2, args.dimension)
    t_1 = time.time()

    features_matrix = model.pre_factorization(model.matrix0, model.matrix0)
    t_2 = time.time()

    embeddings_matrix = model.chebyshev_gaussian(model.matrix0, features_matrix, args.step, args.mu, args.theta)
    t_3 = time.time()


    print('---', model.node_number)
    print('total time', t_3 - t_0)
    print('sparse NE time', t_2 - t_1)
    print('spectral Pro time', t_3 - t_2)

    np.save(args.emb1, features_matrix)
    np.save(args.emb2, embeddings_matrix)

def main_c(args):
    args = parse_args()
    model = ProNE(args.graph, args.emb1, args.emb2, args.dimension)
    num_node = model.node_number
    num_thread = 20

    test = cdll.LoadLibrary("libprone.so")
    prone_main = test.main_f2
    prone_main.argtypes = [c_char_p, c_char_p, c_char_p, c_int, c_int, c_int, c_int, c_float, c_float]
    res = prone_main(args.graph.encode("utf-8"), args.emb1.encode("utf-8"), args.emb2.encode("utf-8"), num_node, args.step, num_thread, args.dimension, args.theta, args.mu)


if __name__ == '__main__':
    args = parse_args()
    if args.fast is True:
        main_c(args)
    else:
         main(args)