From 07cfe7ba673e7bb69d64784ec1a19b289981673e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Lerique?= Date: Thu, 16 Nov 2017 13:19:15 +0100 Subject: [PATCH] python3 compatibility and PEP8 conformity --- src/main.py | 143 +++++++++++++----------- src/node2vec.py | 287 +++++++++++++++++++++++++----------------------- 2 files changed, 224 insertions(+), 206 deletions(-) diff --git a/src/main.py b/src/main.py index 82ac735..4a64d3e 100644 --- a/src/main.py +++ b/src/main.py @@ -1,104 +1,115 @@ ''' -Reference implementation of node2vec. +Reference implementation of node2vec. Author: Aditya Grover For more details, refer to the paper: node2vec: Scalable Feature Learning for Networks -Aditya Grover and Jure Leskovec +Aditya Grover and Jure Leskovec Knowledge Discovery and Data Mining (KDD), 2016 ''' import argparse -import numpy as np import networkx as nx import node2vec from gensim.models import Word2Vec + def parse_args(): - ''' - Parses the node2vec arguments. - ''' - parser = argparse.ArgumentParser(description="Run node2vec.") + ''' + Parses the node2vec arguments. + ''' + parser = argparse.ArgumentParser(description="Run node2vec.") + + parser.add_argument('--input', nargs='?', default='graph/karate.edgelist', + help='Input graph path') - parser.add_argument('--input', nargs='?', default='graph/karate.edgelist', - help='Input graph path') + parser.add_argument('--output', nargs='?', default='emb/karate.emb', + help='Embeddings path') - parser.add_argument('--output', nargs='?', default='emb/karate.emb', - help='Embeddings path') + parser.add_argument('--dimensions', type=int, default=128, + help='Number of dimensions. Default is 128.') - parser.add_argument('--dimensions', type=int, default=128, - help='Number of dimensions. Default is 128.') + parser.add_argument('--walk-length', type=int, default=80, + help='Length of walk per source. Default is 80.') - parser.add_argument('--walk-length', type=int, default=80, - help='Length of walk per source. Default is 80.') + parser.add_argument('--num-walks', type=int, default=10, + help='Number of walks per source. Default is 10.') - parser.add_argument('--num-walks', type=int, default=10, - help='Number of walks per source. Default is 10.') + parser.add_argument('--window-size', type=int, default=10, + help='Context size for optimization. Default is 10.') - parser.add_argument('--window-size', type=int, default=10, - help='Context size for optimization. Default is 10.') + parser.add_argument('--iter', default=1, type=int, + help='Number of epochs in SGD') - parser.add_argument('--iter', default=1, type=int, - help='Number of epochs in SGD') + parser.add_argument('--workers', type=int, default=8, + help='Number of parallel workers. Default is 8.') - parser.add_argument('--workers', type=int, default=8, - help='Number of parallel workers. Default is 8.') + parser.add_argument('--p', type=float, default=1, + help='Return hyperparameter. Default is 1.') - parser.add_argument('--p', type=float, default=1, - help='Return hyperparameter. Default is 1.') + parser.add_argument('--q', type=float, default=1, + help='Inout hyperparameter. Default is 1.') - parser.add_argument('--q', type=float, default=1, - help='Inout hyperparameter. Default is 1.') + parser.add_argument('--weighted', dest='weighted', action='store_true', + help=('Boolean specifying (un)weighted. ' + 'Default is unweighted.')) + parser.add_argument('--unweighted', dest='unweighted', + action='store_false') + parser.set_defaults(weighted=False) - parser.add_argument('--weighted', dest='weighted', action='store_true', - help='Boolean specifying (un)weighted. Default is unweighted.') - parser.add_argument('--unweighted', dest='unweighted', action='store_false') - parser.set_defaults(weighted=False) + parser.add_argument('--directed', dest='directed', action='store_true', + help='Graph is (un)directed. Default is undirected.') + parser.add_argument('--undirected', dest='undirected', + action='store_false') + parser.set_defaults(directed=False) - parser.add_argument('--directed', dest='directed', action='store_true', - help='Graph is (un)directed. Default is undirected.') - parser.add_argument('--undirected', dest='undirected', action='store_false') - parser.set_defaults(directed=False) + return parser.parse_args() - return parser.parse_args() def read_graph(): - ''' - Reads the input network in networkx. - ''' - if args.weighted: - G = nx.read_edgelist(args.input, nodetype=int, data=(('weight',float),), create_using=nx.DiGraph()) - else: - G = nx.read_edgelist(args.input, nodetype=int, create_using=nx.DiGraph()) - for edge in G.edges(): - G[edge[0]][edge[1]]['weight'] = 1 + ''' + Reads the input network in networkx. + ''' + if args.weighted: + G = nx.read_edgelist(args.input, nodetype=int, + data=(('weight', float),), + create_using=nx.DiGraph()) + else: + G = nx.read_edgelist(args.input, nodetype=int, + create_using=nx.DiGraph()) + for edge in G.edges(): + G[edge[0]][edge[1]]['weight'] = 1 + + if not args.directed: + G = G.to_undirected() - if not args.directed: - G = G.to_undirected() + return G - return G def learn_embeddings(walks): - ''' - Learn embeddings by optimizing the Skipgram objective using SGD. - ''' - walks = [map(str, walk) for walk in walks] - model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, sg=1, workers=args.workers, iter=args.iter) - model.save_word2vec_format(args.output) - - return + ''' + Learn embeddings by optimizing the Skipgram objective using SGD. + ''' + walks = [list(map(str, walk)) for walk in walks] + model = Word2Vec(walks, size=args.dimensions, window=args.window_size, + min_count=0, sg=1, workers=args.workers, iter=args.iter) + model.wv.save_word2vec_format(args.output) + + return + def main(args): - ''' - Pipeline for representational learning for all nodes in a graph. - ''' - nx_G = read_graph() - G = node2vec.Graph(nx_G, args.directed, args.p, args.q) - G.preprocess_transition_probs() - walks = G.simulate_walks(args.num_walks, args.walk_length) - learn_embeddings(walks) + ''' + Pipeline for representational learning for all nodes in a graph. + ''' + nx_G = read_graph() + G = node2vec.Graph(nx_G, args.directed, args.p, args.q) + G.preprocess_transition_probs() + walks = G.simulate_walks(args.num_walks, args.walk_length) + learn_embeddings(walks) + if __name__ == "__main__": - args = parse_args() - main(args) + args = parse_args() + main(args) diff --git a/src/node2vec.py b/src/node2vec.py index 0293411..b092354 100644 --- a/src/node2vec.py +++ b/src/node2vec.py @@ -1,149 +1,156 @@ import numpy as np -import networkx as nx import random -class Graph(): - def __init__(self, nx_G, is_directed, p, q): - self.G = nx_G - self.is_directed = is_directed - self.p = p - self.q = q - - def node2vec_walk(self, walk_length, start_node): - ''' - Simulate a random walk starting from start node. - ''' - G = self.G - alias_nodes = self.alias_nodes - alias_edges = self.alias_edges - - walk = [start_node] - - while len(walk) < walk_length: - cur = walk[-1] - cur_nbrs = sorted(G.neighbors(cur)) - if len(cur_nbrs) > 0: - if len(walk) == 1: - walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])]) - else: - prev = walk[-2] - next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], - alias_edges[(prev, cur)][1])] - walk.append(next) - else: - break - - return walk - - def simulate_walks(self, num_walks, walk_length): - ''' - Repeatedly simulate random walks from each node. - ''' - G = self.G - walks = [] - nodes = list(G.nodes()) - print 'Walk iteration:' - for walk_iter in range(num_walks): - print str(walk_iter+1), '/', str(num_walks) - random.shuffle(nodes) - for node in nodes: - walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node)) - - return walks - - def get_alias_edge(self, src, dst): - ''' - Get the alias edge setup lists for a given edge. - ''' - G = self.G - p = self.p - q = self.q - - unnormalized_probs = [] - for dst_nbr in sorted(G.neighbors(dst)): - if dst_nbr == src: - unnormalized_probs.append(G[dst][dst_nbr]['weight']/p) - elif G.has_edge(dst_nbr, src): - unnormalized_probs.append(G[dst][dst_nbr]['weight']) - else: - unnormalized_probs.append(G[dst][dst_nbr]['weight']/q) - norm_const = sum(unnormalized_probs) - normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs] - - return alias_setup(normalized_probs) - - def preprocess_transition_probs(self): - ''' - Preprocessing of transition probabilities for guiding the random walks. - ''' - G = self.G - is_directed = self.is_directed - - alias_nodes = {} - for node in G.nodes(): - unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))] - norm_const = sum(unnormalized_probs) - normalized_probs = [float(u_prob)/norm_const for u_prob in unnormalized_probs] - alias_nodes[node] = alias_setup(normalized_probs) - - alias_edges = {} - triads = {} - - if is_directed: - for edge in G.edges(): - alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) - else: - for edge in G.edges(): - alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) - alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0]) - - self.alias_nodes = alias_nodes - self.alias_edges = alias_edges - - return +class Graph: + + def __init__(self, nx_G, is_directed, p, q): + self.G = nx_G + self.is_directed = is_directed + self.p = p + self.q = q + + def node2vec_walk(self, walk_length, start_node): + ''' + Simulate a random walk starting from start node. + ''' + G = self.G + alias_nodes = self.alias_nodes + alias_edges = self.alias_edges + + walk = [start_node] + + while len(walk) < walk_length: + cur = walk[-1] + cur_nbrs = sorted(G.neighbors(cur)) + if len(cur_nbrs) > 0: + if len(walk) == 1: + walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], + alias_nodes[cur][1])]) + else: + prev = walk[-2] + next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], + alias_edges[(prev, cur)][1])] + walk.append(next) + else: + break + + return walk + + def simulate_walks(self, num_walks, walk_length): + ''' + Repeatedly simulate random walks from each node. + ''' + G = self.G + walks = [] + nodes = list(G.nodes()) + print('Walk iteration:') + for walk_iter in range(num_walks): + print(str(walk_iter+1), '/', str(num_walks)) + random.shuffle(nodes) + for node in nodes: + walks.append(self.node2vec_walk(walk_length=walk_length, + start_node=node)) + + return walks + + def get_alias_edge(self, src, dst): + ''' + Get the alias edge setup lists for a given edge. + ''' + G = self.G + p = self.p + q = self.q + + unnormalized_probs = [] + for dst_nbr in sorted(G.neighbors(dst)): + if dst_nbr == src: + unnormalized_probs.append(G[dst][dst_nbr]['weight'] / p) + elif G.has_edge(dst_nbr, src): + unnormalized_probs.append(G[dst][dst_nbr]['weight']) + else: + unnormalized_probs.append(G[dst][dst_nbr]['weight'] / q) + norm_const = sum(unnormalized_probs) + normalized_probs = [float(u_prob) / norm_const + for u_prob in unnormalized_probs] + + return alias_setup(normalized_probs) + + def preprocess_transition_probs(self): + ''' + Preprocessing of transition probabilities for guiding the random walks. + ''' + G = self.G + is_directed = self.is_directed + + alias_nodes = {} + for node in G.nodes(): + unnormalized_probs = [G[node][nbr]['weight'] + for nbr in sorted(G.neighbors(node))] + norm_const = sum(unnormalized_probs) + normalized_probs = [float(u_prob) / norm_const + for u_prob in unnormalized_probs] + alias_nodes[node] = alias_setup(normalized_probs) + + alias_edges = {} + + if is_directed: + for edge in G.edges(): + alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) + else: + for edge in G.edges(): + alias_edges[edge] = self.get_alias_edge(edge[0], edge[1]) + alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], + edge[0]) + + self.alias_nodes = alias_nodes + self.alias_edges = alias_edges + + return def alias_setup(probs): - ''' - Compute utility lists for non-uniform sampling from discrete distributions. - Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/ - for details - ''' - K = len(probs) - q = np.zeros(K) - J = np.zeros(K, dtype=np.int) - - smaller = [] - larger = [] - for kk, prob in enumerate(probs): - q[kk] = K*prob - if q[kk] < 1.0: - smaller.append(kk) - else: - larger.append(kk) - - while len(smaller) > 0 and len(larger) > 0: - small = smaller.pop() - large = larger.pop() - - J[small] = large - q[large] = q[large] + q[small] - 1.0 - if q[large] < 1.0: - smaller.append(large) - else: - larger.append(large) - - return J, q + ''' + Compute utility lists for non-uniform sampling from discrete distributions. + Refer to + https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/ + for details + ''' + K = len(probs) + q = np.zeros(K) + J = np.zeros(K, dtype=np.int) + + smaller = [] + larger = [] + for kk, prob in enumerate(probs): + q[kk] = K*prob + if q[kk] < 1.0: + smaller.append(kk) + else: + larger.append(kk) + + while len(smaller) > 0 and len(larger) > 0: + small = smaller.pop() + large = larger.pop() + + J[small] = large + q[large] = q[large] + q[small] - 1.0 + if q[large] < 1.0: + smaller.append(large) + else: + larger.append(large) + + return J, q + def alias_draw(J, q): - ''' - Draw sample from a non-uniform discrete distribution using alias sampling. - ''' - K = len(J) - - kk = int(np.floor(np.random.rand()*K)) - if np.random.rand() < q[kk]: - return kk - else: - return J[kk] \ No newline at end of file + ''' + Draw sample from a non-uniform discrete distribution using alias sampling. + ''' + K = len(J) + + kk = int(np.floor(np.random.rand()*K)) + if np.random.rand() < q[kk]: + return kk + else: + return J[kk]