diff --git a/.gitignore b/.gitignore
index 98f12e98..3c79a046 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,109 +1,108 @@
-# Custom
-*.idea
-*.png
-*.pdf
-tmp/
-*.txt
-*swp*
-*.sw?
-gcn_back
-.DS_STORE
-*.aux
-*.log
-*.out
-*.bbl
-*.synctex.gz
-
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-env/
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-*.egg-info/
-.installed.cfg
-*.egg
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*,cover
-.hypothesis/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-target/
-
-# IPython Notebook
-.ipynb_checkpoints
-
-# pyenv
-.python-version
-
-# celery beat schedule file
-celerybeat-schedule
-
-# dotenv
-.env
-
-# virtualenv
-venv/
-ENV/
-
-# Spyder project settings
-.spyderproject
-
-# Rope project settings
-.ropeproject
-
-*.pickle
-*.pkl
-
+# Custom
+*.idea
+*.pdf
+tmp/
+*.txt
+*swp*
+*.sw?
+gcn_back
+.DS_STORE
+*.aux
+*.log
+*.out
+*.bbl
+*.synctex.gz
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+venv/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
+*.pickle
+*.pkl
+
diff --git a/graphsage/aggregators.py b/graphsage/aggregators.py
index 7dbd2523..fdf62f63 100644
--- a/graphsage/aggregators.py
+++ b/graphsage/aggregators.py
@@ -3,14 +3,16 @@
 from .layers import Layer, Dense
 from .inits import glorot, zeros
 
+
 class MeanAggregator(Layer):
     """
     Aggregates via mean followed by matmul and non-linearity.
""" + # mean聚合 def __init__(self, input_dim, output_dim, neigh_input_dim=None, - dropout=0., bias=False, act=tf.nn.relu, - name=None, concat=False, **kwargs): + dropout=0., bias=False, act=tf.nn.relu, + name=None, concat=False, **kwargs): super(MeanAggregator, self).__init__(**kwargs) self.dropout = dropout @@ -25,12 +27,12 @@ def __init__(self, input_dim, output_dim, neigh_input_dim=None, name = '/' + name else: name = '' - + # 权重矩阵设置对应伪代码中的W,这里自身节点和输入节点采用了不同的W,推测是为了防止过拟合 with tf.variable_scope(self.name + name + '_vars'): self.vars['neigh_weights'] = glorot([neigh_input_dim, output_dim], - name='neigh_weights') + name='neigh_weights') self.vars['self_weights'] = glorot([input_dim, output_dim], - name='self_weights') + name='self_weights') if self.bias: self.vars['bias'] = zeros([self.output_dim], name='bias') @@ -40,18 +42,21 @@ def __init__(self, input_dim, output_dim, neigh_input_dim=None, self.input_dim = input_dim self.output_dim = output_dim + # 输入维度[batchSize, numNeigh, numNeighDim],依次为batch大小,每一跳节点数量,节点特征数 def _call(self, inputs): + self_vecs, neigh_vecs = inputs - neigh_vecs = tf.nn.dropout(neigh_vecs, 1-self.dropout) - self_vecs = tf.nn.dropout(self_vecs, 1-self.dropout) + neigh_vecs = tf.nn.dropout(neigh_vecs, 1 - self.dropout) + self_vecs = tf.nn.dropout(self_vecs, 1 - self.dropout) + # 均值聚合后neigh_mean shape变为[batchSize,numNeighDim],原来每个batchSize的向量均值聚合为1个 neigh_means = tf.reduce_mean(neigh_vecs, axis=1) - - # [nodes] x [out_dim] + + # [nodes] x W,相乘后shape变为[batchSize, outputDim] from_neighs = tf.matmul(neigh_means, self.vars['neigh_weights']) from_self = tf.matmul(self_vecs, self.vars["self_weights"]) - + if not self.concat: output = tf.add_n([from_self, from_neighs]) else: @@ -60,9 +65,10 @@ def _call(self, inputs): # bias if self.bias: output += self.vars['bias'] - + return self.act(output) + class GCNAggregator(Layer): """ Aggregates via mean followed by matmul and non-linearity. 
 class GCNAggregator(Layer):
     """
     Aggregates via mean followed by matmul and non-linearity.
@@ -70,7 +76,7 @@ class GCNAggregator(Layer):
     """
     def __init__(self, input_dim, output_dim, neigh_input_dim=None,
-            dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs):
+                 dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs):
         super(GCNAggregator, self).__init__(**kwargs)
 
         self.dropout = dropout
@@ -88,7 +94,7 @@ def __init__(self, input_dim, output_dim, neigh_input_dim=None,
         with tf.variable_scope(self.name + name + '_vars'):
             self.vars['weights'] = glorot([neigh_input_dim, output_dim],
-                                           name='neigh_weights')
+                                          name='neigh_weights')
             if self.bias:
                 self.vars['bias'] = zeros([self.output_dim], name='bias')
@@ -101,26 +107,30 @@ def __init__(self, input_dim, output_dim, neigh_input_dim=None,
     def _call(self, inputs):
         self_vecs, neigh_vecs = inputs
 
-        neigh_vecs = tf.nn.dropout(neigh_vecs, 1-self.dropout)
-        self_vecs = tf.nn.dropout(self_vecs, 1-self.dropout)
-        means = tf.reduce_mean(tf.concat([neigh_vecs,
-                tf.expand_dims(self_vecs, axis=1)], axis=1), axis=1)
-
-        # [nodes] x [out_dim]
+        neigh_vecs = tf.nn.dropout(neigh_vecs, 1 - self.dropout)
+        self_vecs = tf.nn.dropout(self_vecs, 1 - self.dropout)
+        # Two steps happen here. First the self vectors, originally of shape
+        # [batchSize, numSelfDim], gain a dimension to become [batchSize, 1, numSelfDim];
+        # they are then concatenated with the neighbor vectors along axis 1, giving
+        # [batchSize, numNeigh + numSelf, numNeighDim]. A mean reduction over axis 1
+        # finally yields shape [batchSize, numNeighDim].
+        means = tf.reduce_mean(tf.concat([neigh_vecs,
+                                          tf.expand_dims(self_vecs, axis=1)], axis=1), axis=1)
+
+        # Since self and neighbors are already merged, a single weight matrix suffices;
+        # after [nodes] x W the shape is [batchSize, outputDim].
         output = tf.matmul(means, self.vars['weights'])
 
         # bias
         if self.bias:
             output += self.vars['bias']
-
+
         return self.act(output)
 
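The difference from `MeanAggregator` is that `GCNAggregator` folds the self vector into the neighbor set and applies one shared weight matrix. A minimal NumPy sketch with hypothetical sizes:

```python
import numpy as np

batch_size, num_neigh, in_dim, out_dim = 4, 10, 16, 8   # hypothetical sizes
self_vecs = np.random.rand(batch_size, in_dim)
neigh_vecs = np.random.rand(batch_size, num_neigh, in_dim)
W = np.random.rand(in_dim, out_dim)                     # single vars['weights']

union = np.concatenate([neigh_vecs, self_vecs[:, None, :]], axis=1)  # [B, N+1, D]
means = union.mean(axis=1)                                           # [B, D]
output = means @ W                                                   # [B, out_dim]
assert output.shape == (batch_size, out_dim)
```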
 
 class MaxPoolingAggregator(Layer):
     """ Aggregates via max-pooling over MLP functions.
     """
+
     def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=None,
-            dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs):
+                 dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs):
         super(MaxPoolingAggregator, self).__init__(**kwargs)
 
         self.dropout = dropout
@@ -143,18 +153,18 @@ def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=No
         self.mlp_layers = []
         self.mlp_layers.append(Dense(input_dim=neigh_input_dim,
-                                 output_dim=hidden_dim,
-                                 act=tf.nn.relu,
-                                 dropout=dropout,
-                                 sparse_inputs=False,
-                                 logging=self.logging))
+                                     output_dim=hidden_dim,
+                                     act=tf.nn.relu,
+                                     dropout=dropout,
+                                     sparse_inputs=False,
+                                     logging=self.logging))
 
         with tf.variable_scope(self.name + name + '_vars'):
             self.vars['neigh_weights'] = glorot([hidden_dim, output_dim],
-                                                 name='neigh_weights')
-
+                                                name='neigh_weights')
+
             self.vars['self_weights'] = glorot([input_dim, output_dim],
-                                                name='self_weights')
+                                               name='self_weights')
             if self.bias:
                 self.vars['bias'] = zeros([self.output_dim], name='bias')
@@ -173,16 +183,20 @@ def _call(self, inputs):
         batch_size = dims[0]
         num_neighbors = dims[1]
         # [nodes * sampled neighbors] x [hidden_dim]
+        # Flatten the neighbor tensor from 3-D to 2-D; the shape becomes
+        # [batch_size * num_neighbors, neigh_input_dim].
         h_reshaped = tf.reshape(neigh_h, (batch_size * num_neighbors, self.neigh_input_dim))
 
+        # Push the flattened neighbors through the pooling MLP, whose weights have shape
+        # [neigh_input_dim, hidden_dim]; the result is [batch_size * num_neighbors, hidden_dim].
         for l in self.mlp_layers:
             h_reshaped = l(h_reshaped)
+        # Restore the shape to [batch_size, num_neighbors, hidden_dim], then max-pool
+        # over the neighbor axis down to [batch_size, hidden_dim].
         neigh_h = tf.reshape(h_reshaped, (batch_size, num_neighbors, self.hidden_dim))
         neigh_h = tf.reduce_max(neigh_h, axis=1)
-
+
         from_neighs = tf.matmul(neigh_h, self.vars['neigh_weights'])
         from_self = tf.matmul(self_vecs, self.vars["self_weights"])
-
+
         if not self.concat:
             output = tf.add_n([from_self, from_neighs])
         else:
@@ -191,14 +205,16 @@ def _call(self, inputs):
         # bias
         if self.bias:
             output += self.vars['bias']
-
+
         return self.act(output)
 
+
+# Essentially the same as max pooling, except the vector set is aggregated by taking the mean.
 class MeanPoolingAggregator(Layer):
     """ Aggregates via mean-pooling over MLP functions.
     """
+
     def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=None,
-            dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs):
+                 dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs):
         super(MeanPoolingAggregator, self).__init__(**kwargs)
 
         self.dropout = dropout
@@ -221,18 +237,18 @@ def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=No
         self.mlp_layers = []
         self.mlp_layers.append(Dense(input_dim=neigh_input_dim,
-                                 output_dim=hidden_dim,
-                                 act=tf.nn.relu,
-                                 dropout=dropout,
-                                 sparse_inputs=False,
-                                 logging=self.logging))
+                                     output_dim=hidden_dim,
+                                     act=tf.nn.relu,
+                                     dropout=dropout,
+                                     sparse_inputs=False,
+                                     logging=self.logging))
 
         with tf.variable_scope(self.name + name + '_vars'):
             self.vars['neigh_weights'] = glorot([hidden_dim, output_dim],
-                                                 name='neigh_weights')
-
+                                                name='neigh_weights')
+
             self.vars['self_weights'] = glorot([input_dim, output_dim],
-                                                name='self_weights')
+                                               name='self_weights')
             if self.bias:
                 self.vars['bias'] = zeros([self.output_dim], name='bias')
@@ -251,16 +267,18 @@ def _call(self, inputs):
         batch_size = dims[0]
         num_neighbors = dims[1]
         # [nodes * sampled neighbors] x [hidden_dim]
-        h_reshaped = tf.reshape(neigh_h, (batch_size * num_neighbors, self.neigh_input_dim))
+        h_reshaped = tf.reshape(
+            neigh_h, (batch_size * num_neighbors, self.neigh_input_dim))
 
         for l in self.mlp_layers:
             h_reshaped = l(h_reshaped)
-        neigh_h = tf.reshape(h_reshaped, (batch_size, num_neighbors, self.hidden_dim))
+        neigh_h = tf.reshape(
+            h_reshaped, (batch_size, num_neighbors, self.hidden_dim))
         neigh_h = tf.reduce_mean(neigh_h, axis=1)
-
+
         from_neighs = tf.matmul(neigh_h, self.vars['neigh_weights'])
         from_self = tf.matmul(self_vecs, self.vars["self_weights"])
-
+
         if not self.concat:
             output = tf.add_n([from_self, from_neighs])
         else:
@@ -269,15 +287,16 @@ def _call(self, inputs):
         # bias
         if self.bias:
             output += self.vars['bias']
-
-        return self.act(output)
+        return self.act(output)
 
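Both pooling aggregators share the flatten -> MLP -> reduce pattern and differ only in the reduction. A minimal NumPy sketch with hypothetical sizes, where a one-layer ReLU network stands in for the `Dense` pooling layer:

```python
import numpy as np

B, N, D, H = 4, 10, 16, 32                        # hypothetical sizes
neigh_vecs = np.random.rand(B, N, D)
W_mlp, b_mlp = np.random.rand(D, H), np.zeros(H)  # stand-in for the Dense layer

h = neigh_vecs.reshape(B * N, D)                  # flatten to [B * N, D]
h = np.maximum(h @ W_mlp + b_mlp, 0.0)            # Dense + ReLU -> [B * N, H]
h = h.reshape(B, N, H)                            # restore [B, N, H]
neigh_h = h.max(axis=1)                           # MaxPooling; .mean(axis=1) for MeanPooling
assert neigh_h.shape == (B, H)
```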
""" + def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=None, - dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs): + dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs): super(TwoMaxLayerPoolingAggregator, self).__init__(**kwargs) self.dropout = dropout @@ -302,25 +321,24 @@ def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=No self.mlp_layers = [] self.mlp_layers.append(Dense(input_dim=neigh_input_dim, - output_dim=hidden_dim_1, - act=tf.nn.relu, - dropout=dropout, - sparse_inputs=False, - logging=self.logging)) + output_dim=hidden_dim_1, + act=tf.nn.relu, + dropout=dropout, + sparse_inputs=False, + logging=self.logging)) self.mlp_layers.append(Dense(input_dim=hidden_dim_1, - output_dim=hidden_dim_2, - act=tf.nn.relu, - dropout=dropout, - sparse_inputs=False, - logging=self.logging)) - + output_dim=hidden_dim_2, + act=tf.nn.relu, + dropout=dropout, + sparse_inputs=False, + logging=self.logging)) with tf.variable_scope(self.name + name + '_vars'): self.vars['neigh_weights'] = glorot([hidden_dim_2, output_dim], - name='neigh_weights') - + name='neigh_weights') + self.vars['self_weights'] = glorot([input_dim, output_dim], - name='self_weights') + name='self_weights') if self.bias: self.vars['bias'] = zeros([self.output_dim], name='bias') @@ -339,16 +357,18 @@ def _call(self, inputs): batch_size = dims[0] num_neighbors = dims[1] # [nodes * sampled neighbors] x [hidden_dim] - h_reshaped = tf.reshape(neigh_h, (batch_size * num_neighbors, self.neigh_input_dim)) + h_reshaped = tf.reshape( + neigh_h, (batch_size * num_neighbors, self.neigh_input_dim)) for l in self.mlp_layers: h_reshaped = l(h_reshaped) - neigh_h = tf.reshape(h_reshaped, (batch_size, num_neighbors, self.hidden_dim_2)) + neigh_h = tf.reshape( + h_reshaped, (batch_size, num_neighbors, self.hidden_dim_2)) neigh_h = tf.reduce_max(neigh_h, axis=1) - + from_neighs = tf.matmul(neigh_h, self.vars['neigh_weights']) from_self = tf.matmul(self_vecs, self.vars["self_weights"]) - + if not self.concat: output = tf.add_n([from_self, from_neighs]) else: @@ -357,14 +377,16 @@ def _call(self, inputs): # bias if self.bias: output += self.vars['bias'] - + return self.act(output) + class SeqAggregator(Layer): """ Aggregates via a standard LSTM. 
""" + def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=None, - dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs): + dropout=0., bias=False, act=tf.nn.relu, name=None, concat=False, **kwargs): super(SeqAggregator, self).__init__(**kwargs) self.dropout = dropout @@ -387,10 +409,10 @@ def __init__(self, input_dim, output_dim, model_size="small", neigh_input_dim=No with tf.variable_scope(self.name + name + '_vars'): self.vars['neigh_weights'] = glorot([hidden_dim, output_dim], - name='neigh_weights') - + name='neigh_weights') + self.vars['self_weights'] = glorot([input_dim, output_dim], - name='self_weights') + name='self_weights') if self.bias: self.vars['bias'] = zeros([self.output_dim], name='bias') @@ -408,33 +430,51 @@ def _call(self, inputs): dims = tf.shape(neigh_vecs) batch_size = dims[0] initial_state = self.cell.zero_state(batch_size, tf.float32) + # 将neigh_vecs 将向量每个维度值取为正数,然后进行最大值降维,降维后shape为[batch_size, num_neighbors] + # 再通过tf.sign转化为0,1的矩阵 used = tf.sign(tf.reduce_max(tf.abs(neigh_vecs), axis=2)) + # 在2维上将所有数进行求和降维及获得1维向量[batch_size],这样就获得了每个batch_size的序列值 length = tf.reduce_sum(used, axis=1) + # 为了防止某个batch_size序列值为0的情况,将其强行转为1,作为lstm的输入序列步长 length = tf.maximum(length, tf.constant(1.)) length = tf.cast(length, tf.int32) + # 进行lstm聚合rnn_outputs为结果值,rnn_states为最后一个单元的状态,这里用不上 + # 聚合后rnn_outputs shape为[batch_size,max_len,hidden_dim] + # lstm每个cell的输入为上个cell的状态c和上个cell的结果h与这一层的输入x相连的向量v + # lstm通过3个门控制每个cell的输出,3个门实际上就是3个sigmoid函数,每个门通过各自的权重矩阵控制输出的结果. + # 第一个门是遗忘门g1用来控制传入v中那些信息会被保留,第二个门g2输入门用来控制v哪些信息会被输入到下一个状态,第三个门g3用来控制v哪些信息会被输出 + # 每一层的c计算方式为 (c * g1) contact g2 ,每一层的输出h计算方式为 tanh((c * g1) contact g2) * g3 + # 传递至最后一个cell的输出即为整个输出的结果 with tf.variable_scope(self.name) as scope: try: rnn_outputs, rnn_states = tf.nn.dynamic_rnn( - self.cell, neigh_vecs, - initial_state=initial_state, dtype=tf.float32, time_major=False, - sequence_length=length) + self.cell, neigh_vecs, + initial_state=initial_state, dtype=tf.float32, time_major=False, + sequence_length=length) except ValueError: scope.reuse_variables() rnn_outputs, rnn_states = tf.nn.dynamic_rnn( - self.cell, neigh_vecs, - initial_state=initial_state, dtype=tf.float32, time_major=False, - sequence_length=length) + self.cell, neigh_vecs, + initial_state=initial_state, dtype=tf.float32, time_major=False, + sequence_length=length) batch_size = tf.shape(rnn_outputs)[0] max_len = tf.shape(rnn_outputs)[1] out_size = int(rnn_outputs.get_shape()[2]) + # 生成索引,生成规则为如下:1.先生成shape为[batch_size]的1维向量,具体为[1,2,3...batch_size-2,batch_size-1] + # 2.每个元素乘以max_len + # 3.将原来的lstm序列步长减1后再相加(数组从0开始) + # 该index的意义就是为了取每个batch的最后一个聚合结果,因为lstm为序列化的聚合,训练的结果是逐步从第一个 + # 传递至最后一个,获得最后一个batch的结果就相当于获得了这个batch全部的lstm聚合结果 index = tf.range(0, batch_size) * max_len + (length - 1) + # 将rnn_outputs shape变为[-1,hidden_dim],-1代表自适应降维,应该是batch_size*max_len flat = tf.reshape(rnn_outputs, [-1, out_size]) + # 根据索引将对应元素从flat取出来shape变为[index.length,hidden_dim], index.length = batch_size neigh_h = tf.gather(flat, index) from_neighs = tf.matmul(neigh_h, self.vars['neigh_weights']) from_self = tf.matmul(self_vecs, self.vars["self_weights"]) - + output = tf.add_n([from_self, from_neighs]) if not self.concat: @@ -445,6 +485,5 @@ def _call(self, inputs): # bias if self.bias: output += self.vars['bias'] - - return self.act(output) + return self.act(output) diff --git "a/graphsage/doc/aliyun\346\234\272\345\231\250\344\277\241\346\201\257.md" "b/graphsage/doc/aliyun\346\234\272\345\231\250\344\277\241\346\201\257.md" new file mode 
diff --git "a/graphsage/doc/aliyun\346\234\272\345\231\250\344\277\241\346\201\257.md" "b/graphsage/doc/aliyun\346\234\272\345\231\250\344\277\241\346\201\257.md"
new file mode 100644
index 00000000..2a330537
--- /dev/null
+++ "b/graphsage/doc/aliyun\346\234\272\345\231\250\344\277\241\346\201\257.md"
@@ -0,0 +1,55 @@
+## Aliyun machine info
+
+IP: sensitive information, not posted online
+
+mag240 raw data directory: `/mnt/ogb-dataset/mag240m/data/raw`
+
+```
+├── RELEASE_v1.txt
+├── mapping                // empty directory
+├── meta.pt
+├── processed
+│   ├── author___affiliated_with___institution
+│   │   └── edge_index.npy // author-institution edges, shape=[2, num_edges]
+│   ├── author___writes___paper
+│   │   └── edge_index.npy // author-paper edges, shape=[2, num_edges]
+│   ├── paper
+│   │   ├── node_feat.npy  // paper node features, shape=[num_node, 768]
+│   │   ├── node_label.npy // paper labels
+│   │   └── node_year.npy  // paper publication year
+│   └── paper___cites___paper
+│       └── edge_index.npy // paper citation edges, shape=[2, num_edges]
+├── raw                    // empty directory
+└── split_dict.pt          // train/valid/test split; loaded with torch it is a dict
+                           // with keys=['train', 'valid', 'test'] and node_index values
+```
+
+### Docker images
+
+#### opeceipeno/dgl:v1.4
+
+Runtime environment for the OGB code. The idea is to activate each approach's
+environment through a virtualenv; the environment for Google's mag240m code is ready.
+
+[GitHub page](https://github.com/deepmind/deepmind-research/tree/master/ogb_lsc/mag)
+
+```
+docker run --gpus all -it -v /mnt:/mnt opeceipeno/dgl:v1.4 bash
+# After the container starts, activate the environment for Google's code
+source /py3_venv/google_ogb_mag240m/bin/activate
+# The code lives under /workspace
+```
+
+Data directory after Google's preprocessing: `/mnt/ogb-dataset/mag240m/data/preprocessed`.
+This is equivalent to having run the `run_preprocessing.sh` script; the next step is to
+reproduce the experiments.
+
+#### opeceipeno/graphsage:gpu
+
+Environment for GraphSAGE, [GitHub page](https://github.com/qksidmx/GraphSAGE)
+
+```
+docker run --gpus all -it opeceipeno/graphsage:gpu bash
+# The code is under /notebook; see the readme for how to run the experiments
+```
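A quick sanity check of the layout above. This sketch assumes the container has `numpy` and `torch` installed and `/mnt` mounted as described:

```python
import numpy as np
import torch

root = "/mnt/ogb-dataset/mag240m/data/raw"
split = torch.load(root + "/split_dict.pt")   # dict: 'train'/'valid'/'test' -> node_index
cites = np.load(root + "/processed/paper___cites___paper/edge_index.npy",
                mmap_mode="r")                # shape [2, num_edges], memory-mapped
print(list(split.keys()), cites.shape)
```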
diff --git "a/graphsage/doc/dgl-\345\274\200\345\217\221\347\216\257\345\242\203-\347\274\226\350\257\221\346\265\201\347\250\213.md" "b/graphsage/doc/dgl-\345\274\200\345\217\221\347\216\257\345\242\203-\347\274\226\350\257\221\346\265\201\347\250\213.md"
new file mode 100644
index 00000000..214dffe9
--- /dev/null
+++ "b/graphsage/doc/dgl-\345\274\200\345\217\221\347\216\257\345\242\203-\347\274\226\350\257\221\346\265\201\347\250\213.md"
@@ -0,0 +1,38 @@
+### Image name: opeceipeno/dgl:devel-gpu
+
+#### Start the image
+
+```bash
+docker run --gpus all -ti opeceipeno/dgl:devel-gpu bash
+```
+
+#### Build and install dgl
+
+```bash
+# List the conda environments
+conda env list
+# Activate the PyTorch build environment
+conda activate pytorch-ci
+
+# The sources are in /workspace/dgl; build
+cd /workspace/dgl/build
+cmake -DUSE_CUDA=ON -DBUILD_TORCH=ON ..
+make -j4
+
+# Install the pip package
+cd ../python
+python setup.py install
+
+# Smoke test
+python -c "import dgl; print(dgl.__version__);import torch; print(torch.cuda.is_available())"
+
+# An official example script
+cd /workspace && python dgl_introduction-gpu.py
+```
diff --git a/graphsage/doc/lstm.png b/graphsage/doc/lstm.png
new file mode 100644
index 00000000..6a019413
Binary files /dev/null and b/graphsage/doc/lstm.png differ
diff --git a/graphsage/doc/preprocess.ipynb b/graphsage/doc/preprocess.ipynb
new file mode 100644
index 00000000..164c2615
--- /dev/null
+++ b/graphsage/doc/preprocess.ipynb
@@ -0,0 +1,446 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "graph.ipynb",
+      "provenance": [],
+      "collapsed_sections": []
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "source": [
+        ""
+      ],
+      "metadata": {
+        "id": "lFBIUQovI53M"
+      }
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 319
+        },
+        "id": "FXItxCYQ5xE2",
+        "outputId": "45d18b2d-50ad-4361-fb5c-1401166b8757"
+      },
+      "outputs": [
+        {
+          "output_type": "display_data",
+          "data": {
+            "image/png": "<base64-encoded matplotlib figure output omitted>"
ljiYxiGYZoUlvgYhmGYJoUlPoZhGKZJYYmPYRiGaVJY4mMYhmGaFJb4GIZhmCaFJT6GYRimSWGJj2EYhmlSWOJjGIZhmhSW+BiGYZgmhSU+hmEYpkn5f+++Zt8tkon8AAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": {} + } + ], + "source": [ + "# python -m graphsage.supervised_train --train_prefix ./example_data/toy-ppi --model graphsage_mean --sigmoid\n", + "# test networkx and visualization\n", + "import networkx as nx\n", + "import tensorflow as tf\n", + "tf.compat.v1.disable_eager_execution()\n", + "\n", + "G = nx.complete_graph(6)\n", + "nx.draw(G)" + ] + }, + { + "cell_type": "code", + "source": [ + "# download code and data\n", + "!git clone https://github.com/williamleif/GraphSAGE" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "S6709xbNrBok", + "outputId": "51e908ea-9105-4844-9427-b57a417bf9da" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Cloning into 'GraphSAGE'...\n", + "remote: Enumerating objects: 265, done.\u001b[K\n", + "remote: Counting objects: 100% (7/7), done.\u001b[K\n", + "remote: Compressing objects: 100% (7/7), done.\u001b[K\n", + "remote: Total 265 (delta 3), reused 0 (delta 0), pack-reused 258\u001b[K\n", + "Receiving objects: 100% (265/265), 6.43 MiB | 11.28 MiB/s, done.\n", + "Resolving deltas: 100% (160/160), done.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import json\n", + "from networkx.readwrite import json_graph\n", + "import os\n", + "import numpy as np\n", + "import sys\n", + "\n", + "CODE_ROOT = \"GraphSAGE/graphsage\"\n", + "sys.path.append(\"GraphSAGE\")\n", + "\n", + "def load_data():\n", + " data_path = 'GraphSAGE/example_data'\n", + " # DATA 1, 14755 nodes, 228431 links\n", + " G_data = json.load(open(data_path + '/toy-ppi-G.json'))\n", + " #G_data['nodes'] = G_data['nodes'][:100]\n", + " #G_data['links'] = G_data['links'][:100]\n", + " G = json_graph.node_link_graph(G_data)\n", + " \n", + " conversion = lambda n : n\n", + " lab_conversion = lambda n : n\n", + " \n", + " # DATA 2, (14755, 50) dtype('float64')\n", + " feats = np.load(data_path + '/toy-ppi-feats.npy')\n", + " \n", + " # DATA 3, {\"0\": 0, \"1\": 1}, len: 14755\n", + " # node ids to integer values indexing feature tensor\n", + " # 其实没什么用\n", + " id_map = json.load(open(data_path + \"/toy-ppi-id_map.json\"))\n", + " \n", + " # DATA 4, dict, len: 14755, column 121\n", + " # from node ids to class values (integer or list)\n", + " # 分类标签\n", + " class_map = json.load(open(data_path + \"/toy-ppi-class_map.json\"))\n", + " \n", + " broken_count = 0\n", + " for node in G.nodes():\n", + " if not 'val' in G.nodes()[node] or not 'test' in G.nodes()[node]:\n", + " G.remove_node(node)\n", + " broken_count += 1\n", + " print(\"Removed {:d} nodes that lacked proper annotations due to networkx versioning issues\".format(broken_count))\n", + " \n", + " # edge: (0, 800) 边\n", + " # G[0]: 某结点与所有的关联结点组成的边的集合\n", + " # 标记需要在训练中移除的关联关系,即边\n", + " for edge in G.edges():\n", + " if (G.nodes()[edge[0]]['val'] or G.nodes()[edge[1]]['val'] or\n", + " G.nodes()[edge[0]]['test'] or G.nodes()[edge[1]]['test']):\n", + " G[edge[0]][edge[1]]['train_removed'] = True\n", + " else:\n", + " G[edge[0]][edge[1]]['train_removed'] = False\n", + " \n", + " from sklearn.preprocessing import StandardScaler\n", + " \n", + " # 训练集的id集合,result only int, len: 9716\n", + " train_ids = np.array([id_map[str(n)] for n in G.nodes() \\\n", + " if not G.nodes()[n]['val'] and not G.nodes()[n]['test']])\n", + " \n", + " train_feats = feats[train_ids]\n", + " \n", + " # 特征缩放,标准化:z = (x - u) / s\n", + " # u is the mean of the training samples\n", + " # s is the standard deviation of the training 
samples\n", + " scaler = StandardScaler()\n", + " scaler.fit(train_feats)\n", + " feats = scaler.transform(feats)\n", + "\n", + " walks = []\n", + "\n", + " return G, feats, id_map, walks, class_map" + ], + "metadata": { + "id": "jg81uw5O6Gaz" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "def construct_placeholders(num_classes):\n", + " # Define placeholders\n", + " placeholders = {\n", + " 'labels' : tf.compat.v1.placeholder(tf.float32, shape=(None, num_classes), name='labels'),\n", + " 'dropout': tf.compat.v1.placeholder_with_default(0., shape=(), name='dropout'),\n", + " 'batch' : tf.compat.v1.placeholder(tf.int32, shape=(None), name='batch1'),\n", + " 'batch_size' : tf.compat.v1.placeholder(tf.int32, name='batch_size'),\n", + " }\n", + " return placeholders" + ], + "metadata": { + "id": "mReVSrV9UzYO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "train_data = load_data()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Rv1F4lYF_FfW", + "outputId": "7b9e88c6-8ba5-4128-9564-55a7d4cc835c" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Removed 0 nodes that lacked proper annotations due to networkx versioning issues\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "G = train_data[0]\n", + "features = train_data[1]\n", + "id_map = train_data[2]\n", + "context_pairs = train_data[3]\n", + "class_map = train_data[4]\n", + "\n", + "# num_classes = 121\n", + "num_classes = len(list(class_map.values())[0])\n", + "# pad with dummy zero vector, row wise\n", + "features = np.vstack([features, np.zeros((features.shape[1],))])\n", + "placeholders = construct_placeholders(num_classes)" + ], + "metadata": { + "id": "CjSUlil1kNZP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "class NodeMinibatchIterator(object):\n", + "\n", + " \"\"\"\n", + " This minibatch iterator iterates over nodes for supervised learning.\n", + "\n", + " G -- networkx graph\n", + " id2idx -- dict mapping node ids to integer values indexing feature tensor\n", + " placeholders -- standard tensorflow placeholders object for feeding\n", + " label_map -- map from node ids to class values (integer or list)\n", + " num_classes -- number of output classes\n", + " batch_size -- size of the minibatches\n", + " max_degree -- maximum size of the downsampled adjacency lists\n", + " 以toy-ppi数据集举例:\n", + " label_map为输出,维度为(14755, 121)\n", + " num_class为label_map的第二维,即121\n", + " \"\"\"\n", + " def __init__(self, G, id2idx,\n", + " placeholders, label_map, num_classes,\n", + " batch_size=100, max_degree=25,\n", + " **kwargs):\n", + "\n", + " self.G = G\n", + " self.nodes = G.nodes()\n", + " self.id2idx = id2idx\n", + " self.placeholders = placeholders\n", + " self.batch_size = batch_size\n", + " self.max_degree = max_degree\n", + " self.batch_num = 0\n", + " self.label_map = label_map\n", + " self.num_classes = num_classes\n", + "\n", + " self.adj, self.deg = self.construct_adj()\n", + " self.test_adj = self.construct_test_adj()\n", + "\n", + " self.val_nodes = [n for n in self.G.nodes() if self.G.nodes()[n]['val']]\n", + " self.test_nodes = [n for n in self.G.nodes() if self.G.nodes()[n]['test']]\n", + "\n", + " # 不参与训练的结点id\n", + " self.no_train_nodes_set = set(self.val_nodes + self.test_nodes)\n", + " # 可训练的结点id\n", + " self.train_nodes = 
set(G.nodes()).difference(self.no_train_nodes_set)\n", + " # don't train on nodes that only have edges to test set\n", + " # 只保留有邻居的结点\n", + " self.train_nodes = [n for n in self.train_nodes if self.deg[id2idx[str(n)]] > 0]\n", + "\n", + " def _make_label_vec(self, node):\n", + " label = self.label_map[node]\n", + " if isinstance(label, list):\n", + " label_vec = np.array(label)\n", + " else:\n", + " label_vec = np.zeros((self.num_classes))\n", + " class_ind = self.label_map[node]\n", + " label_vec[class_ind] = 1\n", + " return label_vec\n", + "\n", + " def construct_adj(self):\n", + " # adjacency shape: (14756, 128) ,用于存储所有节点的邻居节点id\n", + " adj = len(self.id2idx) * np.ones((len(self.id2idx)+1, self.max_degree))\n", + " # (14755,) ,用于存储所有结点的degree值\n", + " deg = np.zeros((len(self.id2idx),))\n", + "\n", + " for nodeid in self.G.nodes():\n", + " if self.G.nodes()[nodeid]['test'] or self.G.nodes()[nodeid]['val']:\n", + " continue\n", + "\n", + " # 获取所有训练集的邻居节点的id\n", + " neighbors = np.array([self.id2idx[str(neighbor)]\n", + " for neighbor in self.G.neighbors(nodeid)\n", + " if (not self.G[nodeid][neighbor]['train_removed'])])\n", + " \n", + " deg[self.id2idx[str(nodeid)]] = len(neighbors)\n", + " if len(neighbors) == 0:\n", + " continue\n", + " if len(neighbors) > self.max_degree:\n", + " neighbors = np.random.choice(neighbors, self.max_degree, replace=False)\n", + " elif len(neighbors) < self.max_degree:\n", + " neighbors = np.random.choice(neighbors, self.max_degree, replace=True)\n", + " adj[self.id2idx[str(nodeid)], :] = neighbors\n", + " return adj, deg\n", + "\n", + " def construct_test_adj(self):\n", + " adj = len(self.id2idx) * np.ones((len(self.id2idx)+1, self.max_degree))\n", + " for nodeid in self.G.nodes():\n", + " # 所有邻居节点的id,这里没有限制训练集或测试集\n", + " neighbors = np.array([self.id2idx[str(neighbor)]\n", + " for neighbor in self.G.neighbors(nodeid)])\n", + " if len(neighbors) == 0:\n", + " continue\n", + " if len(neighbors) > self.max_degree:\n", + " neighbors = np.random.choice(neighbors, self.max_degree, replace=False)\n", + " elif len(neighbors) < self.max_degree:\n", + " neighbors = np.random.choice(neighbors, self.max_degree, replace=True)\n", + " adj[self.id2idx[str(nodeid)], :] = neighbors\n", + " return adj\n", + "\n", + " def end(self):\n", + " return self.batch_num * self.batch_size >= len(self.train_nodes)\n", + "\n", + " def batch_feed_dict(self, batch_nodes, val=False):\n", + " batch1id = batch_nodes\n", + " batch1 = [self.id2idx[n] for n in batch1id]\n", + "\n", + " labels = np.vstack([self._make_label_vec(node) for node in batch1id])\n", + " feed_dict = dict()\n", + " feed_dict.update({self.placeholders['batch_size'] : len(batch1)})\n", + " feed_dict.update({self.placeholders['batch']: batch1})\n", + " feed_dict.update({self.placeholders['labels']: labels})\n", + "\n", + " return feed_dict, labels\n", + "\n", + " def node_val_feed_dict(self, size=None, test=False):\n", + " if test:\n", + " val_nodes = self.test_nodes\n", + " else:\n", + " val_nodes = self.val_nodes\n", + " if not size is None:\n", + " val_nodes = np.random.choice(val_nodes, size, replace=True)\n", + " # add a dummy neighbor\n", + " ret_val = self.batch_feed_dict(val_nodes)\n", + " return ret_val[0], ret_val[1]\n", + "\n", + " def incremental_node_val_feed_dict(self, size, iter_num, test=False):\n", + " if test:\n", + " val_nodes = self.test_nodes\n", + " else:\n", + " val_nodes = self.val_nodes\n", + " val_node_subset = val_nodes[iter_num*size:min((iter_num+1)*size,\n", + " len(val_nodes))]\n", + 
"\n", + " # add a dummy neighbor\n", + " ret_val = self.batch_feed_dict(val_node_subset)\n", + " return ret_val[0], ret_val[1], (iter_num+1)*size >= len(val_nodes), val_node_subset\n", + "\n", + " def num_training_batches(self):\n", + " return len(self.train_nodes) // self.batch_size + 1\n", + "\n", + " def next_minibatch_feed_dict(self):\n", + " start_idx = self.batch_num * self.batch_size\n", + " self.batch_num += 1\n", + " end_idx = min(start_idx + self.batch_size, len(self.train_nodes))\n", + " batch_nodes = self.train_nodes[start_idx : end_idx]\n", + " return self.batch_feed_dict(batch_nodes)\n", + "\n", + " def incremental_embed_feed_dict(self, size, iter_num):\n", + " node_list = self.nodes\n", + " val_nodes = node_list[iter_num*size:min((iter_num+1)*size,\n", + " len(node_list))]\n", + " return self.batch_feed_dict(val_nodes), (iter_num+1)*size >= len(node_list), val_nodes\n", + "\n", + " def shuffle(self):\n", + " \"\"\" Re-shuffle the training set.\n", + " Also reset the batch number.\n", + " \"\"\"\n", + " self.train_nodes = np.random.permutation(self.train_nodes)\n", + " self.batch_num = 0\n" + ], + "metadata": { + "id": "ZYCKM4i5PmPf" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "\"\"\"\n", + " This minibatch iterator iterates over nodes for supervised learning.\n", + "\n", + " G -- networkx graph\n", + " id2idx -- dict mapping node ids to integer values indexing feature tensor\n", + " placeholders -- standard tensorflow placeholders object for feeding\n", + " label_map -- map from node ids to class values (integer or list)\n", + " num_classes -- number of output classes\n", + " batch_size -- size of the minibatches\n", + " max_degree -- maximum size of the downsampled adjacency lists\n", + "\"\"\"\n", + "# 实例化 NodeMinibatch 迭代器\n", + "minibatch = NodeMinibatchIterator(G,\n", + " id_map,\n", + " placeholders,\n", + " class_map,\n", + " num_classes,\n", + " batch_size=512,\n", + " max_degree=128,\n", + " context_pairs = context_pairs)\n", + "\n", + "# adjacency shape: (14756, 128) 包装为placeholder\n", + "adj_info_ph = tf.compat.v1.placeholder(tf.int32, shape=minibatch.adj.shape)\n", + "adj_info = tf.Variable(adj_info_ph, trainable=False, name=\"adj_info\")\n", + "\n", + "# 接着就是构建模型了,需要改动的兼容代码过多,暂不继续了" + ], + "metadata": { + "id": "mUW98eVhQ5H7" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "" + ], + "metadata": { + "id": "Wp3DZreLrdtF" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git "a/graphsage/doc/\345\256\236\351\252\214\347\273\223\346\236\234.md" "b/graphsage/doc/\345\256\236\351\252\214\347\273\223\346\236\234.md" new file mode 100644 index 00000000..6d12a507 --- /dev/null +++ "b/graphsage/doc/\345\256\236\351\252\214\347\273\223\346\236\234.md" @@ -0,0 +1,98 @@ +### 有监督 + +```bash +python -m graphsage.supervised_train \ +--train_prefix ./ppi/ppi \ +--model graphsage_maxpool \ +--learning_rate 0.001 \ +--epochs 200 \ +--batch_size 1024 \ +--validate_iter 10 \ +--random_context False \ +--sigmoid \ +--gpu 0 + +Full validation stats: loss= 0.35714 f1_micro= 0.71096 f1_macro= 0.63525 time= 0.30733 +``` + + + +```bash +python -m graphsage.supervised_train \ +--train_prefix ./ppi/ppi \ +--model gcn \ +--print_every 50 \ +--epochs 200 \ +--batch_size 1024 \ +--learning_rate 0.001 \ +--validate_iter 10 \ +--random_context False \ +--sigmoid \ +--gpu 0 + + +Full validation stats: loss= 0.49029 f1_micro= 0.52777 f1_macro= 0.34071 
time= 0.25776 +``` + + + +```bash +python -m graphsage.supervised_train \ +--train_prefix ./ppi/ppi \ +--model graphsage_seq \ +--learning_rate 0.001 \ +--epochs 200 \ +--batch_size 1024 \ +--validate_iter 10 \ +--random_context False \ +--sigmoid \ +--gpu 0 + +Full validation stats: loss= 0.34045 f1_micro= 0.72752 f1_macro= 0.65601 time= 0.41571 +``` + + + +```bash +python -m graphsage.supervised_train \ +--train_prefix ./ppi/ppi \ +--model graphsage_mean \ +--learning_rate 0.001 \ +--epochs 200 \ +--batch_size 1024 \ +--validate_iter 10 \ +--random_context False \ +--sigmoid \ +--gpu 0 + +Full validation stats: loss= 0.42566 f1_micro= 0.60157 f1_macro= 0.47732 time= 0.30706 +``` + + + +### 无监督 + + + +```bash +python -m graphsage.unsupervised_train \ +--train_prefix ./ppi/ppi \ +--model graphsage_mean \ +--model_size big \ +--print_every 50 \ +--epoch 50 \ +--batch_size 1024 \ +--dropout 0.1 \ +--learning_rate 0.0001 \ +--validate_iter 10 \ +--random_context False +--gpu 0 + + +python eval_scripts/ppi_eval.py ./ppi ./unsup-ppi/graphsage_mean_big_0.000100 test + +F1-micro 0.761944872 +``` + + + diff --git "a/graphsage/doc/\350\212\202\347\202\271\346\233\264\346\226\260\346\265\201\347\250\213.png" "b/graphsage/doc/\350\212\202\347\202\271\346\233\264\346\226\260\346\265\201\347\250\213.png" new file mode 100644 index 00000000..807cc1f4 Binary files /dev/null and "b/graphsage/doc/\350\212\202\347\202\271\346\233\264\346\226\260\346\265\201\347\250\213.png" differ diff --git "a/graphsage/doc/\351\207\207\346\240\267\351\202\273\345\261\205\350\212\202\347\202\271\346\265\201\347\250\213.png" "b/graphsage/doc/\351\207\207\346\240\267\351\202\273\345\261\205\350\212\202\347\202\271\346\265\201\347\250\213.png" new file mode 100644 index 00000000..4a822afd Binary files /dev/null and "b/graphsage/doc/\351\207\207\346\240\267\351\202\273\345\261\205\350\212\202\347\202\271\346\265\201\347\250\213.png" differ diff --git a/graphsage/inits.py b/graphsage/inits.py index c3351494..a38880d5 100644 --- a/graphsage/inits.py +++ b/graphsage/inits.py @@ -8,23 +8,32 @@ def uniform(shape, scale=0.05, name=None): """Uniform init.""" + #根据shpae参数生成一个均匀分布值为minval和maxval之间的矩阵 initial = tf.random_uniform(shape, minval=-scale, maxval=scale, dtype=tf.float32) + #创建一个tf的节点变量 return tf.Variable(initial, name=name) def glorot(shape, name=None): """Glorot & Bengio (AISTATS 2010) init.""" + #获取矩阵元素的平方根 init_range = np.sqrt(6.0/(shape[0]+shape[1])) + #根据shpae参数生成一个均匀分布值为minval和maxval之间的矩阵,值为上面算出来的平方根 initial = tf.random_uniform(shape, minval=-init_range, maxval=init_range, dtype=tf.float32) + #创建一个tf的节点变量 return tf.Variable(initial, name=name) def zeros(shape, name=None): """All zeros.""" + #生成一个全是0的矩阵 initial = tf.zeros(shape, dtype=tf.float32) + #创建一个tf的节点变量 return tf.Variable(initial, name=name) def ones(shape, name=None): """All ones.""" + #生成一个全是1的矩阵 initial = tf.ones(shape, dtype=tf.float32) + #创建一个tf的节点变量 return tf.Variable(initial, name=name) diff --git a/graphsage/layers.py b/graphsage/layers.py index ca2496d9..d4bcb3c9 100644 --- a/graphsage/layers.py +++ b/graphsage/layers.py @@ -16,6 +16,7 @@ # global unique layer ID dictionary for layer name assignment _LAYER_UIDS = {} + def get_layer_uid(layer_name=''): """Helper function, assigns unique layer IDs.""" if layer_name not in _LAYER_UIDS: @@ -25,6 +26,7 @@ def get_layer_uid(layer_name=''): _LAYER_UIDS[layer_name] += 1 return _LAYER_UIDS[layer_name] + class Layer(object): """Base layer class. Defines basic API for all layer objects. 
Implementation inspired by keras (http://keras.io). @@ -37,6 +39,11 @@ class Layer(object): (i.e. takes input, returns output) __call__(inputs): Wrapper for _call() _log_vars(): Log all variables + + 最基础的层类型 + name:定义层的名称,字符型 + logging:布尔型,如果开的话就可以打印训练过程中,当需要查看一个张量在训练过程中值的分布情况时,可通过tf.summary.histogram()将其分布情况以直方图的形式在TensorBoard直方图仪表板上显示. + 这里并没有参数矩阵、激活函数等值,都是在其子类中实现 """ def __init__(self, **kwargs): @@ -53,6 +60,7 @@ def __init__(self, **kwargs): self.logging = logging self.sparse_inputs = False + # 用于定义该层的计算逻辑的函数,该类这里没计算逻辑,需要子类去实, Dense类就有一个实现 def _call(self, inputs): return inputs @@ -71,9 +79,24 @@ def _log_vars(self): class Dense(Layer): - """Dense layer.""" - def __init__(self, input_dim, output_dim, dropout=0., - act=tf.nn.relu, placeholders=None, bias=True, featureless=False, + """Dense layer. + 一个基础的全连接层 + + 需要的入参: + dropout :0-1的数字,表示输入被丢弃的概率 + act:激活函数的类型选取,例如sigmoid、relu等 + featureless:意义暂时不明,没有用到 + bias :偏置项 + input_dim:输入维度 + output_dim :输出维度 + + """ + + + + + def __init__(self, input_dim, output_dim, dropout=0., + act=tf.nn.relu, placeholders=None, bias=True, featureless=False, sparse_inputs=False, **kwargs): super(Dense, self).__init__(**kwargs) @@ -90,17 +113,22 @@ def __init__(self, input_dim, output_dim, dropout=0., if sparse_inputs: self.num_features_nonzero = placeholders['num_features_nonzero'] + + # 根据输入输出维度生成参数矩阵 with tf.variable_scope(self.name + '_vars'): self.vars['weights'] = tf.get_variable('weights', shape=(input_dim, output_dim), - dtype=tf.float32, - initializer=tf.contrib.layers.xavier_initializer(), - regularizer=tf.contrib.layers.l2_regularizer(FLAGS.weight_decay)) + dtype=tf.float32, + initializer=tf.contrib.layers.xavier_initializer(), + regularizer=tf.contrib.layers.l2_regularizer(FLAGS.weight_decay)) if self.bias: self.vars['bias'] = zeros([output_dim], name='bias') if self.logging: self._log_vars() + + #调用模型的时候的计算逻辑定义 + # 此全连接中, output = XW + b, (还有dropout和激活函数等操作) def _call(self, inputs): x = inputs diff --git a/graphsage/minibatch.py b/graphsage/minibatch.py index 0a1cd963..f9dd79f2 100644 --- a/graphsage/minibatch.py +++ b/graphsage/minibatch.py @@ -1,23 +1,24 @@ -from __future__ import division +from __future__ import division from __future__ import print_function import numpy as np np.random.seed(123) +#边迭代器,在无监督训练中使用 class EdgeMinibatchIterator(object): """ This minibatch iterator iterates over batches of sampled edges or random pairs of co-occuring edges. 
- G -- networkx graph - id2idx -- dict mapping node ids to index in feature tensor - placeholders -- tensorflow placeholders object - context_pairs -- if not none, then a list of co-occuring node pairs (from random walks) - batch_size -- size of the minibatches - max_degree -- maximum size of the downsampled adjacency lists - n2v_retrain -- signals that the iterator is being used to add new embeddings to a n2v model - fixed_n2v -- signals that the iterator is being used to retrain n2v with only existing nodes as context + G -- networkx graph 拓扑图 + id2idx -- dict mapping node ids to index in feature tensor 节点索引,映射toy-ppi-id_map.json文件 + placeholders -- tensorflow placeholders object tf的占位符 + context_pairs -- if not none, then a list of co-occuring node pairs (from random walks) 随机步数采用结果,采样流程见utils + batch_size -- size of the minibatches 批次大小 + max_degree -- maximum size of the downsampled adjacency lists 最大的度(邻居节点的数量) + n2v_retrain -- signals that the iterator is being used to add new embeddings to a n2v model 重新训练n2v模型的标识 + fixed_n2v -- signals that the iterator is being used to retrain n2v with only existing nodes as context 只用外部节点训练n2v模型标识 """ def __init__(self, G, id2idx, placeholders, context_pairs=None, batch_size=100, max_degree=25, @@ -30,18 +31,20 @@ def __init__(self, G, id2idx, self.placeholders = placeholders self.batch_size = batch_size self.max_degree = max_degree - self.batch_num = 0 + self.batch_num = 0 #批次数,初始化为0,训练过程中递增 - self.nodes = np.random.permutation(G.nodes()) - self.adj, self.deg = self.construct_adj() - self.test_adj = self.construct_test_adj() - if context_pairs is None: + self.nodes = np.random.permutation(G.nodes()) #节点乱序 + self.adj, self.deg = self.construct_adj() #adj矩阵:所有训练节点的取max_degree个邻居节点,deg矩阵:所有训练节点的有效邻居数 + self.test_adj = self.construct_test_adj() #所有节点(包含测试和验证节点)的adj矩阵 + if context_pairs is None: #若入参context_pairs不为空,取context_pairs作为训练边,否则取G图中所有边 edges = G.edges() else: edges = context_pairs self.train_edges = self.edges = np.random.permutation(edges) if not n2v_retrain: + #训练边剔除顶点不存在拓扑图,顶点有效邻居个数,顶点为test或val的边 self.train_edges = self._remove_isolated(self.train_edges) + #验证边取顶点为test或val的边 self.val_edges = [e for e in G.edges() if G[e[0]][e[1]]['train_removed']] else: if fixed_n2v: @@ -49,19 +52,22 @@ def __init__(self, G, id2idx, else: self.train_edges = self.val_edges = self.edges + #打印训练节点和测试节点的数量 print(len([n for n in G.nodes() if not G.node[n]['test'] and not G.node[n]['val']]), 'train nodes') print(len([n for n in G.nodes() if G.node[n]['test'] or G.node[n]['val']]), 'test nodes') self.val_set_size = len(self.val_edges) + #剔除顶点1为测试或训练节点的边 def _n2v_prune(self, edges): is_val = lambda n : self.G.node[n]["val"] or self.G.node[n]["test"] return [e for e in edges if not is_val(e[1])] + #剔除顶点不在G图中,顶点的有效邻居数为0且顶点不为test def _remove_isolated(self, edge_list): new_edge_list = [] missing = 0 for n1, n2 in edge_list: - if not n1 in self.G.node or not n2 in self.G.node: + if not n1 in self.G.node or not n2 in self.G.node: #顶点1或顶点2不在G图中 missing += 1 continue if (self.deg[self.id2idx[n1]] == 0 or self.deg[self.id2idx[n2]] == 0) \ @@ -73,26 +79,32 @@ def _remove_isolated(self, edge_list): print("Unexpected missing:", missing) return new_edge_list + #获取adj和deg两个矩阵,adj矩阵每行为当年节点的的指定数量邻居节点id,按ididx索引排列 + #deg为每个节点的训练邻居节点的个数 def construct_adj(self): + #adj初始化:一个节点总数+1行,max_degree列,初始化值全部为节点总数的二维矩阵 + #deg初始化:一个节点总数行的一维矩阵 adj = len(self.id2idx)*np.ones((len(self.id2idx)+1, self.max_degree)) deg = np.zeros((len(self.id2idx),)) - for nodeid in self.G.nodes(): - if 
self.G.node[nodeid]['test'] or self.G.node[nodeid]['val']: + for nodeid in self.G.nodes():#对全部节点循环 + if self.G.node[nodeid]['test'] or self.G.node[nodeid]['val']: #测试或验证节点直接跳过 continue neighbors = np.array([self.id2idx[neighbor] for neighbor in self.G.neighbors(nodeid) - if (not self.G[nodeid][neighbor]['train_removed'])]) - deg[self.id2idx[nodeid]] = len(neighbors) + if (not self.G[nodeid][neighbor]['train_removed'])])#取所有不为test和val的邻居节点 + deg[self.id2idx[nodeid]] = len(neighbors)#deg赋值为邻居个数 if len(neighbors) == 0: continue - if len(neighbors) > self.max_degree: + #取max_degree个邻居节点 + if len(neighbors) > self.max_degree: #邻居节点大于max_degree,无重复采样 neighbors = np.random.choice(neighbors, self.max_degree, replace=False) - elif len(neighbors) < self.max_degree: + elif len(neighbors) < self.max_degree:#邻居节点小于max_degree,有重复采样 neighbors = np.random.choice(neighbors, self.max_degree, replace=True) adj[self.id2idx[nodeid], :] = neighbors return adj, deg + #取所有节点的adj矩阵,方式与construct_adj相同 def construct_test_adj(self): adj = len(self.id2idx)*np.ones((len(self.id2idx)+1, self.max_degree)) for nodeid in self.G.nodes(): @@ -106,10 +118,11 @@ def construct_test_adj(self): neighbors = np.random.choice(neighbors, self.max_degree, replace=True) adj[self.id2idx[nodeid], :] = neighbors return adj - + #判断当前epoch是否结束 def end(self): return self.batch_num * self.batch_size >= len(self.train_edges) + #将边集合转为feed_dict,batch_size:集合总数,batch1:节点1集合,batch2:节点2集合 def batch_feed_dict(self, batch_edges): batch1 = [] batch2 = [] @@ -124,6 +137,7 @@ def batch_feed_dict(self, batch_edges): return feed_dict + #下一批次feed_dict def next_minibatch_feed_dict(self): start_idx = self.batch_num * self.batch_size self.batch_num += 1 @@ -131,9 +145,11 @@ def next_minibatch_feed_dict(self): batch_edges = self.train_edges[start_idx : end_idx] return self.batch_feed_dict(batch_edges) + #取当前是第几批次 def num_training_batches(self): return len(self.train_edges) // self.batch_size + 1 + #取指定大小的训练边的feed_dict,首次取数 def val_feed_dict(self, size=None): edge_list = self.val_edges if size is None: @@ -143,22 +159,22 @@ def val_feed_dict(self, size=None): val_edges = [edge_list[i] for i in ind[:min(size, len(ind))]] return self.batch_feed_dict(val_edges) + #下一批次的训练边的feed_dict def incremental_val_feed_dict(self, size, iter_num): edge_list = self.val_edges val_edges = edge_list[iter_num*size:min((iter_num+1)*size, len(edge_list))] return self.batch_feed_dict(val_edges), (iter_num+1)*size >= len(self.val_edges), val_edges - + #去下一批次的节点自己到自己组成的边,并转为feed_dict def incremental_embed_feed_dict(self, size, iter_num): node_list = self.nodes - val_nodes = node_list[iter_num*size:min((iter_num+1)*size, - len(node_list))] + val_nodes = node_list[iter_num*size:min((iter_num+1)*size,len(node_list))] val_edges = [(n,n) for n in val_nodes] return self.batch_feed_dict(val_edges), (iter_num+1)*size >= len(node_list), val_edges - + #将全量边分为训练边和验证边 def label_val(self): train_edges = [] - val_edges = [] + val_edges = [] for n1, n2 in self.G.edges(): if (self.G.node[n1]['val'] or self.G.node[n1]['test'] or self.G.node[n2]['val'] or self.G.node[n2]['test']): @@ -167,6 +183,7 @@ def label_val(self): train_edges.append((n1,n2)) return train_edges, val_edges + #洗牌 def shuffle(self): """ Re-shuffle the training set. Also reset the batch number. 
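Both iterators build their adjacency tables with the same fixed-degree downsampling rule annotated above: sample without replacement when a node has more than max_degree neighbors, with replacement when it has fewer, and leave the fill value for isolated nodes. A minimal NumPy sketch of that rule — function name and values are illustrative only, assuming the toy-ppi setup used in this experiment:

```python
import numpy as np

def downsample_neighbors(neighbors, max_degree, fill_value):
    # pad or truncate a neighbor list to exactly max_degree entries,
    # mirroring construct_adj above
    if len(neighbors) == 0:
        # isolated node: every slot keeps the fill value (= number of nodes),
        # which indexes the dummy all-zero feature row
        return np.full(max_degree, fill_value)
    if len(neighbors) > max_degree:
        # too many neighbors: subsample without replacement
        return np.random.choice(neighbors, max_degree, replace=False)
    if len(neighbors) < max_degree:
        # too few neighbors: oversample with replacement
        return np.random.choice(neighbors, max_degree, replace=True)
    return np.asarray(neighbors)

# every row of adj then has the same width, so neighbors can later be
# gathered with a single tf.nn.embedding_lookup
print(downsample_neighbors([3, 7, 42], max_degree=5, fill_value=14755))
```

Fixing the row width this way trades a little sampling bias (low-degree nodes repeat neighbors) for a dense, rectangular adjacency tensor that the TF samplers can slice cheaply.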
@@ -175,6 +192,7 @@ def shuffle(self): self.nodes = np.random.permutation(self.nodes) self.batch_num = 0 +#节点迭代器,在有监督训练中使用 class NodeMinibatchIterator(object): """ @@ -183,10 +201,13 @@ class NodeMinibatchIterator(object): G -- networkx graph id2idx -- dict mapping node ids to integer values indexing feature tensor placeholders -- standard tensorflow placeholders object for feeding - label_map -- map from node ids to class values (integer or list) - num_classes -- number of output classes + label_map -- map from node ids to class values (integer or list) 所有节点的类标数据,映射文件toy-ppi-map + num_classes -- number of output classes 每个类标数据的维度 batch_size -- size of the minibatches max_degree -- maximum size of the downsampled adjacency lists + 以toy-ppi数据集举例: + label_map为输出,维度为(14755, 121) + num_class为label_map的第二维,即121 """ def __init__(self, G, id2idx, placeholders, label_map, num_classes, @@ -203,17 +224,20 @@ def __init__(self, G, id2idx, self.label_map = label_map self.num_classes = num_classes + #adj:采样邻居矩阵,deg:训练邻居节点个数矩阵 self.adj, self.deg = self.construct_adj() self.test_adj = self.construct_test_adj() - + #验证节点集合,测试节点集合 self.val_nodes = [n for n in self.G.nodes() if self.G.node[n]['val']] self.test_nodes = [n for n in self.G.nodes() if self.G.node[n]['test']] - + #非训练节点集合,训练节点集合 self.no_train_nodes_set = set(self.val_nodes + self.test_nodes) self.train_nodes = set(G.nodes()).difference(self.no_train_nodes_set) # don't train on nodes that only have edges to test set + #剔除有效边为0的节点 self.train_nodes = [n for n in self.train_nodes if self.deg[id2idx[n]] > 0] + #若类标数据为list,转为一维矩阵;若为单数值,则创建一个全零矩阵,并将该数据位置位1 def _make_label_vec(self, node): label = self.label_map[node] if isinstance(label, list): @@ -225,15 +249,23 @@ def _make_label_vec(self, node): return label_vec def construct_adj(self): + # 一个numpy 2dim的数组,用于存储各个节点的邻接点,最多为max_degree个邻接点 + # adjacency shape: (14756, 128) adj = len(self.id2idx)*np.ones((len(self.id2idx)+1, self.max_degree)) + # (14755,) 用于存储所有节点的degree值 deg = np.zeros((len(self.id2idx),)) for nodeid in self.G.nodes(): + # 测试集合验证集的节点直接跳过 if self.G.node[nodeid]['test'] or self.G.node[nodeid]['val']: continue + + # 获取所有训练集中节点邻居节点的id neighbors = np.array([self.id2idx[neighbor] for neighbor in self.G.neighbors(nodeid) if (not self.G[nodeid][neighbor]['train_removed'])]) + + # 不足degree的邻接点补足degree,超过的随机选择degree个邻接点 deg[self.id2idx[nodeid]] = len(neighbors) if len(neighbors) == 0: continue @@ -244,9 +276,11 @@ def construct_adj(self): adj[self.id2idx[nodeid], :] = neighbors return adj, deg + #所有节点的adj矩阵,包含test和val节点 def construct_test_adj(self): adj = len(self.id2idx)*np.ones((len(self.id2idx)+1, self.max_degree)) for nodeid in self.G.nodes(): + # 所有邻接点的id,这里没有限制训练或测试集 neighbors = np.array([self.id2idx[neighbor] for neighbor in self.G.neighbors(nodeid)]) if len(neighbors) == 0: @@ -258,13 +292,16 @@ def construct_test_adj(self): adj[self.id2idx[nodeid], :] = neighbors return adj + #判断是否结束 def end(self): return self.batch_num * self.batch_size >= len(self.train_nodes) + #节点集合的feed_dict,batch_size:集合大小,batch:节点的index信息,labels:集合的类标数据 def batch_feed_dict(self, batch_nodes, val=False): batch1id = batch_nodes batch1 = [self.id2idx[n] for n in batch1id] - + + #按照batch_nodes的顺序,将batch_nodes的类标数据堆叠成2维数组 labels = np.vstack([self._make_label_vec(node) for node in batch1id]) feed_dict = dict() feed_dict.update({self.placeholders['batch_size'] : len(batch1)}) @@ -273,6 +310,7 @@ def batch_feed_dict(self, batch_nodes, val=False): return feed_dict, labels + #从测试节点或验证节点中取size个节点,获取节点list的feed_dict def 
node_val_feed_dict(self, size=None, test=False): if test: val_nodes = self.test_nodes @@ -284,21 +322,25 @@ def node_val_feed_dict(self, size=None, test=False): ret_val = self.batch_feed_dict(val_nodes) return ret_val[0], ret_val[1] + #取训练或验证节点的下一批次节点的feed_dict def incremental_node_val_feed_dict(self, size, iter_num, test=False): if test: val_nodes = self.test_nodes else: val_nodes = self.val_nodes - val_node_subset = val_nodes[iter_num*size:min((iter_num+1)*size, + val_node_subset = val_nodes[iter_num*size:min((iter_num+1)*size, len(val_nodes))] # add a dummy neighbor ret_val = self.batch_feed_dict(val_node_subset) return ret_val[0], ret_val[1], (iter_num+1)*size >= len(val_nodes), val_node_subset + + #返回当前是第几批次 def num_training_batches(self): return len(self.train_nodes) // self.batch_size + 1 + #训练节点的下一批次的feed_dict def next_minibatch_feed_dict(self): start_idx = self.batch_num * self.batch_size self.batch_num += 1 @@ -306,12 +348,14 @@ def next_minibatch_feed_dict(self): batch_nodes = self.train_nodes[start_idx : end_idx] return self.batch_feed_dict(batch_nodes) + #全量节点的下一批次节点feed_dict def incremental_embed_feed_dict(self, size, iter_num): node_list = self.nodes - val_nodes = node_list[iter_num*size:min((iter_num+1)*size, + val_nodes = node_list[iter_num*size:min((iter_num+1)*size, len(node_list))] return self.batch_feed_dict(val_nodes), (iter_num+1)*size >= len(node_list), val_nodes + #洗牌 def shuffle(self): """ Re-shuffle the training set. Also reset the batch number. diff --git a/graphsage/models.py b/graphsage/models.py index b3b9db45..7d5d84c0 100644 --- a/graphsage/models.py +++ b/graphsage/models.py @@ -17,6 +17,7 @@ # https://github.com/tkipf/gcn # which itself was very inspired by the keras package + class Model(object): def __init__(self, **kwargs): allowed_kwargs = {'name', 'logging', 'model_size'} @@ -30,8 +31,8 @@ def __init__(self, **kwargs): logging = kwargs.get('logging', False) self.logging = logging - self.vars = {} - self.placeholders = {} + self.vars = {} # 模型参数 + self.placeholders = {} # 预留的位置,存放输入数据 self.layers = [] self.activations = [] @@ -53,21 +54,23 @@ def build(self): self._build() # Build sequential layer model - self.activations.append(self.inputs) + self.activations.append(self.inputs) # 第一步,将输入数据加进激活层,作为第一层 for layer in self.layers: + # 逐层计算,并将每一层的输出都放进activations中保存 hidden = layer(self.activations[-1]) self.activations.append(hidden) - self.outputs = self.activations[-1] + self.outputs = self.activations[-1] # 模型的输出即为最后一层 # Store model variables for easy access - variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) + variables = tf.get_collection( + tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) # 获取全局的参数,并赋给vars self.vars = {var.name: var for var in variables} # Build metrics - self._loss() + self._loss() # 定义损失函数 self._accuracy() - self.opt_op = self.optimizer.minimize(self.loss) + self.opt_op = self.optimizer.minimize(self.loss) # 优化策略 def predict(self): pass @@ -78,14 +81,14 @@ def _loss(self): def _accuracy(self): raise NotImplementedError - def save(self, sess=None): + def save(self, sess=None): # 保存模型到本地文件 if not sess: raise AttributeError("TensorFlow session not provided.") saver = tf.train.Saver(self.vars) save_path = saver.save(sess, "tmp/%s.ckpt" % self.name) print("Model saved in file: %s" % save_path) - def load(self, sess=None): + def load(self, sess=None): # 从本地文件读取模型 if not sess: raise AttributeError("TensorFlow session not provided.") saver = tf.train.Saver(self.vars) @@ -94,8 +97,10 @@ def load(self, sess=None): 
print("Model restored from file: %s" % save_path) +# 多层感知机,是一个基础的深度模型 class MLP(Model): """ A standard multi-layer perceptron """ + def __init__(self, placeholders, dims, categorical=True, **kwargs): super(MLP, self).__init__(**kwargs) @@ -108,7 +113,8 @@ def __init__(self, placeholders, dims, categorical=True, **kwargs): self.inputs = placeholders['features'] self.labels = placeholders['labels'] - self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) + self.optimizer = tf.train.AdamOptimizer( + learning_rate=FLAGS.learning_rate) self.build() @@ -120,45 +126,49 @@ def _loss(self): # Cross entropy error if self.categorical: self.loss += metrics.masked_softmax_cross_entropy(self.outputs, self.placeholders['labels'], - self.placeholders['labels_mask']) + self.placeholders['labels_mask']) # L2 else: diff = self.labels - self.outputs - self.loss += tf.reduce_sum(tf.sqrt(tf.reduce_sum(diff * diff, axis=1))) + self.loss += tf.reduce_sum( + tf.sqrt(tf.reduce_sum(diff * diff, axis=1))) def _accuracy(self): if self.categorical: self.accuracy = metrics.masked_accuracy(self.outputs, self.placeholders['labels'], - self.placeholders['labels_mask']) + self.placeholders['labels_mask']) def _build(self): self.layers.append(layers.Dense(input_dim=self.input_dim, - output_dim=self.dims[1], - act=tf.nn.relu, - dropout=self.placeholders['dropout'], - sparse_inputs=False, - logging=self.logging)) + output_dim=self.dims[1], + act=tf.nn.relu, + dropout=self.placeholders['dropout'], + sparse_inputs=False, + logging=self.logging)) self.layers.append(layers.Dense(input_dim=self.dims[1], - output_dim=self.output_dim, - act=lambda x: x, - dropout=self.placeholders['dropout'], - logging=self.logging)) + output_dim=self.output_dim, + act=lambda x: x, + dropout=self.placeholders['dropout'], + logging=self.logging)) def predict(self): return tf.nn.softmax(self.outputs) + class GeneralizedModel(Model): """ Base class for models that aren't constructed from traditional, sequential layers. Subclasses must set self.outputs in _build method (Removes the layers idiom from build method of the Model class) + + GeneralizedModel 这个类相比于Model类,主要是删去了中间的序列模型层,该模型需要其子类自己去定义中间层的计算逻辑以及输出 + """ def __init__(self, **kwargs): super(GeneralizedModel, self).__init__(**kwargs) - def build(self): """ Wrapper for _build() """ @@ -166,7 +176,10 @@ def build(self): self._build() # Store model variables for easy access - variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) + # 和Model类相比,GeneralizedModel在build的时候,并没去生成序列层 + # self.output必须在它的子类build()函数中实现。 + variables = tf.get_collection( + tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) self.vars = {var.name: var for var in variables} # Build metrics @@ -175,14 +188,16 @@ def build(self): self.opt_op = self.optimizer.minimize(self.loss) -# SAGEInfo is a namedtuple that specifies the parameters + +# SAGEInfo is a namedtuple that specifies the parameters # of the recursive GraphSAGE layers SAGEInfo = namedtuple("SAGEInfo", - ['layer_name', # name of the layer (to get feature embedding etc.) - 'neigh_sampler', # callable neigh_sampler constructor - 'num_samples', - 'output_dim' # the output (i.e., hidden) dimension - ]) + ['layer_name', # name of the layer (to get feature embedding etc.) 
+ 'neigh_sampler', # callable neigh_sampler constructor + 'num_samples', + 'output_dim' # the output (i.e., hidden) dimension + ]) + class SampleAndAggregate(GeneralizedModel): """ @@ -190,9 +205,9 @@ class SampleAndAggregate(GeneralizedModel): """ def __init__(self, placeholders, features, adj, degrees, - layer_infos, concat=True, aggregator_type="mean", - model_size="small", identity_dim=0, - **kwargs): + layer_infos, concat=True, aggregator_type="mean", + model_size="small", identity_dim=0, + **kwargs): ''' Args: - placeholders: Stanford TensorFlow placeholder object. @@ -206,7 +221,18 @@ def __init__(self, placeholders, features, adj, degrees, - aggregator_type: how to aggregate neighbor information - model_size: one of "small" and "big" - identity_dim: Set to positive int to use identity features (slow and cannot generalize, but better accuracy) + + - features:节点特征 [num_nodes,num_features] + - adj: 图的邻接表, [num_nodes, maxdegree] maxdegree是个超参,表示对于每个节点,最多只记录其maxdegree个邻居信息 + - degrees:列表,表示每个节点的度数长度为[num_nodes] + - layer_infos:一个列表,记录了每一层的信息包括,名称、邻居采样器、 + - concat:是否在递归迭代期间拼接,是或者否 + - aggregator_type:聚合方式的定义 + - model_size:模型大小,有small 和big, 隐藏层的维度有区别 + - identity_dim:int,若>0则加入额外特征(速度慢且泛化性差,但准确度更高) ''' + + # 选择聚合器类型 super(SampleAndAggregate, self).__init__(**kwargs) if aggregator_type == "mean": self.aggregator_cls = MeanAggregator @@ -222,61 +248,108 @@ def __init__(self, placeholders, features, adj, degrees, raise Exception("Unknown aggregator: ", self.aggregator_cls) # get info from placeholders... + # batch1和batch2 是一条边的两个顶点id,即每条边的两个顶点,分别放进batch1和batch2中 + # 他们后续会分别作为模型的输入,得到中间表达结果output1和output2,然后在会用表达结果计算性能指标 self.inputs1 = placeholders["batch1"] self.inputs2 = placeholders["batch2"] + self.model_size = model_size self.adj_info = adj + + # 若identity_dim>0,则创建额外的嵌入特征,扩充到feature的列维度上 if identity_dim > 0: - self.embeds = tf.get_variable("node_embeddings", [adj.get_shape().as_list()[0], identity_dim]) + self.embeds = tf.get_variable( + "node_embeddings", [adj.get_shape().as_list()[0], identity_dim]) else: - self.embeds = None - if features is None: + self.embeds = None + if features is None: if identity_dim == 0: - raise Exception("Must have a positive value for identity feature dimension if no input features given.") + raise Exception( + "Must have a positive value for identity feature dimension if no input features given.") self.features = self.embeds else: - self.features = tf.Variable(tf.constant(features, dtype=tf.float32), trainable=False) + self.features = tf.Variable(tf.constant( + features, dtype=tf.float32), trainable=False) # 节点特征通过tf.Variable方式获取,不可训练 if not self.embeds is None: + # (feature的最终特征维度为 原始特征维度50+identity_dim) self.features = tf.concat([self.embeds, self.features], axis=1) + self.degrees = degrees - self.concat = concat + self.concat = concat # 布尔值,表示在模型计算完batch1和batch2的特征表达之后,是否拼接 - self.dims = [(0 if features is None else features.shape[1]) + identity_dim] - self.dims.extend([layer_infos[i].output_dim for i in range(len(layer_infos))]) + # dim是一个列表,代表aggregator每一层的输出维度,第一层是输入层,维度=输入特征的维度,后面的维度是从layer_info得到的 + # 本实验中,dims = [50,128,128] + + self.dims = [ + (0 if features is None else features.shape[1]) + identity_dim] + self.dims.extend( + [layer_infos[i].output_dim for i in range(len(layer_infos))]) self.batch_size = placeholders["batch_size"] self.placeholders = placeholders self.layer_infos = layer_infos - self.optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate) + # 优化器选择为adam方法,是当前最常用的梯度更新策略 + + self.optimizer = tf.train.AdamOptimizer( 
+            learning_rate=FLAGS.learning_rate)
+        # build the model
         self.build()

     def sample(self, inputs, layer_infos, batch_size=None):
         """ Sample neighbors to be the supportive fields for multi-layer convolutions.
+        For each input node, recursively sample neighbors hop by hop; the sampled
+        nodes form that node's support field.
+        Input:
+            inputs: one batch of node ids
+
+        Output:
+            samples is a list; each element is itself a list (of varying length)
+            holding all neighbor node ids sampled at that hop.
+            Example:
+                samples[0] has shape [batch_size,], i.e. the nodes themselves
+                samples[1] [layer_infos[1].num_samples * batch_size,]
+                samples[2] [layer_infos[1].num_samples * layer_infos[0].num_samples * batch_size,]
+                and so on
+
+            support_sizes holds the per-hop sample counts; it is a list of positive integers:
+                support_sizes[0] = 1, the initial state: the neighborhood is the node itself
+                support_sizes[1] = layer_infos[-1].num_samples * 1, i.e. 10 in this experiment
+                support_sizes[2] = layer_infos[-1].num_samples * layer_infos[-2].num_samples * 1,
+                                   i.e. 10 * 25 = 250 in this experiment
+                and so on, multiplying inward from the outermost hop's sample count
+
         Args:
             inputs: batch inputs
             batch_size: the number of inputs (different for batch inputs and negative samples).
         """
-
+
         if batch_size is None:
             batch_size = self.batch_size
-        samples = [inputs]
+        samples = [inputs]  # samples[0] is the input itself
         # size of convolution support at each layer per node
+
         support_size = 1
         support_sizes = [support_size]
-        for k in range(len(layer_infos)):
-            t = len(layer_infos) - k - 1
+
+        for k in range(len(layer_infos)):  # k is the hop index; k = 0, 1 in this experiment
+            t = len(layer_infos) - k - 1   # t = 1, 0
+
+            # each hop's neighbor count is the previous hop's node count times this
+            # layer's sample count — a running product
             support_size *= layer_infos[t].num_samples
-            sampler = layer_infos[t].neigh_sampler
+
+            sampler = layer_infos[t].neigh_sampler  # sampler selection
+
+            # the sampler takes two arguments: the node ids to be sampled from,
+            # and how many neighbors to draw per node
             node = sampler((samples[k], layer_infos[t].num_samples))
-            samples.append(tf.reshape(node, [support_size * batch_size,]))
+
+            # reshape to a 1-D array, then append to samples
+            samples.append(tf.reshape(node, [support_size * batch_size, ]))
+
+            # record each hop's sample count as well
             support_sizes.append(support_size)
         return samples, support_sizes
-
     def aggregate(self, samples, input_features, dims, num_samples, support_sizes, batch_size=None,
-            aggregators=None, name=None, concat=False, model_size="small"):
+                  aggregators=None, name=None, concat=False, model_size="small"):
         """ At each layer, aggregate hidden representations of neighbors to compute the hidden representations
            at next layer.
        Args:
@@ -290,49 +363,114 @@ def aggregate(self, samples, input_features, dims, num_samples, support_sizes, b
            batch_size: the number of inputs (different for batch inputs and negative samples).
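+        A shape walkthrough (assuming num_samples = [25, 10] as in this experiment,
+        see below): hidden starts as three lists (self, 1-hop, 2-hop features). At
+        layer 0 every hop h is aggregated with hop h+1 (h = 0, 1), leaving two lists;
+        layer 1 repeats this once more (h = 0), leaving only hidden[0], which is
+        returned as the final representation.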
Returns: The hidden representation at the final layer for all nodes in batch + + samples: 一个列表,里面存放的是邻居节点id, + sample[0]是初始节点,可以理解为第0跳邻居采样 (hop) + sample[1]是对sample[0]中每一个节点进行邻居采样,即第1跳采样 + sample[2]是对sample[1]中每一个节点进行邻居采样,即第2跳采样 + 以此类推 + input_features: 矩阵,存放的是全量的节点的特征 + + num_samples: 列表,表示模型每一层的邻居采样数目,实验中为[25,10] + + Returns: + + + + """ if batch_size is None: batch_size = self.batch_size # length: number of layers + 1 - hidden = [tf.nn.embedding_lookup(input_features, node_samples) for node_samples in samples] + # 遍历samples列表,根据每一个元素中存放的节点id,从全量的特征矩阵里获取所需的节点特征 + + hidden = [tf.nn.embedding_lookup( + input_features, node_samples) for node_samples in samples] + # hidden[0] [batch, num_features] + # hidden[1] [layer_infos[1].num_samples * batch_size, num_features] + # hidden[2] [layer_infos[1].num_samples * layer_infos[0].num_samples * batch_size, num_features] + # num_features表示的是特征维度,实验中为50 + + # 输入batch1的时候,该项为aggregators = None, 输入batch2或者neg_samples的时候,aggregators为batch1生成的aggregators + # 即他们用的是同一个聚合器 new_agg = aggregators is None + if new_agg: aggregators = [] - for layer in range(len(num_samples)): + for layer in range(len(num_samples)): # 按层数循环 if new_agg: dim_mult = 2 if concat and (layer != 0) else 1 # aggregator at current layer + # 根据给定的参数,初始化一个聚合器类, + # 其中,聚合器有多种选择,是由超参定义的, + # 另外需要的参数是输入维度、输出维度、dropout系数等等 + # 注意输入维度前面有个dim_mult,该值为1或者2,如果concat=True,表示节点自身的结果和邻居的会拼接一下,则从第二层开始,输入维度需要乘2 + # 判断是否是最后一层,如果是的话,会有个参数act=lambda x: x if layer == len(num_samples) - 1: - aggregator = self.aggregator_cls(dim_mult*dims[layer], dims[layer+1], act=lambda x : x, - dropout=self.placeholders['dropout'], - name=name, concat=concat, model_size=model_size) + aggregator = self.aggregator_cls(dim_mult*dims[layer], dims[layer+1], act=lambda x: x, + dropout=self.placeholders['dropout'], + name=name, concat=concat, model_size=model_size) else: aggregator = self.aggregator_cls(dim_mult*dims[layer], dims[layer+1], - dropout=self.placeholders['dropout'], - name=name, concat=concat, model_size=model_size) + dropout=self.placeholders['dropout'], + name=name, concat=concat, model_size=model_size) aggregators.append(aggregator) else: + # 在batch2或者neg_samples输入的时候,直接使用已有的聚合器 aggregator = aggregators[layer] + + # 本实验中,aggregator1 的输入输出维度分别为:50,256, 参数矩阵维度为50,128 ,后面有个拼接 + # aggregator2 的输入输出维度为:256,256,参数矩阵维度为256,128 + # hidden representation at current layer for all support nodes that are various hops away + # 该变量存放的是当前层,各节点利用邻居节点的信息更新后的中间表达 next_hidden = [] + # as layer increases, the number of support nodes needed decreases + # 随着层数增加,跳数需要减少 for hop in range(len(num_samples) - layer): dim_mult = 2 if concat and (layer != 0) else 1 - neigh_dims = [batch_size * support_sizes[hop], - num_samples[len(num_samples) - hop - 1], + + # 每个节点的特征,是由自身的特征和其邻居节点的特征聚合而来的, + # hidden[hop+1]包含了hidden[hop]中节点的所有邻居特征 + # 因为hidden[i]存放为二维,而mean_aggregator是需要将邻居节点特征平均, + # 因此需要将它reshape一下,方便在后面的处理中取所有邻居的均值 + # neigh_dims = [batch_size * 当前跳数的支持节点数,当前层的需要采样的邻居节点数,特征数] + # + neigh_dims = [batch_size * support_sizes[hop], + # 这个维度,对应sample函数里的 t = len(layer_infos) - k - 1 + num_samples[len(num_samples) - hop - 1], dim_mult*dims[layer]] h = aggregator((hidden[hop], tf.reshape(hidden[hop + 1], neigh_dims))) next_hidden.append(h) hidden = next_hidden + + # 输出的hidden[0],本实验中,shape=[batch_size,128*2] return hidden[0], aggregators def _build(self): + + # 将batch2 reshape一下,用作下一步采样的输入 labels = tf.reshape( - tf.cast(self.placeholders['batch2'], dtype=tf.int64), - [self.batch_size, 1]) + tf.cast(self.placeholders['batch2'], 
dtype=tf.int64),
+            [self.batch_size, 1])
+        """
+        tf.nn.fixed_unigram_candidate_sampler draws num_sampled classes at random from
+        [0, range_max).
+        The returned classes form a list; every element lies in [0, range_max) and
+        stands for one class.
+        The probability of sampling each class is controlled by the unigrams argument,
+        which can be an array of probabilities or an array of counts (a larger count
+        means a higher sampling probability).
+        range_max means sampling from [0, range_max); here it equals the node count,
+        so classes map directly onto node ids.
+
+        --------
+        In this experiment the function turns the node degrees into a probability
+        distribution and draws a batch of node ids from the node set; these are
+        treated as negative samples downstream.
+        The true_classes argument receives labels, but testing suggests the sampling
+        result does not actually depend on it, and positive samples can in fact be drawn.
+        The returned neg_samples is a list whose elements are node ids.
+
+        See https://github.com/williamleif/GraphSAGE/issues/76, where the authors
+        confirm positives can be sampled; they assume the probability is tiny as long
+        as the whole graph is much larger than the neighborhood computation graph.
+        """
+
        self.neg_samples, _, _ = (tf.nn.fixed_unigram_candidate_sampler(
            true_classes=labels,
            num_true=1,
@@ -342,69 +480,147 @@ def _build(self):
            distortion=0.75,
            unigrams=self.degrees.tolist()))

-        # perform "convolution"
+
+        # sample neighbor ids for the given node ids
+        # returns: samples, support_sizes
        samples1, support_sizes1 = self.sample(self.inputs1, self.layer_infos)
        samples2, support_sizes2 = self.sample(self.inputs2, self.layer_infos)
-        num_samples = [layer_info.num_samples for layer_info in self.layer_infos]
+
+        # per-layer neighbor sample counts; [25, 10] in this experiment
+
+        num_samples = [
+            layer_info.num_samples for layer_info in self.layer_infos]
+
+        # compute batch1's representation; the aggregators argument is None here, so a
+        # fresh set of aggregators is built and returned
        self.outputs1, self.aggregators = self.aggregate(samples1, [self.features], self.dims, num_samples,
-                support_sizes1, concat=self.concat, model_size=self.model_size)
+                                                         support_sizes1, concat=self.concat, model_size=self.model_size)
+
+        # compute batch2's representation, reusing the aggregators built in the previous
+        # step: aggregators=self.aggregators
        self.outputs2, _ = self.aggregate(samples2, [self.features], self.dims, num_samples,
-                support_sizes2, aggregators=self.aggregators, concat=self.concat,
-                model_size=self.model_size)
+                                          support_sizes2, aggregators=self.aggregators, concat=self.concat,
+                                          model_size=self.model_size)

+        # sample neighbors for the negative samples, same treatment as the positives above
        neg_samples, neg_support_sizes = self.sample(self.neg_samples, self.layer_infos,
-                FLAGS.neg_sample_size)
+                                                     FLAGS.neg_sample_size)
+
+        # compute the negatives' representations with the same aggregators; note that
+        # batch_size here is the number of negative samples, not the positive batch size
        self.neg_outputs, _ = self.aggregate(neg_samples, [self.features], self.dims, num_samples,
-                neg_support_sizes, batch_size=FLAGS.neg_sample_size, aggregators=self.aggregators,
-                concat=self.concat, model_size=self.model_size)
+                                             neg_support_sizes, batch_size=FLAGS.neg_sample_size, aggregators=self.aggregators,
+                                             concat=self.concat, model_size=self.model_size)

        dim_mult = 2 if self.concat else 1
+
+        # create the prediction layer; if bilinear_weights were True it would hold a
+        # trainable weight matrix used later when computing the loss
+        # here it is set to False, so there is no weight matrix: the layer is essentially
+        # just a loss calculator and does not affect the aggregator outputs above
        self.link_pred_layer = BipartiteEdgePredLayer(dim_mult*self.dims[-1],
-                dim_mult*self.dims[-1], self.placeholders, act=tf.nn.sigmoid,
-                bilinear_weights=False,
-                name='edge_predict')
+                                                      dim_mult*self.dims[-1], self.placeholders, act=tf.nn.sigmoid,
+                                                      bilinear_weights=False,
+                                                      name='edge_predict')

+        # L2-normalize the output samples; dim is 0 or 1, 1 meaning row-wise
+        # x_l2[i] = x[i] / sqrt(sum(x^2))
+        # corresponds to line 7 of Algorithm 1 in the paper
        self.outputs1 = tf.nn.l2_normalize(self.outputs1, 1)
        self.outputs2 = tf.nn.l2_normalize(self.outputs2, 1)
        self.neg_outputs = tf.nn.l2_normalize(self.neg_outputs, 1)

    def build(self):
+
+        # build the model outputs
        self._build()
        # TF graph management
+        # build the loss and the accuracy metric
        self._loss()
        self._accuracy()
+
+        # divide by the batch size to get the mean loss
        self.loss = self.loss / tf.cast(self.batch_size, tf.float32)
+
+        # compute gradients
        grads_and_vars = self.optimizer.compute_gradients(self.loss)
-        clipped_grads_and_vars = [(tf.clip_by_value(grad, -5.0, 5.0) if grad is not None else None, var)
-                for grad, var in grads_and_vars]
+
+        # gradient clipping: values above 5 are set to 5, values below -5 to -5
+        clipped_grads_and_vars = [(tf.clip_by_value(grad, -5.0, 5.0) if grad is not None else None, var)
+                                  for grad, var in grads_and_vars]
+
+        # each element of clipped_grads_and_vars is a (grad, var) tuple of gradient and
+        # variable; only the gradient is taken here
        self.grad, _ = clipped_grads_and_vars[0]
+
+        # update the model parameters with the clipped gradients
        self.opt_op = self.optimizer.apply_gradients(clipped_grads_and_vars)

    def _loss(self):
+
+        # L2 regularization of the parameters
+        # output = sum(t ** 2) / 2
        for aggregator in self.aggregators:
            for var in aggregator.vars.values():
                self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var)

-        self.loss += self.link_pred_layer.loss(self.outputs1, self.outputs2, self.neg_outputs)
+        # compute the loss with the prediction layer created earlier; there are three
+        # options: _xent_loss, _skipgram_loss and _hinge_loss; the paper uses the first
+        self.loss += self.link_pred_layer.loss(
+            self.outputs1, self.outputs2, self.neg_outputs)
        tf.summary.scalar('loss', self.loss)

    def _accuracy(self):
-        # shape: [batch_size]
+        """
+        Compute the performance metric.
+        The model has produced three groups of embeddings: the two endpoints of each
+        edge (batch1 and batch2) and the sampled negatives (neg_samples).
+        The core idea: the similarity between an edge's endpoints should exceed the
+        similarity between either endpoint and all of the negative samples.
+        (1) compute the "affinity" of the positive pairs
+        (2) compute the affinity between each node and the negatives
+        (3) concatenate the two, giving an array of shape [batch_size, neg_sample_size + 1]:
+            each node's affinity to the negatives and to its positive partner
+        (4) compute the rank of the positive pair's affinity; the higher it ranks, the better
+        """
+
+        # (1) affinity of the positive pairs
+        # here aff is the element-wise product of the two inputs, summed per row
+        # shape: [batch_size,] — the affinity between each node in the batch and its
+        # neighbor; larger means more similar
        aff = self.link_pred_layer.affinity(self.outputs1, self.outputs2)
-        # shape : [batch_size x num_neg_samples]
-        self.neg_aff = self.link_pred_layer.neg_cost(self.outputs1, self.neg_outputs)
-        self.neg_aff = tf.reshape(self.neg_aff, [self.batch_size, FLAGS.neg_sample_size])
+
+        # (2) affinity between each node and the negative samples
+        # returns a matrix of shape [batch_size, num_neg_samples]:
+        # each node's affinity to every negative sample
+        self.neg_aff = self.link_pred_layer.neg_cost(
+            self.outputs1, self.neg_outputs)
+
+        self.neg_aff = tf.reshape(
+            self.neg_aff, [self.batch_size, FLAGS.neg_sample_size])
+
+        # (3) concatenate the two groups, giving [batch_size, neg_sample_size + 1]:
+        # each node's affinity to the negatives and to its positive partner
+        # shape: [batch_size, 1]
        _aff = tf.expand_dims(aff, axis=1)
+        # shape: [batch_size, num_neg_samples + 1]
        self.aff_all = tf.concat(axis=1, values=[self.neg_aff, _aff])
        size = tf.shape(self.aff_all)[1]
+
+        # (4) use top_k twice to obtain the rank of the positive pair's affinity
+        # tf.nn.top_k returns the k largest values of the input and their indices
+        # applying top_k twice yields the ranks
+        # self.ranks holds each node's affinity rank over negatives + positive,
+        # shape: [batch_size, neg_sample_size + 1]
+        # example:
+        # result = tf.constant([0.5, 0.9, 0.3, 0.4, 0.6, 0.8, 0.7, 0.1])
+        # _, indices_of_ranks = tf.nn.top_k(result, k=len(result))
+        # indices_of_ranks.numpy() : [1, 5, 6, 4, 0, 3, 2, 7] — read in order: the
+        #     largest value sits at index 1, the second largest at index 5, ...
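+        # taking top_k of the NEGATED indices then inverts that permutation: it
+        # recovers, for every element, its position in the descending order, i.e.
+        # its rank (0 = largest):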
+ # _, ranks = tf.nn.top_k(-indices_of_ranks, k=len(result)) + # ranks.numpy() : [4, 0, 6, 5, 3, 1, 2, 7] 这里得到的就是result中每个元素的排名序号,0表示最大,以此类推 + # _, indices_of_ranks = tf.nn.top_k(self.aff_all, k=size) _, self.ranks = tf.nn.top_k(-indices_of_ranks, k=size) - self.mrr = tf.reduce_mean(tf.div(1.0, tf.cast(self.ranks[:, -1] + 1, tf.float32))) + + # 取self.ranks最后一列,即正样本的排名序数,因为是从0算起的,所以要+1 + # Mean Reciprocal Rank(MRR) = 1.0/rank + # 通过正确的检索结果值在所有检索结果中的排名来评估排序性能, rank越大,mrr值越小 + + self.mrr = tf.reduce_mean( + tf.div(1.0, tf.cast(self.ranks[:, -1] + 1, tf.float32))) tf.summary.scalar('mrr', self.mrr) +# class Node2VecModel(GeneralizedModel): def __init__(self, placeholders, dict_size, degrees, name=None, nodevec_dim=50, lr=0.001, **kwargs): @@ -429,15 +645,15 @@ def __init__(self, placeholders, dict_size, degrees, name=None, # following the tensorflow word2vec tutorial self.target_embeds = tf.Variable( - tf.random_uniform([dict_size, nodevec_dim], -1, 1), - name="target_embeds") + tf.random_uniform([dict_size, nodevec_dim], -1, 1), + name="target_embeds") self.context_embeds = tf.Variable( - tf.truncated_normal([dict_size, nodevec_dim], - stddev=1.0 / math.sqrt(nodevec_dim)), - name="context_embeds") + tf.truncated_normal([dict_size, nodevec_dim], + stddev=1.0 / math.sqrt(nodevec_dim)), + name="context_embeds") self.context_bias = tf.Variable( - tf.zeros([dict_size]), - name="context_bias") + tf.zeros([dict_size]), + name="context_bias") self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr) @@ -445,8 +661,8 @@ def __init__(self, placeholders, dict_size, degrees, name=None, def _build(self): labels = tf.reshape( - tf.cast(self.placeholders['batch2'], dtype=tf.int64), - [self.batch_size, 1]) + tf.cast(self.placeholders['batch2'], dtype=tf.int64), + [self.batch_size, 1]) self.neg_samples, _, _ = (tf.nn.fixed_unigram_candidate_sampler( true_classes=labels, num_true=1, @@ -456,14 +672,19 @@ def _build(self): distortion=0.75, unigrams=self.degrees.tolist())) - self.outputs1 = tf.nn.embedding_lookup(self.target_embeds, self.inputs1) - self.outputs2 = tf.nn.embedding_lookup(self.context_embeds, self.inputs2) - self.outputs2_bias = tf.nn.embedding_lookup(self.context_bias, self.inputs2) - self.neg_outputs = tf.nn.embedding_lookup(self.context_embeds, self.neg_samples) - self.neg_outputs_bias = tf.nn.embedding_lookup(self.context_bias, self.neg_samples) + self.outputs1 = tf.nn.embedding_lookup( + self.target_embeds, self.inputs1) + self.outputs2 = tf.nn.embedding_lookup( + self.context_embeds, self.inputs2) + self.outputs2_bias = tf.nn.embedding_lookup( + self.context_bias, self.inputs2) + self.neg_outputs = tf.nn.embedding_lookup( + self.context_embeds, self.neg_samples) + self.neg_outputs_bias = tf.nn.embedding_lookup( + self.context_bias, self.neg_samples) self.link_pred_layer = BipartiteEdgePredLayer(self.hidden_dim, self.hidden_dim, - self.placeholders, bilinear_weights=False) + self.placeholders, bilinear_weights=False) def build(self): self._build() @@ -476,26 +697,31 @@ def _minimize(self): self.opt_op = self.optimizer.minimize(self.loss) def _loss(self): - aff = tf.reduce_sum(tf.multiply(self.outputs1, self.outputs2), 1) + self.outputs2_bias - neg_aff = tf.matmul(self.outputs1, tf.transpose(self.neg_outputs)) + self.neg_outputs_bias + aff = tf.reduce_sum(tf.multiply( + self.outputs1, self.outputs2), 1) + self.outputs2_bias + neg_aff = tf.matmul(self.outputs1, tf.transpose( + self.neg_outputs)) + self.neg_outputs_bias true_xent = tf.nn.sigmoid_cross_entropy_with_logits( - 
-            labels=tf.ones_like(aff), logits=aff)
+            labels=tf.ones_like(aff), logits=aff)
         negative_xent = tf.nn.sigmoid_cross_entropy_with_logits(
-            labels=tf.zeros_like(neg_aff), logits=neg_aff)
+            labels=tf.zeros_like(neg_aff), logits=neg_aff)
         loss = tf.reduce_sum(true_xent) + tf.reduce_sum(negative_xent)
         self.loss = loss / tf.cast(self.batch_size, tf.float32)
         tf.summary.scalar('loss', self.loss)
-
+
     def _accuracy(self):
         # shape: [batch_size]
         aff = self.link_pred_layer.affinity(self.outputs1, self.outputs2)
         # shape : [batch_size x num_neg_samples]
-        self.neg_aff = self.link_pred_layer.neg_cost(self.outputs1, self.neg_outputs)
-        self.neg_aff = tf.reshape(self.neg_aff, [self.batch_size, FLAGS.neg_sample_size])
+        self.neg_aff = self.link_pred_layer.neg_cost(
+            self.outputs1, self.neg_outputs)
+        self.neg_aff = tf.reshape(
+            self.neg_aff, [self.batch_size, FLAGS.neg_sample_size])
         _aff = tf.expand_dims(aff, axis=1)
         self.aff_all = tf.concat(axis=1, values=[self.neg_aff, _aff])
         size = tf.shape(self.aff_all)[1]
         _, indices_of_ranks = tf.nn.top_k(self.aff_all, k=size)
         _, self.ranks = tf.nn.top_k(-indices_of_ranks, k=size)
-        self.mrr = tf.reduce_mean(tf.div(1.0, tf.cast(self.ranks[:, -1] + 1, tf.float32)))
+        self.mrr = tf.reduce_mean(
+            tf.div(1.0, tf.cast(self.ranks[:, -1] + 1, tf.float32)))
         tf.summary.scalar('mrr', self.mrr)
diff --git a/graphsage/neigh_samplers.py b/graphsage/neigh_samplers.py
index 9c83d553..14292636 100644
--- a/graphsage/neigh_samplers.py
+++ b/graphsage/neigh_samplers.py
@@ -23,7 +23,13 @@ def __init__(self, adj_info, **kwargs):
     def _call(self, inputs):
         ids, num_samples = inputs
+
+        # look up each node's neighbor list in the full adjacency table by id
         adj_lists = tf.nn.embedding_lookup(self.adj_info, ids)
+
+        # transpose -- shuffle -- transpose: this shuffles the columns;
+        # shuffling directly would permute the rows instead
         adj_lists = tf.transpose(tf.random_shuffle(tf.transpose(adj_lists)))
+
+        # slice the columns: keep only num_samples neighbors and drop the rest
         adj_lists = tf.slice(adj_lists, [0,0], [-1, num_samples])
         return adj_lists
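A numpy sketch of what the transpose-shuffle-transpose achieves (toy adjacency, assumed shapes; note that tf.random_shuffle permutes only the first dimension, so after the transposes one column permutation is shared by all rows):

    import numpy as np

    adj_lists = np.arange(12).reshape(3, 4)          # [num_nodes, max_degree]
    perm = np.random.permutation(adj_lists.shape[1])
    shuffled = adj_lists[:, perm]                    # same column permutation per row
    sampled = shuffled[:, :2]                        # keep num_samples = 2 neighbors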
diff --git a/graphsage/prediction.py b/graphsage/prediction.py
index 0c00c68e..a64be482 100644
--- a/graphsage/prediction.py
+++ b/graphsage/prediction.py
@@ -11,8 +11,8 @@ class BipartiteEdgePredLayer(Layer):
     def __init__(self, input_dim1, input_dim2, placeholders, dropout=False, act=tf.nn.sigmoid,
-            loss_fn='xent', neg_sample_weights=1.0,
-            bias=False, bilinear_weights=False, **kwargs):
+                 loss_fn='xent', neg_sample_weights=1.0,
+                 bias=False, bilinear_weights=False, **kwargs):
         """ Basic class that applies skip-gram-like loss
         (i.e., dot product of node+target and node and negative samples)
         Args:
             bilinear_weights: use a bilinear weight for affinity calculation: u^T A v. If set to
                 false, it is assumed that input dimensions are the same and the affinity will be
                 based on dot product.
+
+        A basic class applying a skip-gram-style loss (dot product of node and target,
+        and of node and negative samples).
+        """
+
         super(BipartiteEdgePredLayer, self).__init__(**kwargs)
         self.input_dim1 = input_dim1
         self.input_dim2 = input_dim2
@@ -41,20 +45,25 @@ def __init__(self, input_dim1, input_dim2, placeholders, dropout=False, act=tf.n
         # output a likelihood term
         self.output_dim = 1

+        # open the variable scope
         with tf.variable_scope(self.name + '_vars'):
             # bilinear form
             if bilinear_weights:
-                #self.vars['weights'] = glorot([input_dim1, input_dim2],
+                # self.vars['weights'] = glorot([input_dim1, input_dim2],
                 #                              name='pred_weights')
+                # fetch the variable if it already exists, otherwise create it:
+                # 'pred_weights' is the name, shape the dimensions, initializer the init scheme
                 self.vars['weights'] = tf.get_variable(
-                        'pred_weights',
-                        shape=(input_dim1, input_dim2),
-                        dtype=tf.float32,
-                        initializer=tf.contrib.layers.xavier_initializer())
+                    'pred_weights',
+                    shape=(input_dim1, input_dim2),
+                    dtype=tf.float32,
+                    # the "Xavier" initializer keeps the scale of the gradients
+                    # roughly the same across all layers
+                    initializer=tf.contrib.layers.xavier_initializer())

+            # bias defaults to False at construction, so no matrix is created;
+            # if True, a zero vector is created
             if self.bias:
                 self.vars['bias'] = zeros([self.output_dim], name='bias')

+            # bind loss_fn to the method with the matching name
             if loss_fn == 'xent':
                 self.loss_fn = self._xent_loss
             elif loss_fn == 'skipgram':
@@ -69,13 +78,22 @@ def affinity(self, inputs1, inputs2):
         """ Affinity score between batch of inputs1 and inputs2.
         Args:
             inputs1: tensor of shape [batch_size x feature_size].
+
+        Computes the "affinity" of the positive pairs:
+        (1) elementwise product of the feature matrices (without bilinear_weights)
+        (2) sum along each row, i.e. a dot product per pair
+
+        Returns the affinity between each sample and its positive partner,
+        shape [batch_size].
         """
         # shape: [batch_size, input_dim1]
         if self.bilinear_weights:
+            # multiply inputs2 by the transposed weight matrix
             prod = tf.matmul(inputs2, tf.transpose(self.vars['weights']))
             self.prod = prod
+            # elementwise product of inputs1 and prod, reduced per row to a 1-D tensor
             result = tf.reduce_sum(inputs1 * prod, axis=1)
         else:
+            # elementwise product of inputs1 and inputs2, reduced per row to a 1-D tensor
             result = tf.reduce_sum(inputs1 * inputs2, axis=1)
         return result
@@ -85,42 +103,84 @@ def neg_cost(self, inputs1, neg_samples, hard_neg_samples=None):
         Returns:
             Tensor of shape [batch_size x num_neg_samples]. For each node, a list of affinities to
                 negative samples is computed.
+
+        Computes the "affinity" between each input sample and every negative sample:
+        (1) inputs_features x neg_features.T
+
+        Returns the affinity of each sample to every negative,
+        shape [batch_size, num_neg_samples].
+
         """
         if self.bilinear_weights:
+            # multiply inputs1 by the weight matrix
             inputs1 = tf.matmul(inputs1, self.vars['weights'])
+        # multiply inputs1 by the transposed neg_samples matrix
         neg_aff = tf.matmul(inputs1, tf.transpose(neg_samples))
+
         return neg_aff
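A shape-level numpy sketch of the dot-product branches of affinity() and neg_cost() (random toy tensors, assumed sizes):

    import numpy as np

    batch_size, dim, num_neg = 4, 8, 6
    inputs1 = np.random.randn(batch_size, dim)
    inputs2 = np.random.randn(batch_size, dim)
    neg_samples = np.random.randn(num_neg, dim)

    aff = np.sum(inputs1 * inputs2, axis=1)   # [batch_size]: one score per positive pair
    neg_aff = inputs1 @ neg_samples.T         # [batch_size, num_neg]: scores vs. negatives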
+ """ return self.loss_fn(inputs1, inputs2, neg_samples) def _xent_loss(self, inputs1, inputs2, neg_samples, hard_neg_samples=None): + """ + 计算正样本的交叉熵损失,正样本label赋值全1, 负样本label赋值全0 + 公式 : y * -log(sigmoid(x)) + (1 - y) * -log(1 - sigmoid(x)) + 正样本y=1,负样本y=0,分别可以省略一项 + + ①计算正样本对的亲和度 + ②计算样本和负样本的亲和度 + ③将label全部设为1,计算正样本对产生的loss + ④将label全部设为0,计算所有负样本产生的loss + ⑤将两个loss平均一下 + + 对应论文的公式(1) + + """ + # 计算正样本对的亲和度 aff = self.affinity(inputs1, inputs2) + + # 计算顶点和各个负样本的亲和度 neg_aff = self.neg_cost(inputs1, neg_samples, hard_neg_samples) + + + #计算损失,logits和labels必须有相同的类型和大小 true_xent = tf.nn.sigmoid_cross_entropy_with_logits( - labels=tf.ones_like(aff), logits=aff) + labels=tf.ones_like(aff), logits=aff) + + # 计算负样本的交叉熵损失 negative_xent = tf.nn.sigmoid_cross_entropy_with_logits( - labels=tf.zeros_like(neg_aff), logits=neg_aff) - loss = tf.reduce_sum(true_xent) + self.neg_sample_weights * tf.reduce_sum(negative_xent) + labels=tf.zeros_like(neg_aff), logits=neg_aff) + + + # neg_sample_weights 默认为1.0 + loss = tf.reduce_sum( + true_xent) + self.neg_sample_weights * tf.reduce_sum(negative_xent) return loss def _skipgram_loss(self, inputs1, inputs2, neg_samples, hard_neg_samples=None): aff = self.affinity(inputs1, inputs2) neg_aff = self.neg_cost(inputs1, neg_samples, hard_neg_samples) neg_cost = tf.log(tf.reduce_sum(tf.exp(neg_aff), axis=1)) + #off矩阵 减去 neg_cost矩阵,再求和得出损失 loss = tf.reduce_sum(aff - neg_cost) return loss def _hinge_loss(self, inputs1, inputs2, neg_samples, hard_neg_samples=None): aff = self.affinity(inputs1, inputs2) neg_aff = self.neg_cost(inputs1, neg_samples, hard_neg_samples) - diff = tf.nn.relu(tf.subtract(neg_aff, tf.expand_dims(aff, 1) - self.margin), name='diff') + #tf.nn.relu:将输入小于0的值赋值为0,输入大于0的值不变 + #tf.subtract:返回的数据类型与neg_aff相同,且第一个参数减去第二个参数的操作是元素级别的 + #tf.expand_dims:用于给函数增加维度 + diff = tf.nn.relu(tf.subtract( + neg_aff, tf.expand_dims(aff, 1) - self.margin), name='diff') loss = tf.reduce_sum(diff) + #得到neg_aff矩阵的shape self.neg_shape = tf.shape(neg_aff) return loss diff --git a/graphsage/supervised_models.py b/graphsage/supervised_models.py index 9ea123ce..84a6c1ad 100644 --- a/graphsage/supervised_models.py +++ b/graphsage/supervised_models.py @@ -27,6 +27,9 @@ def __init__(self, num_classes, - aggregator_type: how to aggregate neighbor information - model_size: one of "small" and "big" - sigmoid_loss: Set to true if nodes can belong to multiple classes + + + 该初始化部分和Model.SampleAndAggregate类基本相同 ''' models.GeneralizedModel.__init__(self, **kwargs) @@ -76,31 +79,54 @@ def __init__(self, num_classes, def build(self): - samples1, support_sizes1 = self.sample(self.inputs1, self.layer_infos) + + """ + 输出特征表达outputs1、计算梯度的流程和SampleAndAggregate的基本相同 + + 只是该有监督模型在特征表达的结果后面加了一层全连接(Dense),用来预测 + """ + # 调用的父类SampleAndAggregate的采样方案 + samples1, support_sizes1 = self.sample(self.inputs1, self.layer_infos) num_samples = [layer_info.num_samples for layer_info in self.layer_infos] + + # 构建聚合器并输出节点表达结果 self.outputs1, self.aggregators = self.aggregate(samples1, [self.features], self.dims, num_samples, support_sizes1, concat=self.concat, model_size=self.model_size) - dim_mult = 2 if self.concat else 1 + dim_mult = 2 if self.concat else 1 + # L2规范化 self.outputs1 = tf.nn.l2_normalize(self.outputs1, 1) - dim_mult = 2 if self.concat else 1 + + + """ + 全连接层 + 该层的输入维度为aggregator输出的特征表达的维度 + 输出维度为分类数 + """ self.node_pred = layers.Dense(dim_mult*self.dims[-1], self.num_classes, dropout=self.placeholders['dropout'], act=lambda x : x) # TF graph management self.node_preds = self.node_pred(self.outputs1) + 
+        # define the loss function
         self._loss()
+
+        # gradient computation, same as in SampleAndAggregate
         grads_and_vars = self.optimizer.compute_gradients(self.loss)
         clipped_grads_and_vars = [(tf.clip_by_value(grad, -5.0, 5.0) if grad is not None else None, var)
                 for grad, var in grads_and_vars]
         self.grad, _ = clipped_grads_and_vars[0]
+
         self.opt_op = self.optimizer.apply_gradients(clipped_grads_and_vars)
+
         self.preds = self.predict()

     def _loss(self):
         # Weight decay loss
+
+        # L2 regularization term
         for aggregator in self.aggregators:
             for var in aggregator.vars.values():
                 self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var)
@@ -108,6 +134,7 @@ def _loss(self):
             self.loss += FLAGS.weight_decay * tf.nn.l2_loss(var)

         # classification loss
+        # cross-entropy loss; the activation can be sigmoid or softmax, mapping the outputs into [0, 1]
         if self.sigmoid_loss:
             self.loss += tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
                     logits=self.node_preds,
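A minimal sketch of the choice between those two losses (TF1-style; the class count 121 mirrors the toy-ppi labels mentioned elsewhere, but shapes here are purely illustrative):

    import tensorflow as tf

    logits = tf.placeholder(tf.float32, [None, 121])
    labels = tf.placeholder(tf.float32, [None, 121])
    # multi-label nodes: one independent sigmoid per class
    sigmoid_loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
    # single-label nodes: softmax over mutually exclusive classes
    softmax_loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=labels))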
diff --git a/graphsage/supervised_train.py b/graphsage/supervised_train.py
index c5bff002..98349e07 100644
--- a/graphsage/supervised_train.py
+++ b/graphsage/supervised_train.py
@@ -131,11 +131,12 @@ def train(train_data, test_data=None):
     num_classes = len(set(class_map.values()))

     if not features is None:
-        # pad with dummy zero vector
+        # pad with dummy zero vector, row wise
         features = np.vstack([features, np.zeros((features.shape[1],))])

     context_pairs = train_data[3] if FLAGS.random_context else None
     placeholders = construct_placeholders(num_classes)
+    # instantiate the NodeMinibatch iterator
     minibatch = NodeMinibatchIterator(G,
             id_map,
             placeholders,
@@ -144,6 +145,7 @@ def train(train_data, test_data=None):
             batch_size=FLAGS.batch_size,
             max_degree=FLAGS.max_degree,
             context_pairs = context_pairs)
+    # adjacency shape: (14756, 128), wrapped in a placeholder
     adj_info_ph = tf.placeholder(tf.int32, shape=minibatch.adj.shape)
     adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")
diff --git a/graphsage/unsupervised_train.py b/graphsage/unsupervised_train.py
index 44ef6091..c7bd8df2 100644
--- a/graphsage/unsupervised_train.py
+++ b/graphsage/unsupervised_train.py
@@ -70,12 +70,27 @@ def log_dir():

 # Define model evaluation function
 def evaluate(sess, model, minibatch_iter, size=None):
+    """
+    Evaluation function.
+    Inputs:
+        model: the trained model
+        minibatch_iter: the iterator class
+        size: batch_size
+    """
+
     t_test = time.time()
+
+    # evaluation phase; the validation set is fed here
     feed_dict_val = minibatch_iter.val_feed_dict(size)
+
+    # only the loss and related metrics are run here
     outs_val = sess.run([model.loss, model.ranks, model.mrr],
                         feed_dict=feed_dict_val)
     return outs_val[0], outs_val[1], outs_val[2], (time.time() - t_test)
+
+
+# this function is not used in the unsupervised setting; ignore it for now
 def incremental_evaluate(sess, model, minibatch_iter, size):
     t_test = time.time()
     finished = False
@@ -92,6 +107,16 @@ def incremental_evaluate(sess, model, minibatch_iter, size):
     return np.mean(val_losses), np.mean(val_mrrs), (time.time() - t_test)

 def save_val_embeddings(sess, model, minibatch_iter, size, out_dir, mod=""):
+    """
+    Inputs:
+        model: the trained model
+        minibatch_iter: the iterator class
+        size: batch_size
+        out_dir: output directory
+
+    After training, this function feeds every node through the model and saves
+    the resulting representations to disk.
+    """
+
     val_embeddings = []
     finished = False
     seen = set([])
     nodes = []
     iter_num = 0
     name = "val"
     while not finished:
+
+        # fetch the feed dict for the next batch;
+        # finished is a flag: once True, all nodes have been visited and the loop exits
         feed_dict_val, finished, edges = minibatch_iter.incremental_embed_feed_dict(size, iter_num)
         iter_num += 1
+
+        # compute the representations
         outs_val = sess.run([model.loss, model.mrr, model.outputs1],
                             feed_dict=feed_dict_val)
-        #ONLY SAVE FOR embeds1 because of planetoid
+
+        # ONLY SAVE FOR embeds1 because of planetoid
         for i, edge in enumerate(edges):
             if not edge[0] in seen:
+                # outs_val[-1] is the model.outputs1 tensor;
+                # outs_val[-1][i, :] is the representation of the i-th node
                 val_embeddings.append(outs_val[-1][i,:])
                 nodes.append(edge[0])
                 seen.add(edge[0])
     if not os.path.exists(out_dir):
         os.makedirs(out_dir)
+
     val_embeddings = np.vstack(val_embeddings)
     np.save(out_dir + name + mod + ".npy", val_embeddings)
     with open(out_dir + name + mod + ".txt", "w") as fp:
@@ -118,6 +152,13 @@ def save_val_embeddings(sess, model, minibatch_iter, size, out_dir, mod=""):

 def construct_placeholders():
     # Define placeholders
+    '''
+    batch1 and batch2 hold the two endpoints of each edge and are later used as
+    positive pairs.
+    neg_samples is the number of negative samples.
+    dropout is the probability of dropping a feature on the way to the next layer,
+    which helps the model generalize.
+    '''
+
     placeholders = {
         'batch1' : tf.placeholder(tf.int32, shape=(None), name='batch1'),
         'batch2' : tf.placeholder(tf.int32, shape=(None), name='batch2'),
@@ -130,30 +171,42 @@ def construct_placeholders():
     return placeholders

 def train(train_data, test_data=None):
+
+    # load the graph, the node features and the node id map
+
     G = train_data[0]
-    features = train_data[1]
+    features = train_data[1]  # shape = [num_nodes, num_features]
     id_map = train_data[2]

     if not features is None:
-        # pad with dummy zero vector
+        # pad with dummy zero vector: append a row of zeros to features
         features = np.vstack([features, np.zeros((features.shape[1],))])
+
+    # if the flag is set, use random-walk co-occurrence pairs instead of the
+    # edge information stored in graph G
     context_pairs = train_data[3] if FLAGS.random_context else None
+
+    # define the placeholders
    placeholders = construct_placeholders()
+
+    # create an edge minibatch iterator: each batch is a set of edges, and the ids
+    # of the two endpoints of each edge are the model inputs
     minibatch = EdgeMinibatchIterator(G,
            id_map,
            placeholders, batch_size=FLAGS.batch_size,
            max_degree=FLAGS.max_degree,
-           num_neg_samples=FLAGS.neg_sample_size,
-           context_pairs = context_pairs)
-    adj_info_ph = tf.placeholder(tf.int32, shape=minibatch.adj.shape)
+           num_neg_samples=FLAGS.neg_sample_size,  # num_neg_samples is not actually used here
+           context_pairs = context_pairs)
+
+    # create a placeholder shaped like the adjacency table
+    adj_info_ph = tf.placeholder(tf.int32, shape=minibatch.adj.shape)
     adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")

     if FLAGS.model == 'graphsage_mean':
-        # Create model
+        # Create model
+        #
         sampler = UniformNeighborSampler(adj_info)
         layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
-                            SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)]
+                       SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)]  # samples_1 = 25, samples_2 = 10

         model = SampleAndAggregate(placeholders,
                                    features,
@@ -233,6 +286,8 @@ def train(train_data, test_data=None):
     else:
         raise Exception('Error: model name unrecognized.')

+
+
     config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
     config.gpu_options.allow_growth = True
     #config.gpu_options.per_process_gpu_memory_fraction = GPU_MEM_FRACTION
@@ -243,10 +298,10 @@ def train(train_data, test_data=None):
     merged = tf.summary.merge_all()
     summary_writer = tf.summary.FileWriter(log_dir(), sess.graph)

-    # Init variables
+    # Init variables -- initialize the parameters
     sess.run(tf.global_variables_initializer(), feed_dict={adj_info_ph: minibatch.adj})

-    # Train model
+    # Train model -- training
     train_shadow_mrr = None
     shadow_mrr = None
@@ -255,8 +310,12 @@ def train(train_data, test_data=None):
     avg_time = 0.0
     epoch_val_costs = []

+    # tf.assign writes an adjacency table into adj_info: the training adjacency is
+    # used during training, the validation adjacency during validation.
+    # nothing executes here yet; the ops only run on a later sess.run
     train_adj_info = tf.assign(adj_info, minibatch.adj)
     val_adj_info = tf.assign(adj_info, minibatch.test_adj)
+
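A self-contained sketch of this placeholder-initialized, non-trainable variable plus the tf.assign swap (TF1-style; the toy adjacency tables and names are illustrative, not repository code):

    import numpy as np
    import tensorflow as tf

    train_adj = np.zeros((4, 3), dtype=np.int32)   # assumed toy tables
    val_adj = np.ones((4, 3), dtype=np.int32)

    adj_ph = tf.placeholder(tf.int32, shape=train_adj.shape)
    adj_var = tf.Variable(adj_ph, trainable=False, name="adj_demo")
    to_train = tf.assign(adj_var, train_adj)       # ops only; nothing runs yet
    to_val = tf.assign(adj_var, val_adj)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer(), feed_dict={adj_ph: train_adj})
        sess.run(to_val)     # switch to the validation adjacency
        sess.run(to_train)   # and back to the training adjacency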
     for epoch in range(FLAGS.epochs):
         minibatch.shuffle()

         iter = 0
@@ -265,24 +324,39 @@ def train(train_data, test_data=None):
         epoch_val_costs.append(0)
         while not minibatch.end():
             # Construct feed dictionary
-            feed_dict = minibatch.next_minibatch_feed_dict()
+            # read one batch: each batch holds the two endpoints of its edges,
+            # placed into batch1 and batch2 respectively
+            feed_dict = minibatch.next_minibatch_feed_dict()
             feed_dict.update({placeholders['dropout']: FLAGS.dropout})

             t = time.time()
-            # Training step
+
+            # Training step
+            # merged collects all tf.summary variables for tensorboard plotting;
+            # model.opt_op is the parameter-update operation
             outs = sess.run([merged, model.opt_op, model.loss, model.ranks, model.aff_all,
-                    model.mrr, model.outputs1], feed_dict=feed_dict)
+                             model.mrr, model.outputs1], feed_dict=feed_dict)  # training
             train_cost = outs[2]
-            train_mrr = outs[5]
+            train_mrr = outs[5]
+
+            # train_shadow_mrr is updated as a moving average: instead of taking each
+            # batch's mrr directly, the history is mixed in, which keeps the reported
+            # metric from jittering too much
+            # see https://www.pianshen.com/article/11191400472/
             if train_shadow_mrr is None:
                 train_shadow_mrr = train_mrr#
             else:
+                # in 1-0.99, 0.99 is the decay rate
                 train_shadow_mrr -= (1-0.99) * (train_shadow_mrr - train_mrr)

             if iter % FLAGS.validate_iter == 0:
-                # Validation
+                # Validation needs the validation adjacency table, so run the
+                # tf.assign op that writes the validation adjacency into adj_info
+
                 sess.run(val_adj_info.op)
                 val_cost, ranks, val_mrr, duration = evaluate(sess, model, minibatch, size=FLAGS.validate_batch_size)
+                # switch back to the training adjacency once validation is done
+                sess.run(train_adj_info.op)
                 epoch_val_costs[-1] += val_cost

             if shadow_mrr is None:
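The update above is an exponential moving average with decay 0.99. A plain-Python sketch with made-up MRR values:

    # Sketch of the shadow-MRR moving average (decay 0.99, illustrative values).
    def update_shadow(shadow, value, decay=0.99):
        if shadow is None:
            return value
        return shadow - (1.0 - decay) * (shadow - value)

    shadow = None
    for mrr in [0.50, 0.10, 0.90]:   # noisy per-batch MRRs
        shadow = update_shadow(shadow, mrr)
    # shadow moves slowly: 0.50 -> 0.496 -> 0.50004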
now preprocessing..") for edge in G.edges(): if (G.node[edge[0]]['val'] or G.node[edge[1]]['val'] or @@ -59,10 +78,14 @@ def load_data(prefix, normalize=True, load_walks=False): else: G[edge[0]][edge[1]]['train_removed'] = False - if normalize and not feats is None: + if normalize and feats is not None: from sklearn.preprocessing import StandardScaler + # 训练集的id集合,only int, len: 9716 train_ids = np.array([id_map[n] for n in G.nodes() if not G.node[n]['val'] and not G.node[n]['test']]) train_feats = feats[train_ids] + # 特征缩放,标准化:z = (x - u) / s + # u is the mean of the training samples + # s is the standard deviation of the training samples scaler = StandardScaler() scaler.fit(train_feats) feats = scaler.transform(feats)