圖神經網絡-圖遊走算法核心代碼SkipGram、Node2Vec實現
1. DeepWalk採樣算法
對於給定的節點,DeepWalk會等概率的選取下一個相鄰節點加入路徑,直至達到最大路徑長度,或者沒有下一個節點可選。
2. SkipGram模型訓練
在得到節點路徑後,node2vec會使用SkipGram模型學習節點表示,給定中心節點,預測局部路徑中還有哪些節點。模型中用了negative sampling來降低計算量。
loss的計算過程可參考 PGL/examples/node2vec/node2vec.py中的 node2vec_model 函數。
%%writefile userdef_model.pyimport paddle.fluid.layers as l
def userdef_loss(embed_src, weight_pos, weight_negs): """ 輸入:embed_src - 中心節點向量 list (batch_size, 1, embed_size) weight_pos - 標籤節點向量 list (batch_size, 1, embed_size) weight_negs - 負樣本節點向量 list (batch_size, neg_num, embed_size) 輸出:loss - 正負樣本的交叉熵 float """ pos_logits = l.matmul( embed_src, weight_pos, transpose_y=True) neg_logits = l.matmul( embed_src, weight_negs, transpose_y=True)
ones_label = pos_logits * 0. + 1. ones_label.stop_gradient = True pos_loss = l.sigmoid_cross_entropy_with_logits(pos_logits, ones_label)
zeros_label = neg_logits * 0. zeros_label.stop_gradient = True neg_loss = l.sigmoid_cross_entropy_with_logits(neg_logits, zeros_label)
loss = (l.reduce_mean(pos_loss) + l.reduce_mean(neg_loss)) / 2 return loss運行腳本,可以查看在ArXiv數據集上的效果:
!python my_node2vec.py --use_my_model --epoch 5  # 使用自己定義的loss函數
!python link_predict.py --ckpt_path ./tmp/node2vec/paddle_model --epoch 100
pgl的官網node2vec_model原始碼:
def node2vec_model(graph, hidden_size=16, neg_num=5):
pyreader = l.py_reader( capacity=70, shapes=[[-1, 1, 1], [-1, 1, 1], [-1, neg_num, 1]], dtypes=['int64', 'int64', 'int64'], lod_levels=[0, 0, 0], name='train', use_double_buffer=True)
embed_init = fluid.initializer.UniformInitializer(low=-1.0, high=1.0) weight_init = fluid.initializer.TruncatedNormal(scale=1.0 / math.sqrt(hidden_size))
src, pos, negs = l.read_file(pyreader)
embed_src = l.embedding( input=src, size=[graph.num_nodes, hidden_size], param_attr=fluid.ParamAttr( name='content', initializer=embed_init))
weight_pos = l.embedding( input=pos, size=[graph.num_nodes, hidden_size], param_attr=fluid.ParamAttr( name='weight', initializer=weight_init)) weight_negs = l.embedding( input=negs, size=[graph.num_nodes, hidden_size], param_attr=fluid.ParamAttr( name='weight', initializer=weight_init))
pos_logits = l.matmul( embed_src, weight_pos, transpose_y=True) # [batch_size, 1, 1] neg_logits = l.matmul( embed_src, weight_negs, transpose_y=True) # [batch_size, 1, neg_num]
ones_label = pos_logits * 0. + 1. ones_label.stop_gradient = True pos_loss = l.sigmoid_cross_entropy_with_logits(pos_logits, ones_label)
zeros_label = neg_logits * 0. zeros_label.stop_gradient = True neg_loss = l.sigmoid_cross_entropy_with_logits(neg_logits, zeros_label) loss = (l.reduce_mean(pos_loss) + l.reduce_mean(neg_loss)) / 2
return pyreader, loss3. Node2Vec採樣算法
Node2Vec會根據與上個節點的距離按不同概率採樣得到當前節點的下一個節點。
PGL/pgl/graph_kernel.pyx 中用Cython語言實現了節點採樣函數node2vec_sample,以下使用numpy實現自己的node2vec_sample函數 ,用於隨機遊走後得到的路徑,然後對這些路徑進行吸收學習,訓練圖結構
# %%writefile userdef_sample.py  (notebook cell magic, kept as a comment)
import numpy as np


def node2vec_sample(succ, prev_succ, prev_node, p, q):
    """Sample the next node of a node2vec random walk.

    Args:
        succ: neighbor ids of the current node, list of int.
        prev_succ: neighbor ids of the previous node, list of int.
        prev_node: id of the previous node, int.
        p: return parameter; larger p lowers the chance of stepping back.
        q: in-out parameter; controls BFS-like vs DFS-like exploration.

    Returns:
        The sampled next node id (int). Returns 0 if succ is empty.
    """
    # Python set gives O(1) membership tests; the original built a numpy
    # array with np.append in a loop (O(n^2)) and then tested membership
    # with np.where, whose tuple result is always truthy — making every
    # non-return neighbor get probability 1 regardless of distance.
    prev_succ_set = set(prev_succ)

    # Unnormalized node2vec transition weights:
    #   1/p — step back to prev_node,
    #   1   — neighbor also adjacent to prev_node (distance 1),
    #   1/q — all other neighbors (distance 2).
    probs = []
    prob_sum = 0.
    for nxt in succ:
        if nxt == prev_node:
            prob = 1. / p
        elif nxt in prev_succ_set:
            prob = 1.
        else:
            prob = 1. / q
        probs.append(prob)
        prob_sum += prob

    # Roulette-wheel selection; mirrors rand()/RAND_MAX in the Cython version.
    RAND_MAX = 65535
    rand_num = float(np.random.randint(0, RAND_MAX + 1)) / RAND_MAX * prob_sum

    sampled_succ = 0
    for i, prob in enumerate(probs):
        rand_num -= prob
        if rand_num <= 0:
            sampled_succ = succ[i]
            break  # stop at the first bucket hit; do not overwrite it later
    return sampled_succ

# 運行腳本 (run the scripts below):
!python my_node2vec.py --use_my_sample --epoch 5
!python link_predict.py --ckpt_path ./tmp/node2vec/paddle_model --epoch 100

pgl官網node2vec_sample原始碼:
@cython.boundscheck(False)
@cython.wraparound(False)
def node2vec_sample(np.ndarray[np.int64_t, ndim=1] succ,
                    np.ndarray[np.int64_t, ndim=1] prev_succ,
                    long long prev_node,
                    float p, float q):
    """Fast implementation of node2vec transition sampling.

    Picks the next node of the walk from `succ` with unnormalized weights
    1/p (return to prev_node), 1 (shared neighbor of prev_node), 1/q (other).
    """
    cdef long long i
    cdef succ_len = len(succ)
    cdef prev_succ_len = len(prev_succ)

    cdef vector[float] probs
    cdef float prob_sum = 0

    # Hash set of the previous node's neighbors for O(1) membership tests.
    cdef unordered_set[long long] prev_succ_set
    for i in xrange(prev_succ_len):
        prev_succ_set.insert(prev_succ[i])

    cdef float prob
    for i in xrange(succ_len):
        if succ[i] == prev_node:
            prob = 1. / p
        elif prev_succ_set.find(succ[i]) != prev_succ_set.end():
            prob = 1.
        else:
            prob = 1. / q
        probs.push_back(prob)
        prob_sum += prob

    # Roulette-wheel selection over the unnormalized weights.
    cdef float rand_num = float(rand()) / RAND_MAX * prob_sum

    cdef long long sample_succ = 0
    for i in xrange(succ_len):
        rand_num -= probs[i]
        if rand_num <= 0:
            sample_succ = succ[i]
            return sample_succ