forked from BUPT-GAMMA/OpenHGNN
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadapter.py
422 lines (370 loc) · 18.8 KB
/
adapter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
"""Dataset adapters for re-purposing a dataset for a different kind of training task."""
import os
import json
import numpy as np
from dgl.data import utils, DGLDataset
from dgl import backend as F
import dgl
from dgl.dataloading.negative_sampler import GlobalUniform, PerSourceUniform
import torch as th
from dgl import DGLHeteroGraph
import torch
__all__ = ['AsNodeClassificationDataset', 'AsLinkPredictionDataset']
class AsNodeClassificationDataset(DGLDataset):
    """Repurpose a dataset for a standard semi-supervised transductive
    node prediction task.

    The class converts a given dataset into a new dataset object that:

    - Contains only one heterogeneous graph, accessible from ``dataset[0]``.
    - The graph stores:

      - Node labels in ``g.nodes[target_ntype].data['label']``.
      - Train/val/test masks in ``g.nodes[target_ntype].data['train_mask']``,
        ``g.nodes[target_ntype].data['val_mask']`` and
        ``g.nodes[target_ntype].data['test_mask']`` respectively.

    - In addition, the dataset contains the following attributes:

      - ``num_classes``, the number of classes to predict.
      - ``train_idx``, ``val_idx``, ``test_idx``, train/val/test indexes.

    The class will keep only the first graph in the provided dataset and
    generate train/val/test masks according to the given split ratio. The generated
    masks will be cached to disk for fast re-loading. If the provided split ratio
    differs from the cached one, it will re-process the dataset properly.

    Parameters
    ----------
    data : DGLDataset or DGLHeteroGraph
        The dataset or graph to be converted.
    name : str
        The dataset name. Optional when data is DGLDataset. Required when data is DGLHeteroGraph.
    labeled_nodes_split_ratio : (float, float, float), optional
        Split ratios for training, validation and test sets. Must sum to 1. If None, we will use
        the train_mask, val_mask and test_mask from the original graph.
    prediction_ratio : float, optional
        The ratio of number of prediction nodes to all unlabeled nodes. Ranges from 0 to 1.
        If None, we will use the pred_mask from the original graph.
    target_ntype : str
        The node type to add split masks for.
    label_feat_name : str, optional
        The feature name of the label. If None, the name "label" is used.
    label_mask_feat_name : str, optional
        The feature name of the mask indicating which nodes carry labels.
        None means that all nodes are labeled.

    Attributes
    ----------
    num_classes : int
        Number of classes to predict.
    train_idx : Tensor
        A 1-D integer tensor of training node IDs.
    val_idx : Tensor
        A 1-D integer tensor of validation node IDs.
    test_idx : Tensor
        A 1-D integer tensor of test node IDs.
    pred_idx : Tensor
        A 1-D integer tensor of prediction node IDs.
    """

    def __init__(self,
                 data,
                 name=None,
                 labeled_nodes_split_ratio=None,
                 prediction_ratio=None,
                 target_ntype=None,
                 label_feat_name='label',
                 label_mask_feat_name=None,
                 **kwargs):
        self.label_feat_name = label_feat_name
        self.prediction_ratio = prediction_ratio
        self.label_mask_feat_name = label_mask_feat_name
        if isinstance(data, DGLDataset):
            self.dataset = data
            self.g = data[0]
            if name is None:
                name = self.dataset.name
        elif isinstance(data, DGLHeteroGraph):
            self.dataset = None
            self.g = data
            assert name is not None, \
                "Name is required when data is a graph."
        else:
            raise ValueError("Invalid data type.")
        self.split_ratio = kwargs.pop('split_ratio', None)  # for compatibility
        if labeled_nodes_split_ratio is not None:
            self.split_ratio = labeled_nodes_split_ratio
        self.target_ntype = target_ntype
        # Include every parameter that influences the generated masks in the
        # hash key; otherwise datasets differing only in e.g. prediction_ratio
        # would silently share (and load) the same stale cache file.
        super().__init__(name + '-as-nodepred',
                         hash_key=(self.split_ratio, target_ntype, name, 'nodepred',
                                   prediction_ratio, label_feat_name, label_mask_feat_name),
                         **kwargs)

    def process(self):
        """Generate (or reuse) split/prediction masks and derive dataset metadata."""
        if self.label_feat_name not in self.g.nodes[self.target_ntype].data:
            raise ValueError("Missing node labels. Make sure labels are stored "
                             "under name {}.".format(self.label_feat_name))
        if self.split_ratio is None:
            if self.verbose:
                print('Split ratio is not provided, '
                      'we will use the train_mask, val_mask and test_mask from the original graph.')
        else:
            if self.verbose:
                print('Generating train/val/test masks...')
            self.gene_mask(self.split_ratio, self.target_ntype)
        if self.prediction_ratio is None:
            if self.verbose:
                print("Prediction ratio is not provided, we will use the pred_mask from the original graph.")
        elif self.label_mask_feat_name is not None:
            self.gene_pred_mask(self.prediction_ratio, self.target_ntype)
        else:
            if self.verbose:
                print('All nodes have label, will not predict.')
        self._set_split_index(self.target_ntype)

        # A 2-D label tensor marks a multi-label task (one column per class).
        self.multi_label = getattr(self.dataset, 'multi_label', None)
        if self.multi_label is None:
            self.multi_label = len(self.g.nodes[self.target_ntype].data[self.label_feat_name].shape) == 2

        self.num_classes = getattr(self.dataset, 'num_classes', None)
        if self.num_classes is None:
            if self.multi_label:
                self.num_classes = self.g.nodes[self.target_ntype].data[self.label_feat_name].shape[1]
            else:
                self.num_classes = len(F.unique(self.g.nodes[self.target_ntype].data[self.label_feat_name]))

        self.meta_paths = getattr(self.dataset, 'meta_paths', None)
        self.meta_paths_dict = getattr(self.dataset, 'meta_paths_dict', None)

    def gene_pred_mask(self, ratio, ntype):
        """Sample ``ratio`` of the unlabeled nodes (label mask == 0) of ``ntype``
        and store the result as a boolean ``pred_mask`` on the graph."""
        idx_tensor = torch.where(self.g.nodes[ntype].data[self.label_mask_feat_name] == 0)[0]
        idx = idx_tensor.tolist()
        len_nodes = len(self.g.nodes[ntype].data[self.label_mask_feat_name])
        n = len(idx)
        np.random.shuffle(idx)
        n_pred = int(n * ratio)
        pred_mask = utils.generate_mask_tensor(utils.idx2mask(idx[:n_pred], len_nodes))
        self.g.nodes[ntype].data['pred_mask'] = pred_mask

    def gene_mask(self, ratio, ntype):
        """Randomly split the labeled nodes of ``ntype`` by the (train, val, test)
        ``ratio`` triplet and store boolean masks on the graph."""
        if len(ratio) != 3:
            raise ValueError(f'Split ratio must be a float triplet but got {ratio}.')
        if self.label_mask_feat_name is None:
            # All nodes are labeled; split over every node of this type.
            idx_tensor = self.g.nodes(ntype)
        else:
            # Only split the nodes flagged as labeled by the label mask.
            idx_tensor = torch.nonzero(self.g.nodes[ntype].data[self.label_mask_feat_name]).squeeze(1)
        idx = idx_tensor.tolist()
        len_nodes = len(self.g.nodes(ntype))
        n = len(idx)
        np.random.shuffle(idx)
        n_train, n_val, n_test = int(n * ratio[0]), int(n * ratio[1]), int(n * ratio[2])
        train_mask = utils.generate_mask_tensor(utils.idx2mask(idx[:n_train], len_nodes))
        val_mask = utils.generate_mask_tensor(utils.idx2mask(idx[n_train:n_train + n_val], len_nodes))
        test_mask = utils.generate_mask_tensor(utils.idx2mask(idx[n_train + n_val:], len_nodes))
        self.g.nodes[ntype].data['train_mask'] = train_mask
        self.g.nodes[ntype].data['val_mask'] = val_mask
        self.g.nodes[ntype].data['test_mask'] = test_mask

    def has_cache(self):
        """Return True when a graph cache for the current hash exists on disk."""
        return os.path.isfile(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))

    def load(self):
        """Restore the processed graph and metadata from the cache directory."""
        with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'r') as f:
            info = json.load(f)
            # Defensive check: the hash key normally prevents a mismatch, but a
            # collision would otherwise load masks generated for other settings.
            if (info['split_ratio'] != self.split_ratio
                    or info['target_ntype'] != self.target_ntype):
                raise ValueError('Provided split ratio is different from the cached file. '
                                 'Re-process the dataset.')
            self.split_ratio = info['split_ratio']
            self.target_ntype = info['target_ntype']
            self.num_classes = info['num_classes']
            self.meta_paths_dict = info['meta_paths_dict']
            self.meta_paths = info['meta_paths']
            self.multi_label = info['multi_label']
            self.label_feat_name = info['label_feat_name']
            self.prediction_ratio = info['prediction_ratio']
            self.label_mask_feat_name = info['label_mask_feat_name']
        gs, _ = utils.load_graphs(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
        self.g = gs[0]
        self._set_split_index(self.target_ntype)

    def save(self):
        """Persist the processed graph and metadata to the cache directory."""
        utils.save_graphs(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)), [self.g])
        with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'w') as f:
            json.dump({
                'split_ratio': self.split_ratio,
                'target_ntype': self.target_ntype,
                'num_classes': self.num_classes,
                'multi_label': self.multi_label,
                'meta_paths_dict': self.meta_paths_dict,
                'meta_paths': self.meta_paths,
                'label_feat_name': self.label_feat_name,
                'prediction_ratio': self.prediction_ratio,
                'label_mask_feat_name': self.label_mask_feat_name}, f)

    def __getitem__(self, idx):
        return self.g

    def __len__(self):
        return 1

    def _set_split_index(self, ntype):
        """Add train_idx/val_idx/test_idx/pred_idx as dataset attributes according
        to the corresponding masks on nodes of type ``ntype``."""
        ndata = self.g.nodes[ntype].data
        self.train_idx = None
        self.val_idx = None
        self.test_idx = None
        self.pred_idx = None
        if 'train_mask' in ndata:
            self.train_idx = F.nonzero_1d(ndata['train_mask'])
        if 'val_mask' in ndata:
            self.val_idx = F.nonzero_1d(ndata['val_mask'])
        if 'test_mask' in ndata:
            self.test_idx = F.nonzero_1d(ndata['test_mask'])
        if 'pred_mask' in ndata:
            self.pred_idx = F.nonzero_1d(ndata['pred_mask'])
        elif self.label_mask_feat_name is not None:
            # No explicit prediction mask: default to every unlabeled node.
            if self.verbose:
                print('No prediction mask exists, will predict all missing labels.')
            self.pred_idx = torch.where(ndata[self.label_mask_feat_name] == 0)[0]

    def get_split(self, *args, **kwargs):
        """Return the (train_idx, val_idx, test_idx) triple."""
        return self.train_idx, self.val_idx, self.test_idx

    def get_labels(self):
        """Return the label tensor of the target node type."""
        return self.g.nodes[self.target_ntype].data[self.label_feat_name]

    @property
    def category(self):
        # Alias kept for callers that expect the DGL "category" naming.
        return self.target_ntype
class AsLinkPredictionDataset(DGLDataset):
    """Repurpose a dataset for a link prediction task.

    The created dataset will include data needed for link prediction.
    It will keep only the first graph in the provided dataset and
    generate train/val/test edges according to the given split ratio,
    and the corresponding negative edges based on ``neg_ratio``. The generated
    edges will be cached to disk for fast re-loading. If the provided split ratio
    differs from the cached one, it will re-process the dataset properly.

    Parameters
    ----------
    dataset : DGLDataset
        The dataset to be converted.
    target_link : list[tuple[str, str, str]]
        The edge types on which predictions are made.
    target_link_r : list[tuple[str, str, str]], optional
        The reverse edge types of the target links. Used to remove reverse edges
        of val/test edges from the train graph.
    split_ratio : (float, float, float), optional
        Split ratios for training, validation and test sets. Must sum to one.
    neg_ratio : int, optional
        How many negative samples to draw per positive edge. The number of
        negative samples will be equal to or less than neg_ratio * num_positive_edges.
    neg_sampler : str, optional
        How negative edges of val/test edges are sampled: 'global' or 'per_source'.

    Attributes
    ----------
    train_graph : DGLHeteroGraph
        The DGLHeteroGraph for training.
    pos_val_graph : DGLHeteroGraph
        The DGLHeteroGraph containing positive validation edges.
    pos_test_graph : DGLHeteroGraph
        The DGLHeteroGraph containing positive test edges.
    neg_val_graph : DGLHeteroGraph
        The DGLHeteroGraph containing negative validation edges.
    neg_test_graph : DGLHeteroGraph
        The DGLHeteroGraph containing negative test edges.
    """

    def __init__(self,
                 dataset,
                 target_link,
                 target_link_r,
                 split_ratio=None,
                 neg_ratio=3,
                 neg_sampler='global',
                 **kwargs):
        self.g = dataset[0]
        self.num_nodes = self.g.num_nodes()
        self.dataset = dataset
        self.split_ratio = split_ratio
        self.target_link = target_link
        self.target_link_r = target_link_r
        self.neg_ratio = neg_ratio
        self.neg_sampler = neg_sampler
        super().__init__(dataset.name + '-as-linkpred', hash_key=(
            neg_ratio, target_link, target_link_r, split_ratio, neg_sampler, dataset.name, 'linkpred'), **kwargs)

    def process(self):
        """Generate edge split masks, build val/test positive/negative graphs
        and the train graph (with val/test edges and their reverses removed)."""
        if self.split_ratio is None:
            # Without a split ratio the original graph must already carry masks.
            for etype in self.target_link:
                for mask in ['train_mask', 'val_mask', 'test_mask']:
                    assert mask in self.g.edges[etype].data, \
                        "{} is not provided for edge type {}, please specify split_ratio to generate the masks".format(
                            mask, etype)
        else:
            ratio = self.split_ratio
            for etype in self.target_link:
                n = self.g.num_edges(etype)
                n_train, n_val, n_test = int(n * ratio[0]), int(n * ratio[1]), int(n * ratio[2])
                idx = np.random.permutation(n)
                train_idx = idx[:n_train]
                val_idx = idx[n_train:n_train + n_val]
                test_idx = idx[n_train + n_val:]
                train_mask = th.zeros(n).bool()
                train_mask[train_idx] = True
                val_mask = th.zeros(n).bool()
                val_mask[val_idx] = True
                test_mask = th.zeros(n).bool()
                test_mask[test_idx] = True
                self.g.edges[etype].data['train_mask'] = train_mask
                self.g.edges[etype].data['val_mask'] = val_mask
                self.g.edges[etype].data['test_mask'] = test_mask

        # Create val and test graphs (positive and negative respectively).
        self.pos_val_graph, self.neg_val_graph = self._get_pos_and_neg_graph('val')
        self.pos_test_graph, self.neg_test_graph = self._get_pos_and_neg_graph('test')

        self.pred_edges = getattr(self.dataset, 'pred_edges', None)
        if self.pred_edges is not None:
            self.pred_graph = dgl.heterograph(self.pred_edges,
                                              {ntype: self.g.num_nodes(ntype) for ntype in self.g.ntypes})

        # Create the train graph.
        train_graph = self.g
        for i, etype in enumerate(self.target_link):
            # Remove val and test edges (their original IDs were stored by
            # edge_subgraph under dgl.EID).
            train_graph = dgl.remove_edges(train_graph,
                                           th.cat((self.pos_val_graph.edges[etype].data[dgl.EID],
                                                   self.pos_test_graph.edges[etype].data[dgl.EID])),
                                           etype)
            if self.target_link_r is not None:
                # Drop ALL reverse edges, then re-add only the reverses of the
                # remaining train edges, so no val/test reverse edge leaks.
                reverse_etype = self.target_link_r[i]
                train_graph = dgl.remove_edges(train_graph, th.arange(train_graph.num_edges(reverse_etype)),
                                               reverse_etype)
                edges = train_graph.edges(etype=etype)
                train_graph = dgl.add_edges(train_graph, edges[1], edges[0], etype=reverse_etype)
        self.train_graph = train_graph
        self.meta_paths = getattr(self.dataset, 'meta_paths', None)
        self.meta_paths_dict = getattr(self.dataset, 'meta_paths_dict', None)

    def _get_pos_and_neg_graph(self, split):
        """Return (positive, negative) graphs for ``split`` ('val' or 'test')."""
        if self.neg_sampler == 'global':
            neg_sampler = GlobalUniform(self.neg_ratio)
        elif self.neg_sampler == 'per_source':
            neg_sampler = PerSourceUniform(self.neg_ratio)
        else:
            raise ValueError('Unsupported neg_sampler')
        edges = {
            # view(-1) instead of squeeze(): squeeze() would collapse a
            # single-edge split to a 0-d tensor and break edge_subgraph.
            etype: th.nonzero(self.g.edges[etype].data['{}_mask'.format(split)]).view(-1)
            for etype in self.target_link}
        pos_graph = dgl.edge_subgraph(self.g, edges, relabel_nodes=False, store_ids=True)
        # Only run the (possibly expensive) negative sampler when the dataset
        # does not already provide precomputed negative edges; the original
        # getattr(..., default) form evaluated the sampler unconditionally.
        neg_edges = getattr(self.dataset, 'neg_{}_edges'.format(split), None)
        if neg_edges is None:
            neg_edges = neg_sampler(self.g, edges)
        neg_graph = dgl.heterograph(neg_edges, {ntype: pos_graph.num_nodes(ntype) for ntype in pos_graph.ntypes})
        return pos_graph, neg_graph

    def has_cache(self):
        """Return True when a graph cache for the current hash exists on disk."""
        return os.path.isfile(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))

    def load(self):
        """Restore the five split graphs and metadata from the cache directory."""
        gs, _ = utils.load_graphs(
            os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)))
        self.train_graph, self.pos_val_graph, self.pos_test_graph, self.neg_val_graph, self.neg_test_graph = \
            gs[0], gs[1], gs[2], gs[3], gs[4]
        with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'r') as f:
            info = json.load(f)
            self.split_ratio = info["split_ratio"]
            self.neg_ratio = info["neg_ratio"]
            self.target_link = info["target_link"]
            self.target_link_r = info["target_link_r"]
            self.neg_sampler = info["neg_sampler"]
            self.meta_paths_dict = info["meta_paths_dict"]
            self.meta_paths = info["meta_paths"]

    def save(self):
        """Persist the five split graphs and metadata to the cache directory."""
        utils.save_graphs(os.path.join(self.save_path, 'graph_{}.bin'.format(self.hash)),
                          [self.train_graph, self.pos_val_graph, self.pos_test_graph, self.neg_val_graph,
                           self.neg_test_graph])
        with open(os.path.join(self.save_path, 'info_{}.json'.format(self.hash)), 'w') as f:
            json.dump({
                'split_ratio': self.split_ratio,
                'neg_ratio': self.neg_ratio,
                'target_link': self.target_link,
                'target_link_r': self.target_link_r,
                'neg_sampler': self.neg_sampler,
                'meta_paths_dict': self.meta_paths_dict,
                'meta_paths': self.meta_paths,
            }, f)

    def get_split(self, *args, **kwargs):
        """Return (train_graph, pos_val_graph, pos_test_graph, neg_val_graph, neg_test_graph)."""
        return self.train_graph, self.pos_val_graph, self.pos_test_graph, self.neg_val_graph, self.neg_test_graph

    def __getitem__(self, idx):
        return self.g

    def __len__(self):
        return 1