-
Notifications
You must be signed in to change notification settings - Fork 13
/
dataset.py
executable file
·122 lines (99 loc) · 4.11 KB
/
dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python3
import torch
import numpy as np
import time
from config import *
from scipy.sparse import *
class TCGNN_dataset(torch.nn.Module):
    """
    Synthetic graph dataset built from an edge-list file.

    Only the graph structure is read from disk.  Node features are drawn
    from a standard normal distribution and every node label is ``1``.

    Attributes set by the constructor:
        num_nodes, num_edges: graph sizes (Python ints).
        num_features, num_classes: copies of ``dim`` / ``num_class``.
        edge_index: NumPy array stacking the source row on the target row.
        column_index, row_pointers: CSR arrays as ``torch.IntTensor``.
        degrees: ``sqrt(func(degree))`` per node, on ``device``.
        x, y: node features / labels, on ``device``.
        train_mask, val_mask, test_mask: boolean node masks, on ``device``.
        avg_degree: ``num_edges / num_nodes``.
        avg_edgeSpan: mean of ``|src - dst|`` over all edges.
    """

    def __init__(self, path, dim, num_class, load_from_txt=True, verbose=False,
                 device="cuda"):
        """
        Load the graph at ``path`` and generate features, labels and masks.

        Args:
            path: edge-list file; whitespace-separated ``src dst`` text when
                ``load_from_txt`` is true, otherwise a ``.npz`` archive.
            dim: number of feature columns generated per node.
            num_class: value stored in ``num_classes``.
            load_from_txt: select the text loader (True) or npz loader.
            verbose: print loading / CSR-build timings and graph statistics.
            device: where the generated tensors are placed.  The default
                ``"cuda"`` reproduces the previous hard-coded ``.cuda()``.

        Raises:
            ValueError: non-``.npz`` path in npz mode, a malformed text
                line, or a text file without any edge.
        """
        super().__init__()

        self.nodes = set()
        self.load_from_txt = load_from_txt
        self.num_nodes = 0
        self.num_features = dim
        self.num_classes = num_class
        self.edge_index = None

        self.reorder_flag = False
        self.verbose_flag = verbose
        self.device = torch.device(device)

        self.avg_degree = -1
        self.avg_edgeSpan = -1

        self.init_edges(path)
        self.init_embedding(dim)
        self.init_labels(num_class)

        # NOTE(review): every mask marks a *prefix* of the node ids, so the
        # three splits overlap (train covers all nodes).  Kept as-is because
        # callers may rely on it; confirm whether disjoint splits are wanted.
        train = 1
        val = 0.3
        test = 0.1
        self.train_mask = self._prefix_mask(train)
        self.val_mask = self._prefix_mask(val)
        self.test_mask = self._prefix_mask(test)

    def _prefix_mask(self, fraction):
        """Return a bool mask whose first ``int(num_nodes * fraction)`` entries are True."""
        selected = int(self.num_nodes * fraction)
        mask = torch.zeros(self.num_nodes, dtype=torch.bool)
        mask[:selected] = True
        return mask.to(self.device)

    def _load_txt_edges(self, path):
        """Parse a ``src dst`` text file; sets ``nodes``, ``num_edges``, ``num_nodes``."""
        src_li = []
        dst_li = []
        # The context manager closes the file even if a line fails to parse.
        with open(path, "r") as fp:
            for line in fp:
                fields = line.split()
                if not fields:
                    # Tolerate blank lines such as a trailing newline at EOF.
                    continue
                src, dst = fields
                src, dst = int(src), int(dst)
                src_li.append(src)
                dst_li.append(dst)
                self.nodes.add(src)
                self.nodes.add(dst)

        if not src_li:
            raise ValueError("graph file contains no edges: {}".format(path))

        self.num_edges = len(src_li)
        # Node ids are used directly as indices, so the largest id fixes the size.
        self.num_nodes = max(self.nodes) + 1
        return src_li, dst_li

    def _load_npz_edges(self, path):
        """Read ``src_li``/``dst_li``/``num_nodes`` from a ``.npz`` archive."""
        if not path.endswith('.npz'):
            raise ValueError("graph file must be a .npz file")

        # Entries are read while the archive is open; the context manager
        # then releases the underlying file handle.
        with np.load(path) as graph_obj:
            src_li = graph_obj['src_li']
            dst_li = graph_obj['dst_li']
            # Plain int, matching the type produced by the txt loader.
            self.num_nodes = int(graph_obj['num_nodes'])

        self.num_edges = len(src_li)
        return src_li, dst_li

    def init_edges(self, path):
        """
        Load the edge list, compute graph statistics and build the CSR arrays.

        Sets ``num_nodes``, ``num_edges``, ``edge_index``, ``avg_degree``,
        ``avg_edgeSpan``, ``column_index``, ``row_pointers`` and ``degrees``.
        Called from __init__.
        """
        start = time.perf_counter()
        if self.load_from_txt:
            # loading from a txt graph file
            src_li, dst_li = self._load_txt_edges(path)
            report = "# Loading (txt) {:.3f}s "
        else:
            # loading from a .npz graph file
            src_li, dst_li = self._load_npz_edges(path)
            report = "# Loading (npz)(s): {:.3f}"
        self.edge_index = np.stack([src_li, dst_li])
        dur = time.perf_counter() - start
        if self.verbose_flag:
            print(report.format(dur))

        self.avg_degree = self.num_edges / self.num_nodes
        self.avg_edgeSpan = np.mean(np.abs(np.subtract(src_li, dst_li)))

        if self.verbose_flag:
            print('# nodes: {}'.format(self.num_nodes))
            print("# avg_degree: {:.2f}".format(self.avg_degree))
            print("# avg_edgeSpan: {}".format(int(self.avg_edgeSpan)))

        # Build graph CSR.  The stored values are placeholders; only the
        # index arrays of the resulting matrix are used below.
        val = np.ones(self.num_edges, dtype=np.int64)
        start = time.perf_counter()
        scipy_coo = coo_matrix((val, self.edge_index),
                               shape=(self.num_nodes, self.num_nodes))
        scipy_csr = scipy_coo.tocsr()
        build_csr = time.perf_counter() - start

        if self.verbose_flag:
            print("# Build CSR (s): {:.3f}".format(build_csr))

        self.column_index = torch.IntTensor(scipy_csr.indices)
        self.row_pointers = torch.IntTensor(scipy_csr.indptr)

        # Get degrees array: consecutive row-pointer differences give the
        # number of stored entries per row.
        degrees = (self.row_pointers[1:] - self.row_pointers[:-1]).tolist()
        # `func` is not defined in this file (presumably it comes from the
        # `config` star-import); it is applied to each degree before the sqrt.
        adjusted = [func(degree) for degree in degrees]
        self.degrees = torch.sqrt(torch.FloatTensor(adjusted)).to(self.device)

    def init_embedding(self, dim):
        '''
        Generate node embedding for nodes.
        Called from __init__.

        Stores a ``num_nodes x dim`` standard-normal tensor in ``self.x``.
        '''
        # Sampled first and moved afterwards, as the original `.cuda()` did,
        # so seeded runs keep drawing from the same generator.
        self.x = torch.randn(self.num_nodes, dim).to(self.device)

    def init_labels(self, num_class):
        '''
        Generate the node label.
        Called from __init__.

        Every node gets label 1; ``num_class`` is accepted but not used here.
        '''
        self.y = torch.ones(self.num_nodes).long().to(self.device)