utils_data_prep

gds/datasets/gsn/utils_data_prep.py
import os
import csv
import pickle
from collections import namedtuple
import networkx as nx
import numpy as np
import random
import torch
from sklearn.model_selection import StratifiedKFold
from ogb.graphproppred import PygGraphPropPredDataset
from torch_geometric.utils import to_undirected

class S2VGraph(object):
    def __init__(self, g, label, node_tags=None, node_features=None):
        '''
            Code obtained from here: https://github.com/weihua916/powerful-gnns
            
            g: a networkx graph
            label: an integer graph label
            node_tags: a list of integer node tags
            node_features: a torch float tensor, one-hot representation of the tag that is used as input to neural nets
            edge_mat: a torch long tensor, contain edge list, will be used to create torch sparse tensor
            neighbors: list of neighbors (without self-loop)
        '''
        self.label = label
        self.g = g
        self.node_tags = node_tags
        self.neighbors = []
        self.node_features = 0
        self.edge_mat = 0

        self.max_neighbor = 0


def load_data(path, name, degree_as_tag):
    '''
        Code obtained from here: https://github.com/weihua916/powerful-gnns

        path: path to the directory containing the <name>.txt file
        name: name of the dataset
        degree_as_tag: if True, node degrees are used as node tags (and hence as the one-hot input features)
    '''

    print('loading data')
    g_list = []
    label_dict = {}
    feat_dict = {}

    with open('%s/%s.txt' % (path, name), 'r') as f:
        n_g = int(f.readline().strip())  # first line: number of graphs
        for i in range(n_g):
            # each graph starts with a header line: <num_nodes> <graph label>
            row = f.readline().strip().split()
            n, l = [int(w) for w in row]
            if l not in label_dict:
                mapped = len(label_dict)
                label_dict[l] = mapped
            g = nx.Graph()
            node_tags = []
            node_features = []
            n_edges = 0
            for j in range(n):
                g.add_node(j)
                # each node line is: <tag> <num_neighbours> <neighbour ids ...> [<float attributes ...>]
                row = f.readline().strip().split()
                tmp = int(row[1]) + 2
                if tmp == len(row):
                    # no node attributes
                    row = [int(w) for w in row]
                    attr = None
                else:
                    row, attr = [int(w) for w in row[:tmp]], np.array([float(w) for w in row[tmp:]])
                if row[0] not in feat_dict:
                    mapped = len(feat_dict)
                    feat_dict[row[0]] = mapped
                node_tags.append(feat_dict[row[0]])

                # note: continuous node attributes (attr), when present, are collected here but are not
                # attached to the S2VGraph below; node features are rebuilt from the tags further down
                if tmp > len(row):
                    node_features.append(attr)

                n_edges += row[1]
                for k in range(2, len(row)):
                    g.add_edge(j, row[k])

            if node_features != []:
                node_features = np.stack(node_features)
                node_feature_flag = True
            else:
                node_features = None
                node_feature_flag = False

            assert len(g) == n

            g_list.append(S2VGraph(g, l, node_tags))

    # add labels and edge_mat
    for g in g_list:
        g.neighbors = [[] for i in range(len(g.g))]
        for i, j in g.g.edges():
            g.neighbors[i].append(j)
            g.neighbors[j].append(i)
        degree_list = [len(neigh) for neigh in g.neighbors]
        g.max_neighbor = max(degree_list)

        g.label = label_dict[g.label]

        edges = [list(pair) for pair in g.g.edges()]
        edges.extend([[i, j] for j, i in edges])

        g.edge_mat = torch.LongTensor(edges).transpose(0, 1)

    if degree_as_tag:
        for g in g_list:
            g.node_tags = list(dict(g.g.degree).values())

    # extract the set of unique node tags
    tagset = set()
    for g in g_list:
        tagset = tagset.union(set(g.node_tags))

    tagset = list(tagset)
    tag2index = {tagset[i]:i for i in range(len(tagset))}

    for g in g_list:
        g.node_features = torch.zeros(len(g.node_tags), len(tagset))
        g.node_features[range(len(g.node_tags)), [tag2index[tag] for tag in g.node_tags]] = 1


    print('# classes: %d' % len(label_dict))
    print('# maximum node tag: %d' % len(tagset))

    print("# data: %d" % len(g_list))

    return g_list, len(label_dict)
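
# Example usage of load_data (a hedged sketch, not part of the original module; the './datasets/MUTAG'
# directory and the 'MUTAG' name are placeholders -- any dataset stored as '<path>/<name>.txt' in the
# text format parsed above works):
#
#   graphs, num_classes = load_data('./datasets/MUTAG', 'MUTAG', degree_as_tag=False)
#   print(num_classes, graphs[0].node_features.shape, graphs[0].edge_mat.shape)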


def load_zinc_data(path, name, degree_as_tag, num_atom_type=28, num_bond_type=4):

    ### splits and preprocessing according to https://github.com/graphdeeplearning/benchmarking-gnns/blob/master/main_molecules_graph_regression.py
    
    assert name.upper() == 'ZINC'
    Graph = namedtuple('Graph', ['node_features', 'edge_mat', 'edge_features', 'label'])
    
    def _prepare(molecule):
    
        node_features = molecule['atom_type'].long()
        
        adj = molecule['bond_type']
        edge_list = (adj != 0).nonzero()  # converting adj matrix to edge_list
        edge_idxs_in_adj = edge_list.split(1, dim=1)
        edge_features = adj[edge_idxs_in_adj].reshape(-1).long()
        
        label = molecule['logP_SA_cycle_normalized']
        graph = Graph(node_features, edge_list.permute(1, 0), edge_features, label)

        return graph

    data = list()
    for split_name in ['train', 'val', 'test']:
        with open(os.path.join(path,'molecules','{}.pickle'.format(split_name)), "rb") as f:
            split_data = pickle.load(f)
        
        # loading the sampled indices from file ./zinc_molecules/<split>.index
        with open(os.path.join(path, 'indices', '{}.index'.format(split_name)), "r") as f:
            data_idx = [list(map(int, idx)) for idx in csv.reader(f)]
        
        split_data = [ split_data[i] for i in data_idx[0] ]
            
        for molecule in split_data:
            data.append(_prepare(molecule))

    return data, 1, num_atom_type, num_bond_type
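
# Example usage of load_zinc_data (a hedged sketch; assumes 'path' contains the 'molecules/*.pickle'
# and 'indices/*.index' files from the benchmarking-gnns ZINC release -- the './datasets/ZINC' location
# below is illustrative only):
#
#   graphs, num_tasks, num_atom_type, num_bond_type = load_zinc_data('./datasets/ZINC', 'ZINC', degree_as_tag=False)
#   print(len(graphs), num_tasks, num_atom_type, num_bond_type)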


def load_ogb_data(path, name, degree_as_tag):

    ### splits and preprocessing according to https://github.com/snap-stanford/ogb
        
    def add_zeros(data):
        # ogbg-ppa graphs come without input node features; assign a constant zero feature to every node
        data.x = torch.zeros(data.num_nodes, dtype=torch.long)
        return data
 
    if name == 'ogbg-ppa':
        transform = add_zeros
        print('Applying transform {} to dataset {}.'.format(transform, name))
        dataset = PygGraphPropPredDataset(name=name, root=path, transform=transform)
    else:
        dataset = PygGraphPropPredDataset(name=name, root=path)
    Graph = namedtuple('Graph', ['node_features', 'edge_mat', 'edge_features', 'label'])
    graph_list = list()
    for datum in dataset:
        graph = Graph(datum.x, datum.edge_index, datum.edge_attr, datum.y)
        graph_list.append(graph)
    # ogbg-ppa is a single multi-class task; for the molecule datasets the number of (binary) tasks is used instead
    num_classes = dataset.num_classes if name == 'ogbg-ppa' else dataset.num_tasks
    return graph_list, num_classes
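
# Example usage of load_ogb_data (a hedged sketch; the root path is illustrative -- ogb downloads the
# dataset into it on first use):
#
#   graphs, num_classes = load_ogb_data('./datasets/ogb', 'ogbg-molhiv', degree_as_tag=False)
#   print(len(graphs), num_classes)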


def load_g6_graphs(path, name):

    ### code used to load SR graphs obtained from here: http://users.cecs.anu.edu.au/~bdm/data/graphs.html
    ### we don't split the data, because no training is performed (the network is used with random weights for the SR experiment)

    dataset = nx.read_graph6(os.path.join(path, name+'.g6'))
    Graph = namedtuple('Graph', ['node_features', 'edge_mat','label'])
    graph_list = list()
    for i, datum in enumerate(dataset):
        x = torch.ones(datum.number_of_nodes(), 1)  # constant node features
        edge_index = to_undirected(torch.tensor(list(datum.edges())).transpose(1, 0))
        # each graph gets its own label: the SR experiment only checks whether non-isomorphic graphs
        # receive distinguishable embeddings
        graph = Graph(x, edge_index, torch.tensor(i).long())
        graph_list.append(graph)
    num_classes = len(dataset)
    
    return graph_list, num_classes
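
# Example usage of load_g6_graphs (a hedged sketch; 'sr251256' is one of the strongly regular graph
# families from the collection linked above, so '<path>/sr251256.g6' is assumed to exist):
#
#   graphs, num_graphs = load_g6_graphs('./datasets/sr25', 'sr251256')
#   print(num_graphs, graphs[0].edge_mat.shape)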


def separate_data(graph_list, seed, fold_idx):
    
    ### Code obtained from here: https://github.com/weihua916/powerful-gnns
    
    assert 0 <= fold_idx < 10, "fold_idx must be from 0 to 9."
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

    if hasattr(graph_list[0], 'label'):
        labels = [graph.label for graph in graph_list]
    elif hasattr(graph_list[0], 'y'):
        labels = [graph.y for graph in graph_list]
    else:
        raise NotImplementedError
        
    idx_list = []
    for idx in skf.split(np.zeros(len(labels)), labels):
        idx_list.append(idx)
    train_idx, test_idx = idx_list[fold_idx]

    train_graph_list = [graph_list[i] for i in train_idx]
    test_graph_list = [graph_list[i] for i in test_idx]

    return train_graph_list, test_graph_list
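
# Example usage of separate_data (a hedged sketch; continues the load_data example above, taking the
# first of the 10 stratified folds):
#
#   train_graphs, test_graphs = separate_data(graphs, seed=0, fold_idx=0)
#   print(len(train_graphs), len(test_graphs))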

def separate_data_given_split(graph_list, path, fold_idx):
    
    ### Splits data based on pre-computed splits
    
    assert -1 <= fold_idx < 10, "Parameter fold_idx must be from -1 to 9, with -1 referring to the special model selection split."

    # split files are named with a 1-based index; fold_idx = -1 maps to file index 0 (the model selection split)
    train_filename = os.path.join(path, '10fold_idx', 'train_idx-{}.txt'.format(fold_idx+1))
    test_filename = os.path.join(path, '10fold_idx', 'test_idx-{}.txt'.format(fold_idx+1))
    val_filename = os.path.join(path, '10fold_idx', 'val_idx-{}.txt'.format(fold_idx+1))
    train_idx = np.loadtxt(train_filename, dtype=int)
    test_idx = np.loadtxt(test_filename, dtype=int)
        
    train_graph_list = [graph_list[i] for i in train_idx]
    test_graph_list = [graph_list[i] for i in test_idx]
    val_graph_list = None
    
    if os.path.exists(val_filename):
        val_idx = np.loadtxt(val_filename, dtype=int)
        val_graph_list = [graph_list[i] for i in val_idx]

    return train_graph_list, test_graph_list, val_graph_list
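
# Example usage of separate_data_given_split (a hedged sketch; assumes precomputed
# '10fold_idx/{train,test,val}_idx-<k>.txt' files under 'path', as expected by the function above):
#
#   train_graphs, test_graphs, val_graphs = separate_data_given_split(graphs, './datasets/MUTAG', fold_idx=0)
#   print(len(train_graphs), len(test_graphs), len(val_graphs) if val_graphs is not None else 0)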