1 import os
-
F401
'.utils_graph_processing.subgraph_isomorphism_edge_counts' imported but unused
-
F401
'.utils_graph_processing.subgraph_isomorphism_vertex_counts' imported but unused
-
F401
'.utils_graph_processing.induced_edge_automorphism_orbits' imported but unused
-
F401
'.utils_graph_processing.edge_automorphism_orbits' imported but unused
-
F401
'.utils_graph_processing.automorphism_orbits' imported but unused
-
E501
Line too long (107 > 79 characters)
2 from .utils_graph_processing import subgraph_isomorphism_edge_counts, subgraph_isomorphism_vertex_counts, \
-
E501
Line too long (83 > 79 characters)
3 induced_edge_automorphism_orbits, edge_automorphism_orbits, automorphism_orbits
-
F401
'.utils_ids.subgraph_counts2ids' imported but unused
4 from .utils_ids import subgraph_counts2ids
5 from .utils_data_gen import generate_dataset
-
F401
'.utils_graph_learning.multi_class_accuracy' imported but unused
6 from .utils_graph_learning import multi_class_accuracy
7 import torch
-
F401
'torch.nn' imported but unused
8 import torch.nn as nn
-
F401
'numpy as np' imported but unused
9 import numpy as np
10 from torch_geometric.data import Data
11 import networkx as nx
-
F811
Redefinition of unused 'Data' from line 10
12 from torch_geometric.data import Data
13 import glob
14 import re
15 import types
16
17
def get_custom_edge_list(ks, substructure_type=None, filename=None):
    '''
    Instantiates a list of `edge_list`s representing substructures
    of type `substructure_type` with sizes specified by `ks`.

    Args:
        ks: iterable of substructure sizes.
        substructure_type: name of a networkx generator attribute
            (e.g. 'cycle_graph'); mutually exclusive with `filename`.
        filename: directory containing graph6 files named
            'graph{k}c.g6', read when `substructure_type` is None.

    Returns:
        A list of edge lists, one per substructure.

    Raises:
        ValueError: if both `substructure_type` and `filename` are None.
    '''
    if substructure_type is None and filename is None:
        raise ValueError('You must specify either a type or a filename where to read substructures from.')
    edge_lists = []
    for k in ks:
        if substructure_type is not None:
            graphs_nx = getattr(nx, substructure_type)(k)
        else:
            graphs_nx = nx.read_graph6(
                os.path.join(filename, 'graph{}c.g6'.format(k)))
        # A generator/list holds several graphs for the same size k;
        # a single graph object is appended directly.
        if isinstance(graphs_nx, (list, types.GeneratorType)):
            edge_lists += [list(graph_nx.edges) for graph_nx in graphs_nx]
        else:
            edge_lists.append(list(graphs_nx.edges))

    return edge_lists
37
38
def prepare_dataset(path,
                    dataset,
                    name,
                    id_scope,
                    id_type,
                    k,
                    extract_ids_fn,
                    count_fn,
                    automorphism_fn,
                    multiprocessing,
                    num_processes,
                    **subgraph_params):
    '''
    Returns a dataset annotated with substructure counts, loading it from
    a cached .pt file when possible and generating (then caching) it
    otherwise.

    Args:
        path: root folder of the dataset on disk.
        dataset: dataset family; one of 'bioinformatics', 'social',
            'chemical', 'ogb', 'SR_graphs', 'MNIST'.
        name: dataset name, forwarded to `generate_dataset`.
        id_scope: orbit scope, e.g. 'local' or 'global'.
        id_type: substructure family (e.g. 'cycle_graph') or 'custom'.
        k: maximum substructure size.
        extract_ids_fn, count_fn, automorphism_fn: callables forwarded
            to `generate_dataset`.
        multiprocessing, num_processes: parallelism options forwarded to
            `generate_dataset`.  NOTE(review): `multiprocessing` shadows
            the stdlib module name; kept for interface compatibility.
        **subgraph_params: extra generation options; 'induced' and
            'directed_orbits' are also read here to pick the cache file.

    Returns:
        (graphs_ptg, num_classes, orbit_partition_sizes)

    Raises:
        NotImplementedError: if `dataset` is not a supported family.
    '''
    if dataset not in ['bioinformatics', 'social', 'chemical', 'ogb',
                       'SR_graphs', 'MNIST']:
        raise NotImplementedError(
            "Dataset family {} is not currently supported.".format(dataset))

    data_folder = os.path.join(path, 'processed', id_scope)
    if not os.path.exists(data_folder):
        os.makedirs(data_folder)

    if id_type != 'custom':
        # The cache filename encodes every option that affects the counts.
        if subgraph_params['induced']:
            if subgraph_params['directed_orbits'] and id_scope == 'local':
                data_file = os.path.join(
                    data_folder,
                    '{}_induced_directed_orbits_{}.pt'.format(id_type, k))
            else:
                data_file = os.path.join(
                    data_folder, '{}_induced_{}.pt'.format(id_type, k))
        else:
            if subgraph_params['directed_orbits'] and id_scope == 'local':
                data_file = os.path.join(
                    data_folder,
                    '{}_directed_orbits_{}.pt'.format(id_type, k))
            else:
                data_file = os.path.join(
                    data_folder, '{}_{}.pt'.format(id_type, k))
        maybe_load = True
    else:
        data_file = None  # we don't save custom substructure counts
        maybe_load = False

    # BUGFIX: `loaded` was previously assigned only on some paths (the
    # 'custom' branch, a successful load, or a successful downgrade); when
    # the cache file was missing and `id_type` was not downgradable,
    # `if not loaded:` below raised NameError.  Initialise it once here.
    loaded = False

    # try to load, possibly downgrading
    if maybe_load:

        if os.path.exists(data_file):  # load
            graphs_ptg, num_classes, orbit_partition_sizes = \
                load_dataset(data_file)
            loaded = True

        else:
            # Try downgrading.  Currently works only when for each k there
            # is only one substructure in the family.
            if id_type in ['cycle_graph',
                           'path_graph',
                           'complete_graph',
                           'binomial_tree',
                           'star_graph']:
                k_min = 2 if id_type == 'star_graph' else 3
                succeded, graphs_ptg, num_classes, orbit_partition_sizes = \
                    try_downgrading(data_folder,
                                    id_type,
                                    subgraph_params['induced'],
                                    subgraph_params['directed_orbits']
                                    and id_scope == 'local',
                                    k, k_min)
                if succeded:  # save the dataset
                    print("Saving dataset to {}".format(data_file))
                    torch.save(
                        (graphs_ptg, num_classes, orbit_partition_sizes),
                        data_file)
                    loaded = True

    if not loaded:

        graphs_ptg, num_classes, num_node_type, num_edge_type, \
            orbit_partition_sizes = generate_dataset(path,
                                                     name,
                                                     k,
                                                     extract_ids_fn,
                                                     count_fn,
                                                     automorphism_fn,
                                                     id_type,
                                                     multiprocessing,
                                                     num_processes,
                                                     **subgraph_params)
        if data_file is not None:
            print("Saving dataset to {}".format(data_file))
            torch.save((graphs_ptg, num_classes, orbit_partition_sizes),
                       data_file)

        if num_node_type is not None:
            torch.save((num_node_type, num_edge_type),
                       os.path.join(path, 'processed',
                                    'num_feature_types.pt'))

    return graphs_ptg, num_classes, orbit_partition_sizes
120
121
def load_dataset(data_file):
    '''
    Loads dataset from `data_file`.

    Returns the stored (graphs_ptg, num_classes, orbit_partition_sizes).
    '''
    print("Loading dataset from {}".format(data_file))
    stored = torch.load(data_file)
    # The file holds a 3-tuple saved by prepare_dataset.
    return stored[0], stored[1], stored[2]
133
134
def try_downgrading(data_folder, id_type, induced, directed_orbits, k, k_min):
    '''
    Extracts the substructures of size up to the `k`, if a collection of
    substructures with size larger than k has already been computed.

    Returns a 4-tuple (success, graphs_ptg, num_classes,
    orbit_partition_sizes); the last three are None on failure.
    '''
    found_data_filename, k_found = find_id_filename(
        data_folder, id_type, induced, directed_orbits, k)
    if found_data_filename is None:
        return False, None, None, None
    graphs_ptg, num_classes, orbit_partition_sizes = load_dataset(
        found_data_filename)
    print("Downgrading k from dataset {}...".format(found_data_filename))
    graphs_ptg, orbit_partition_sizes = downgrade_k(
        graphs_ptg, k, orbit_partition_sizes, k_min)
    return True, graphs_ptg, num_classes, orbit_partition_sizes
148
149
def find_id_filename(data_folder, id_type, induced, directed_orbits, k):
    '''
    Looks for existing precomputed datasets in `data_folder` with counts
    for substructure `id_type` of size at least `k`.

    Returns (filename, k_found) for the first match, or (None, None).
    '''
    # Pick the filename template matching the counting options.
    if induced and directed_orbits:
        template = '{}_induced_directed_orbits_[0-9]*.pt'
    elif induced:
        template = '{}_induced_[0-9]*.pt'
    elif directed_orbits:
        template = '{}_directed_orbits_[0-9]*.pt'
    else:
        template = '{}_[0-9]*.pt'
    pattern = os.path.join(data_folder, template.format(id_type))
    for candidate in glob.glob(pattern):
        # The size k is the last run of digits in the filename.
        k_found = int(re.findall(r'\d+', candidate)[-1])
        if k_found >= k:
            return candidate, k_found
    return None, None
171
-
E302
Expected 2 blank lines, found 1
def downgrade_k(dataset, k, orbit_partition_sizes, k_min):
    '''
    Donwgrades `dataset` by keeping only the orbits of the requested
    substructures (those of size up to `k`).
    '''
    kept_sizes = orbit_partition_sizes[0:k - k_min + 1]
    feature_vector_size = sum(kept_sizes)
    graphs_ptg = list()
    for data in dataset:
        new_data = Data()
        # Copy every attribute, then truncate the identifier columns to
        # the orbits of the kept substructures.
        for attr_name, attr_value in iter(data):
            setattr(new_data, attr_name, attr_value)
        new_data.identifiers = data.identifiers[:, 0:feature_vector_size]
        graphs_ptg.append(new_data)
    return graphs_ptg, kept_sizes