# datasets/gsn/utils_encoding.py

import sys

import numpy as np
import torch


def encode(graphs, id_encoding, degree_encoding=None, **kwargs):
    '''
    Encodes categorical variables such as structural identifiers
    and degree features.
    '''
    # default: keep identifiers as-is, one scalar dimension per column
    encoder_ids, d_id = None, [1] * graphs[0].identifiers.shape[1]
    if id_encoding is not None:
        # look up the encoder class defined in this module by name
        id_encoding_fn = getattr(sys.modules[__name__], id_encoding)
        ids = [graph.identifiers for graph in graphs]
        encoder_ids = id_encoding_fn(ids, **(kwargs['ids']))
        encoded_ids = encoder_ids.fit(ids)
        d_id = encoder_ids.d

    encoder_degrees, d_degree = None, []
    if degree_encoding is not None:
        degree_encoding_fn = getattr(sys.modules[__name__], degree_encoding)
        degrees = [graph.degrees.unsqueeze(1) for graph in graphs]
        encoder_degrees = degree_encoding_fn(degrees, **(kwargs['degree']))
        encoded_degrees = encoder_degrees.fit(degrees)
        d_degree = encoder_degrees.d

    # overwrite each graph's raw features with their encoded versions
    for g, graph in enumerate(graphs):
        if id_encoding is not None:
            graph.identifiers = encoded_ids[g]
        if degree_encoding is not None:
            graph.degrees = encoded_degrees[g]

    return graphs, encoder_ids, d_id, encoder_degrees, d_degree
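
# A minimal usage sketch, assuming any object exposing an `identifiers`
# attribute (2D LongTensor) and a `degrees` attribute (1D LongTensor) can
# stand in for a graph; the toy graphs below are hypothetical.
#
#   from types import SimpleNamespace
#   g1 = SimpleNamespace(identifiers=torch.LongTensor([[0, 2], [1, 2]]),
#                        degrees=torch.LongTensor([1, 1]))
#   g2 = SimpleNamespace(identifiers=torch.LongTensor([[1, 3], [0, 3]]),
#                        degrees=torch.LongTensor([2, 2]))
#   graphs, enc_ids, d_id, enc_deg, d_deg = encode(
#       [g1, g2], 'one_hot_unique', 'one_hot_max', ids={}, degree={})
#   # d_id == [2, 2] (distinct values per identifier column)
#   # d_deg == [3]   (max degree + 1)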


class one_hot_unique:

    def __init__(self, tensor_list, **kwargs):
        tensor_list = torch.cat(tensor_list, 0)
        self.d = list()
        self.corrs = dict()
        for col in range(tensor_list.shape[1]):
            # map each distinct value in the column to a contiguous index
            uniques, corrs = np.unique(
                tensor_list[:, col], return_inverse=True, axis=0)
            self.d.append(len(uniques))
            self.corrs[col] = corrs

    def fit(self, tensor_list):
        # slice the per-column index arrays back into per-graph tensors
        pointer = 0
        encoded_tensors = list()
        for tensor in tensor_list:
            n = tensor.shape[0]
            encoded = None
            for col in range(tensor.shape[1]):
                translated = torch.LongTensor(
                    self.corrs[col][pointer:pointer + n]).unsqueeze(1)
                # concatenate the column encodings left to right
                encoded = (translated if encoded is None
                           else torch.cat((encoded, translated), 1))
            encoded_tensors.append(encoded)
            pointer += n
        return encoded_tensors
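
    # Worked illustration of the np.unique(return_inverse=True) mapping the
    # class relies on (toy values):
    #   np.unique([4, 7, 4, 9], return_inverse=True)
    #   -> (array([4, 7, 9]), array([0, 1, 0, 2]))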


class one_hot_max:

    def __init__(self, tensor_list, **kwargs):
        tensor_list = torch.cat(tensor_list, 0)
        # vocabulary size per column: the maximum value + 1
        self.d = [int(tensor_list[:, i].max() + 1)
                  for i in range(tensor_list.shape[1])]

    def fit(self, tensor_list):
        # values are already integer indices, so no re-encoding is needed
        return tensor_list
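
    # Worked illustration (toy values): a column whose maximum entry is 4
    # yields d = 5, i.e. values are assumed to already be indices in
    # {0, ..., max}.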


# NB: this encoding scheme has been implemented, but never tested in
# experiments: use at your own risk.
'''
class minmax:

    # requires: from sklearn.preprocessing import MinMaxScaler

    def __init__(self, tensor_list, **kwargs):
        range_scaler = ([0.0, 1.0] if kwargs['range'] is None
                        else kwargs['range'])
        self.encoder = MinMaxScaler(feature_range=range_scaler)
        self.d = [1 for i in range(tensor_list[0].shape[1])]

    def fit(self, tensor_list):
        catted = torch.cat(tensor_list, 0).cpu().float().numpy()
        self.encoder.fit(catted)
        translated = self.encoder.transform(catted)

        pointer = 0
        encoded_tensors = list()
        for tensor in tensor_list:
            n = tensor.shape[0]
            encoded = torch.FloatTensor(translated[pointer:pointer + n, :])
            encoded_tensors.append(encoded)
            pointer += n

        return encoded_tensors
'''
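
# Illustration of the MinMaxScaler behaviour the commented-out class relies
# on (toy values):
#   MinMaxScaler(feature_range=(0.0, 1.0)).fit_transform([[1.], [3.], [5.]])
#   -> array([[0. ], [0.5], [1. ]])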


# NB: this encoding scheme has been implemented, but never tested in
# experiments: use at your own risk.
'''
class binning:

    # requires: from sklearn.preprocessing import KBinsDiscretizer

    def __init__(self, tensor_list, **kwargs):
        self.n_bins = kwargs['bins'][0]
        self.strategy = kwargs['strategy']

    def fit(self, tensor_list):
        catted = torch.cat(tensor_list, 0)
        translated = None
        d = []
        for col in range(catted.shape[1]):
            tensor_column = catted[:, col].unsqueeze(1).cpu().numpy()
            # never request more bins than there are distinct values
            # B = min([self.n_bins[col], len(np.unique(tensor_column))])
            B = min([self.n_bins, len(np.unique(tensor_column))])
            if B == 1:
                result = torch.ones(tensor_column.shape)
            else:
                encoder = KBinsDiscretizer(n_bins=B, encode='ordinal',
                                           strategy=self.strategy)
                d.append(encoder.n_bins)
                encoder.fit(tensor_column)
                result = encoder.transform(tensor_column)
                result = torch.LongTensor(result)
            translated = (result if col == 0
                          else torch.cat((translated, result), 1))

        pointer = 0
        encoded_tensors = list()
        for tensor in tensor_list:
            n = tensor.shape[0]
            encoded_tensors.append(translated[pointer:pointer + n])
            pointer += n

        self.d = d
        return encoded_tensors
'''
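
# Illustration of ordinal KBinsDiscretizer output, as used by the
# commented-out class (toy values):
#   KBinsDiscretizer(n_bins=2, encode='ordinal',
#                    strategy='uniform').fit_transform(
#       np.array([[0.0], [0.4], [0.6], [1.0]]))
#   -> array([[0.], [0.], [1.], [1.]])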