utils_graph_learning

gds/datasets/gsn/utils_graph_learning.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262	import torch import torch.nn.functional as F import torch.nn as nn from torch_geometric.utils import degree from .models_misc import mlp from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder from ogb.utils.features import get_atom_feature_dims, get_bond_feature_dims def multi_class_accuracy(y_hat, y, reduction='sum'): pred = y_hat.max(1)[1] if reduction == 'sum': acc = pred.eq(y).sum().float() elif reduction == 'mean': acc = pred.eq(y).mean().float() else: raise NotImplementedError('Reduction {} not currently implemented.'.format(reduction)) return acc def global_add_pool_sparse(x, batch): #-------------- global sum pooling index = torch.stack([batch, torch.tensor(list(range(batch.shape[0])), device=x.device)], 0) x_sparse = torch.sparse.FloatTensor(index, x, torch.Size([torch.max(batch)+1, x.shape[0], x.shape[1]])) return torch.sparse.sum(x_sparse, 1).to_dense() def global_mean_pool_sparse(x, batch): #-------------- global average pooling index = torch.stack([batch, torch.tensor(list(range(batch.shape[0])), device=x.device)], 0) 
x_sparse = torch.sparse.FloatTensor(index, x, torch.Size([torch.max(batch)+1, x.shape[0], x.shape[1]])) graph_sizes = degree(batch).float() graph_sizes[graph_sizes==0.0] = 1.0 return torch.sparse.sum(x_sparse, 1).to_dense() / graph_sizes.unsqueeze(1) class DiscreteEmbedding(torch.nn.Module): def __init__(self, encoder_name, d_in_features, d_in_encoder, d_out_encoder, *kwargs): super(DiscreteEmbedding, self).__init__() #-------------- various different embedding layers kwargs['init'] = None if 'init' not in kwargs else kwargs['init'] self.encoder_name = encoder_name # d_in_features: input feature size (e.g. if already one hot encoded), # d_in_encoder: number of unique values that will be encoded (size of embedding vocabulary) #-------------- fill embedding with zeros if encoder_name == 'zero_encoder': self.encoder = zero_encoder(d_out_encoder) d_out = d_out_encoder #-------------- linear pojection elif encoder_name == 'linear': self.encoder = nn.Linear(d_in_features, d_out_encoder, bias=True) d_out = d_out_encoder #-------------- mlp elif encoder_name == 'mlp': self.encoder = mlp(d_in_features, d_out_encoder, d_out_encoder, kwargs['seed'], kwargs['activation_mlp'], kwargs['bn_mlp']) d_out = d_out_encoder #-------------- multi hot encoding of categorical data elif encoder_name == 'one_hot_encoder': self.encoder = one_hot_encoder(d_in_encoder) d_out = sum(d_in_encoder) #-------------- embedding of categorical data (linear projection without bias of one hot encodings) elif encoder_name == 'embedding': self.encoder = multi_embedding(d_in_encoder, d_out_encoder, kwargs['aggr'], kwargs['init']) if kwargs['aggr'] == 'concat': d_out = len(d_in_encoder) d_out_encoder else: d_out = d_out_encoder #-------------- for ogb: multi hot encoding of node features elif encoder_name == 'atom_one_hot_encoder': full_atom_feature_dims = get_atom_feature_dims() if kwargs['features_scope'] == 'full' else get_atom_feature_dims()[:2] self.encoder = one_hot_encoder(full_atom_feature_dims) 
d_out = sum(full_atom_feature_dims) #-------------- for ogb: multi hot encoding of edge features elif encoder_name == 'bond_one_hot_encoder': full_bond_feature_dims = get_bond_feature_dims() if kwargs['features_scope'] == 'full' else get_bond_feature_dims()[:2] self.encoder = one_hot_encoder(full_bond_feature_dims) d_out = sum(full_bond_feature_dims) #-------------- for ogb: embedding of node features elif encoder_name == 'atom_encoder': self.encoder = AtomEncoder(d_out_encoder) d_out = d_out_encoder #-------------- for ogb: embedding of edge features elif encoder_name == 'bond_encoder': self.encoder = BondEncoder(emb_dim = d_out_encoder) d_out = d_out_encoder #-------------- no embedding, use as is elif encoder_name == 'None': self.encoder = None d_out = d_in_features else: raise NotImplementedError('Encoder {} is not currently supported.'.format(encoder_name)) self.d_out = d_out return def forward(self, x): x = x.unsqueeze(-1) if x.dim() == 1 else x if self.encoder is not None: x = x.float() if self.encoder_name == 'linear' or self.encoder_name == 'mlp' else x.long() return self.encoder(x) else: return x.float() class multi_embedding(torch.nn.Module): def __init__(self, d_in, d_out, aggr = 'concat', init=None): super(multi_embedding, self).__init__() #-------------- embedding of multiple categorical features. 
Summation or concatenation of the embeddings is allowed self.d_in = d_in self.aggr = aggr self.encoder = [] for i in range(len(d_in)): self.encoder.append(nn.Embedding(d_in[i], d_out)) if init == 'zeros': print('### INITIALIZING EMBEDDING TO ZERO ###') torch.nn.init.constant_(self.encoder[i].weight.data, 0) else: torch.nn.init.xavier_uniform_(self.encoder[-1].weight.data) self.encoder = nn.ModuleList(self.encoder) return def forward(self, tensor): for i in range(tensor.shape[1]): embedding_i = self.encoder[i](tensor[:,i]) if self.aggr == 'concat': embedding = torch.cat((embedding, embedding_i),1) if i>0 else embedding_i elif self.aggr == 'sum': embedding = embedding + embedding_i if i>0 else embedding_i else: raise NotImplementedError('multi embedding aggregation {} is not currently supported.'.format(self.aggr)) return embedding class one_hot_encoder(torch.nn.Module): def __init__(self, d_in): super(one_hot_encoder, self).__init__() self.d_in = d_in return def forward(self, tensor): for i in range(tensor.shape[1]): onehot_i = torch.zeros((tensor.shape[0], self.d_in[i]), device=tensor.device) onehot_i.scatter_(1, tensor[:,i:i+1], 1) onehot = torch.cat((onehot, onehot_i), 1) if i>0 else onehot_i return onehot def __repr__(self): return '{}({})'.format(self.__class__.__name__, self.d_in) class zero_encoder(torch.nn.Module): def __init__(self, d_out): super(zero_encoder, self).__init__() self.d_out = d_out return def forward(self, tensor): return torch.zeros((tensor.shape[0], self.d_out), device=tensor.device) def __repr__(self): return '{}({})'.format(self.__class__.__name__, self.d_out) class central_encoder(nn.Module): def __init__(self, nb_encoder, d_ef, extend=True): super(central_encoder, self).__init__() #-------------- For the neighbor aggregation: central node embedding #-------------- This is a way to create a dummy variable that represents self loops. 
#-------------- Useful when working with edge features or GSN-e #-------------- Two ways are allowed: extra dummy variable (one hot or embedding) or a vector filled with zeros self.extend = extend self.nb_encoder = nb_encoder if self.extend: print('##### EXTENDING EDGE FEATURE DIMENSIONS #####') if 'one_hot_encoder' in nb_encoder: if self.extend: self.encoder = DiscreteEmbedding('one_hot_encoder', 1, [d_ef+1], None) self.d_out = d_ef+1 else: self.d_out = d_ef else: self.d_out = d_ef if self.extend: self.encoder = DiscreteEmbedding('embedding', None, [1], d_ef, aggr='sum') else: pass return def forward(self, x_nb, num_nodes): if 'one_hot_encoder' in self.nb_encoder: if self.extend: zero_extension = torch.zeros((x_nb.shape[0], 1), device=x_nb.device) x_nb = torch.cat((zero_extension, x_nb), -1) x_central = torch.zeros((num_nodes,1), device=x_nb.device).long() x_central = self.encoder(x_central) else: x_central = torch.zeros((num_nodes, self.d_out), device=x_nb.device) else: if self.extend: x_central = torch.zeros((num_nodes,1), device=x_nb.device).long() x_central = self.encoder(x_central) else: x_central = torch.zeros((num_nodes, self.d_out), device=x_nb.device) return x_central, x_nb

gds/datasets/gsn/utils_graph_learning.py

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch_geometric.utils import degree

from .models_misc import mlp
from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
from ogb.utils.features import get_atom_feature_dims, get_bond_feature_dims 


def multi_class_accuracy(y_hat, y, reduction='sum'):
    """Compare arg-max predictions with the targets and reduce the matches.

    Args:
        y_hat: tensor of per-class scores; the predicted class of each row is
            the index of its maximum along dimension 1.
        y: tensor of target class indices, compared element-wise with the
            predictions (presumably one label per row of ``y_hat`` -- confirm
            against callers).
        reduction: ``'sum'`` returns the number of correct predictions,
            ``'mean'`` returns the fraction of correct predictions.

    Returns:
        A 0-dim float tensor holding the reduced accuracy.

    Raises:
        NotImplementedError: if ``reduction`` is neither ``'sum'`` nor
            ``'mean'``.
    """
    pred = y_hat.max(1)[1]
    correct = pred.eq(y)
    if reduction == 'sum':
        return correct.sum().float()
    if reduction == 'mean':
        # ``mean`` is not defined for boolean tensors, so the cast to float
        # has to happen before averaging, not after.
        return correct.float().mean()
    raise NotImplementedError('Reduction {} not currently implemented.'.format(reduction))


def global_add_pool_sparse(x, batch):
    """Sum the rows of ``x`` that share the same graph id.

    Args:
        x: 2-D tensor of node features, one row per node.
        batch: 1-D integer tensor; ``batch[i]`` is the graph id of row ``i``
            of ``x``.

    Returns:
        Dense tensor of shape ``(batch.max() + 1, x.shape[1])`` whose row ``g``
        is the sum of the rows of ``x`` with ``batch == g``. Graph ids that do
        not occur in ``batch`` yield all-zero rows. An empty ``batch`` yields
        a ``(0, x.shape[1])`` tensor.
    """
    #-------------- global sum pooling
    if batch.numel() == 0:
        # torch.max is undefined on an empty tensor; there is nothing to pool.
        return x.new_zeros((0, x.shape[1]))

    num_graphs = int(batch.max()) + 1
    node_ids = torch.arange(batch.shape[0], dtype=batch.dtype, device=batch.device)
    # Entry (graph id, node id) holds that node's feature vector, so summing
    # over the node dimension leaves one pooled vector per graph.
    index = torch.stack([batch, node_ids], 0)
    x_sparse = torch.sparse_coo_tensor(
        index, x, (num_graphs, x.shape[0], x.shape[1]), device=x.device
    )

    return torch.sparse.sum(x_sparse, 1).to_dense()


def global_mean_pool_sparse(x, batch):
    """Average the rows of ``x`` that share the same graph id.

    Args:
        x: 2-D tensor of node features, one row per node.
        batch: 1-D integer tensor; ``batch[i]`` is the graph id of row ``i``
            of ``x``.

    Returns:
        Dense tensor of shape ``(batch.max() + 1, x.shape[1])`` whose row ``g``
        is the mean of the rows of ``x`` with ``batch == g``. Graph ids that do
        not occur in ``batch`` yield all-zero rows. An empty ``batch`` yields
        a ``(0, x.shape[1])`` tensor.
    """
    #-------------- global average pooling
    if batch.numel() == 0:
        # torch.max is undefined on an empty tensor; there is nothing to pool.
        return x.new_zeros((0, x.shape[1]))

    num_graphs = int(batch.max()) + 1
    node_ids = torch.arange(batch.shape[0], dtype=batch.dtype, device=batch.device)
    # Entry (graph id, node id) holds that node's feature vector, so summing
    # over the node dimension leaves one pooled vector per graph.
    index = torch.stack([batch, node_ids], 0)
    x_sparse = torch.sparse_coo_tensor(
        index, x, (num_graphs, x.shape[0], x.shape[1]), device=x.device
    )
    pooled = torch.sparse.sum(x_sparse, 1).to_dense()

    # Number of nodes per graph; empty graphs are clamped to 1 so their
    # all-zero sum is divided by one instead of zero.
    graph_sizes = torch.bincount(batch, minlength=num_graphs).clamp(min=1).float()

    return pooled / graph_sizes.unsqueeze(1)


class DiscreteEmbedding(torch.nn.Module):
    """Build one of several feature encoders, selected by name.

    The chosen encoder is stored in ``self.encoder`` and the width of the
    features it produces in ``self.d_out``.

    Args:
        encoder_name: one of ``'zero_encoder'``, ``'linear'``, ``'mlp'``,
            ``'one_hot_encoder'``, ``'embedding'``, ``'atom_one_hot_encoder'``,
            ``'bond_one_hot_encoder'``, ``'atom_encoder'``, ``'bond_encoder'``
            or the string ``'None'`` (features are used as they are).
        d_in_features: input feature size (e.g. if already one hot encoded).
        d_in_encoder: number of unique values that will be encoded (size of
            the embedding vocabulary), one entry per categorical column.
        d_out_encoder: output size requested from the encoder.
        **kwargs: ``init`` is optional (defaults to ``None``). ``aggr`` is
            required for ``'embedding'``; ``seed``, ``activation_mlp`` and
            ``bn_mlp`` for ``'mlp'``; ``features_scope`` for the two ogb
            one-hot encoders. A missing required key raises ``KeyError``.

    Raises:
        NotImplementedError: if ``encoder_name`` is not one of the names above.
    """

    def __init__(self, encoder_name, d_in_features, d_in_encoder, d_out_encoder, **kwargs):
        super().__init__()

        #-------------- various different embedding layers
        # 'init' is the only optional keyword; every other keyword is looked
        # up inside the branch that needs it.
        init = kwargs.get('init')
        self.encoder_name = encoder_name

        if encoder_name == 'zero_encoder':
            #-------------- fill embedding with zeros
            self.encoder = zero_encoder(d_out_encoder)
            width = d_out_encoder

        elif encoder_name == 'linear':
            #-------------- linear projection
            self.encoder = nn.Linear(d_in_features, d_out_encoder, bias=True)
            width = d_out_encoder

        elif encoder_name == 'mlp':
            #-------------- mlp
            seed = kwargs['seed']
            activation = kwargs['activation_mlp']
            batch_norm = kwargs['bn_mlp']
            self.encoder = mlp(d_in_features, d_out_encoder, d_out_encoder,
                               seed, activation, batch_norm)
            width = d_out_encoder

        elif encoder_name == 'one_hot_encoder':
            #-------------- multi hot encoding of categorical data
            self.encoder = one_hot_encoder(d_in_encoder)
            width = sum(d_in_encoder)

        elif encoder_name == 'embedding':
            #-------------- embedding of categorical data
            #-------------- (linear projection without bias of one hot encodings)
            aggr = kwargs['aggr']
            self.encoder = multi_embedding(d_in_encoder, d_out_encoder, aggr, init)
            # Concatenation stacks one embedding per column side by side.
            width = len(d_in_encoder) * d_out_encoder if aggr == 'concat' else d_out_encoder

        elif encoder_name == 'atom_one_hot_encoder':
            #-------------- for ogb: multi hot encoding of node features
            scope = kwargs['features_scope']
            atom_dims = get_atom_feature_dims()
            if scope != 'full':
                # Any scope other than 'full' keeps only the first two features.
                atom_dims = atom_dims[:2]
            self.encoder = one_hot_encoder(atom_dims)
            width = sum(atom_dims)

        elif encoder_name == 'bond_one_hot_encoder':
            #-------------- for ogb: multi hot encoding of edge features
            scope = kwargs['features_scope']
            bond_dims = get_bond_feature_dims()
            if scope != 'full':
                # Any scope other than 'full' keeps only the first two features.
                bond_dims = bond_dims[:2]
            self.encoder = one_hot_encoder(bond_dims)
            width = sum(bond_dims)

        elif encoder_name == 'atom_encoder':
            #-------------- for ogb: embedding of node features
            self.encoder = AtomEncoder(d_out_encoder)
            width = d_out_encoder

        elif encoder_name == 'bond_encoder':
            #-------------- for ogb: embedding of edge features
            self.encoder = BondEncoder(emb_dim=d_out_encoder)
            width = d_out_encoder

        elif encoder_name == 'None':
            #-------------- no embedding, use as is
            self.encoder = None
            width = d_in_features

        else:
            raise NotImplementedError('Encoder {} is not currently supported.'.format(encoder_name))

        self.d_out = width

    def forward(self, x):
        """Encode ``x``; 1-D inputs are first given a trailing unit dimension.

        Inputs are cast to float for the 'linear' and 'mlp' encoders and when
        no encoder is set, and to long for every other encoder.
        """
        if x.dim() == 1:
            x = x.unsqueeze(-1)

        if self.encoder is None:
            return x.float()

        if self.encoder_name in ('linear', 'mlp'):
            x = x.float()
        else:
            x = x.long()
        return self.encoder(x)


class multi_embedding(torch.nn.Module):
    """Embed several categorical columns and combine the embeddings.

    One ``nn.Embedding`` is created per entry of ``d_in``; column ``i`` of the
    input is looked up in embedding ``i``. The per-column embeddings are
    either concatenated along dimension 1 (``aggr='concat'``) or summed
    (``aggr='sum'``).

    Args:
        d_in: vocabulary size of each categorical column.
        d_out: embedding size of every column.
        aggr: ``'concat'`` or ``'sum'``; any other value raises
            ``NotImplementedError`` when ``forward`` is called.
        init: ``'zeros'`` fills the embedding weights with zeros; anything
            else applies Xavier-uniform initialisation.
    """

    def __init__(self, d_in, d_out, aggr = 'concat', init=None):
        super().__init__()

        self.d_in = d_in
        self.aggr = aggr

        layers = []
        for vocab_size in d_in:
            # Each layer is initialised right after it is created, before the
            # next one is built.
            layer = nn.Embedding(vocab_size, d_out)
            if init == 'zeros':
                print('### INITIALIZING EMBEDDING TO ZERO ###')
                torch.nn.init.constant_(layer.weight.data, 0)
            else:
                torch.nn.init.xavier_uniform_(layer.weight.data)
            layers.append(layer)
        self.encoder = nn.ModuleList(layers)

    def forward(self, tensor):
        """Return the combined embedding of the columns of ``tensor``."""
        for col in range(tensor.shape[1]):
            current = self.encoder[col](tensor[:, col])
            if self.aggr == 'concat':
                if col == 0:
                    combined = current
                else:
                    combined = torch.cat((combined, current), 1)
            elif self.aggr == 'sum':
                if col == 0:
                    combined = current
                else:
                    combined = combined + current
            else:
                raise NotImplementedError('multi embedding aggregation {} is not currently supported.'.format(self.aggr))

        return combined


class one_hot_encoder(torch.nn.Module):
    """One-hot encode each categorical column and concatenate the results.

    Args:
        d_in: number of categories of each column; column ``i`` of the input
            is expanded into ``d_in[i]`` indicator features.
    """

    def __init__(self, d_in):
        super().__init__()
        self.d_in = d_in

    def forward(self, tensor):
        """Return the concatenated one-hot encodings of the columns of ``tensor``.

        ``tensor`` is used directly as the scatter index, so it has to hold
        integer category ids.
        """
        num_rows = tensor.shape[0]
        for col in range(tensor.shape[1]):
            # Start from zeros and write a 1 at the position named by the id.
            block = torch.zeros((num_rows, self.d_in[col]), device=tensor.device)
            block.scatter_(1, tensor[:, col:col + 1], 1)
            if col == 0:
                encoded = block
            else:
                encoded = torch.cat((encoded, block), 1)

        return encoded

    def __repr__(self):
        return '{}({})'.format(type(self).__name__, self.d_in)
    

class zero_encoder(torch.nn.Module):
    """Encoder that ignores the input values and outputs zeros.

    Args:
        d_out: number of output features per input row.
    """

    def __init__(self, d_out):
        super().__init__()
        self.d_out = d_out

    def forward(self, tensor):
        """Return a ``(tensor.shape[0], d_out)`` zero tensor on the input's device."""
        num_rows = tensor.shape[0]
        output_shape = (num_rows, self.d_out)
        return torch.zeros(output_shape, device=tensor.device)

    def __repr__(self):
        return '{}({})'.format(type(self).__name__, self.d_out)


class central_encoder(nn.Module):
    """Produce the central-node term used in the neighbor aggregation.

    This is a way to create a dummy variable that represents self loops,
    useful when working with edge features or GSN-e. Two ways are allowed: an
    extra dummy variable (one hot or embedding, ``extend=True``) or a vector
    filled with zeros (``extend=False``).

    Args:
        nb_encoder: name of the neighbor encoder; only whether it contains
            ``'one_hot_encoder'`` is inspected.
        d_ef: width of the neighbor/edge features.
        extend: whether to encode the self loop with a dedicated dummy
            category instead of a zero vector.
    """

    def __init__(self, nb_encoder, d_ef, extend=True):
        super().__init__()

        self.extend = extend
        self.nb_encoder = nb_encoder

        if extend:
            print('##### EXTENDING EDGE FEATURE DIMENSIONS #####')

        one_hot = 'one_hot_encoder' in nb_encoder
        # Only the extended one-hot variant widens the features by one slot.
        self.d_out = d_ef + 1 if (one_hot and extend) else d_ef

        # Without extension no encoder is created; forward then emits zeros.
        if extend:
            if one_hot:
                self.encoder = DiscreteEmbedding('one_hot_encoder', 1, [d_ef+1], None)
            else:
                self.encoder = DiscreteEmbedding('embedding',  None, [1], d_ef, aggr='sum')

    def forward(self, x_nb, num_nodes):
        """Return ``(x_central, x_nb)`` for ``num_nodes`` central nodes.

        With ``extend=False`` the central features are zeros of width
        ``d_out`` and ``x_nb`` is returned untouched. With ``extend=True`` the
        central features are the encoding of category 0; in the one-hot case
        ``x_nb`` is additionally prefixed with a zero column.
        """
        if not self.extend:
            return torch.zeros((num_nodes, self.d_out), device=x_nb.device), x_nb

        if 'one_hot_encoder' in self.nb_encoder:
            # Reserve slot 0 for the self-loop category on the neighbor side.
            padding = torch.zeros((x_nb.shape[0], 1), device=x_nb.device)
            x_nb = torch.cat((padding, x_nb), -1)

        self_loop_ids = torch.zeros((num_nodes, 1), device=x_nb.device).long()
        return self.encoder(self_loop_ids), x_nb