import numpy as np
import torch
from pandas.api.types import CategoricalDtype

def minimum(numbers, empty_val=0.):
    """Return the minimum of `numbers`, or `empty_val` if `numbers` is empty.
    NaNs are ignored for Tensor and ndarray inputs."""
    if isinstance(numbers, torch.Tensor):
        if numbers.numel() == 0:
            return torch.tensor(empty_val, device=numbers.device)
        non_nan = numbers[~torch.isnan(numbers)]
        if non_nan.numel() == 0:
            # All entries were NaN; .min() on an empty tensor would raise.
            return torch.tensor(empty_val, device=numbers.device)
        return non_nan.min()
    elif isinstance(numbers, np.ndarray):
        if numbers.size == 0:
            return np.array(empty_val)
        return np.nanmin(numbers)
    else:
        if len(numbers) == 0:
            return empty_val
        return min(numbers)

def maximum(numbers, empty_val=0.):
    """Return the maximum of `numbers`, or `empty_val` if `numbers` is empty.
    NaNs are ignored for Tensor and ndarray inputs."""
    if isinstance(numbers, torch.Tensor):
        if numbers.numel() == 0:
            return torch.tensor(empty_val, device=numbers.device)
        non_nan = numbers[~torch.isnan(numbers)]
        if non_nan.numel() == 0:
            # All entries were NaN; .max() on an empty tensor would raise.
            return torch.tensor(empty_val, device=numbers.device)
        return non_nan.max()
    elif isinstance(numbers, np.ndarray):
        if numbers.size == 0:
            return np.array(empty_val)
        return np.nanmax(numbers)
    else:
        if len(numbers) == 0:
            return empty_val
        return max(numbers)
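
# A quick usage sketch (hypothetical values, not part of the module's API):
#
#   minimum(torch.tensor([3., float('nan'), 1.]))  # -> tensor(1.)
#   minimum(np.array([]))                          # -> array(0.)
#   maximum([5, 2, 9])                             # -> 9
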
def split_into_groups(g):
    """
    Args:
        - g (Tensor): Vector of groups
    Returns:
        - groups (Tensor): Unique groups present in g
        - group_indices (list): List of Tensors, where the i-th tensor contains
            the indices of the elements of g that equal groups[i].
            Has the same length as groups.
        - unique_counts (Tensor): Counts of each element in groups.
            Has the same length as groups.
    """
    unique_groups, unique_counts = torch.unique(g, sorted=False, return_counts=True)
    group_indices = []
    for group in unique_groups:
        group_indices.append(
            torch.nonzero(g == group, as_tuple=True)[0])
    return unique_groups, group_indices, unique_counts
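
# Usage sketch (hypothetical input). Note that with sorted=False the group
# order is not guaranteed, so don't rely on it:
#
#   g = torch.tensor([0, 1, 0, 2])
#   groups, indices, counts = split_into_groups(g)
#   # e.g. groups = tensor([0, 1, 2]), counts = tensor([2, 1, 1]),
#   # and indices[0] = tensor([0, 2])
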
def get_counts(g, n_groups):
    """
    This differs from split_into_groups in how it handles missing groups.
    get_counts always returns a count Tensor of length n_groups,
    whereas split_into_groups returns a unique_counts Tensor
    whose length is the number of unique groups present in g.
    Args:
        - g (Tensor): Vector of groups
        - n_groups (int): Total number of groups, including any not present in g
    Returns:
        - counts (Tensor): A Tensor of length n_groups, denoting the count of each group.
    """
    unique_groups, unique_counts = torch.unique(g, sorted=False, return_counts=True)
    counts = torch.zeros(n_groups, device=g.device)
    counts[unique_groups] = unique_counts.float()
    return counts
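
# Usage sketch (hypothetical input): group 2 never occurs, so its count is 0.
#
#   get_counts(torch.tensor([0, 0, 1]), n_groups=3)  # -> tensor([2., 1., 0.])
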
def avg_over_groups(v, g, n_groups):
    """
    Args:
        v (Tensor): Vector containing the quantity to average over.
        g (Tensor): Vector of the same length as v, containing group information.
        n_groups (int): Total number of groups.
    Returns:
        group_avgs (Tensor): Vector of length n_groups
        group_counts (Tensor): Vector of length n_groups
    """
    # Imported here rather than at module load time, so the rest of this
    # module works even when torch_scatter is not installed.
    import torch_scatter
    assert v.device == g.device
    assert v.numel() == g.numel()
    group_count = get_counts(g, n_groups)
    group_avgs = torch_scatter.scatter(src=v, index=g, dim_size=n_groups, reduce='mean')
    return group_avgs, group_count
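
# Usage sketch (hypothetical input; requires the torch_scatter package).
# Groups with no members get an average of 0:
#
#   v = torch.tensor([1., 3., 5.])
#   g = torch.tensor([0, 0, 1])
#   avg_over_groups(v, g, n_groups=3)
#   # -> (tensor([2., 5., 0.]), tensor([2., 1., 0.]))
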
def map_to_id_array(df, ordered_map=None):
    """Convert each column of a DataFrame to integer category codes.
    ordered_map optionally maps a column name to an ordered list of its
    categories; all other columns use pandas' default (unordered) categories.
    Returns a dict of per-column category values and a float array of codes
    with the same shape as df."""
    ordered_map = ordered_map if ordered_map is not None else {}
    maps = {}
    array = np.zeros(df.shape)
    for i, c in enumerate(df.columns):
        if c in ordered_map:
            category_type = CategoricalDtype(categories=ordered_map[c], ordered=True)
        else:
            category_type = 'category'
        series = df[c].astype(category_type)
        maps[c] = series.cat.categories.values
        array[:, i] = series.cat.codes.values
    return maps, array
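
# Usage sketch (hypothetical DataFrame; pandas is already required by the
# CategoricalDtype import above):
#
#   import pandas as pd
#   df = pd.DataFrame({'size': ['small', 'large', 'small']})
#   maps, arr = map_to_id_array(df, {'size': ['small', 'large']})
#   # maps['size'] -> ['small', 'large'], arr[:, 0] -> [0., 1., 0.]
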
def subsample_idxs(idxs, num=5000, take_rest=False, seed=None):
    # Offset the seed by a fixed constant so this function's random stream
    # is decorrelated from other seeded functions given the same base seed.
    seed = (seed + 541433) if seed is not None else None
    rng = np.random.default_rng(seed)
    idxs = idxs.copy()
    rng.shuffle(idxs)
    if take_rest:
        idxs = idxs[num:]  # drop the first num shuffled indices, keep the rest
    else:
        idxs = idxs[:num]  # keep the first num shuffled indices
    return idxs

def shuffle_arr(arr, seed=None):
    # Same seed-offsetting trick as subsample_idxs, with a different constant.
    seed = (seed + 548207) if seed is not None else None
    rng = np.random.default_rng(seed)
    arr = arr.copy()  # shuffle a copy so the caller's array is untouched
    rng.shuffle(arr)
    return arr
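
# Usage sketch (hypothetical input): with the same seed, the two calls shuffle
# identically, so head and rest partition idxs into disjoint sets.
#
#   idxs = np.arange(10)
#   head = subsample_idxs(idxs, num=3, seed=0)
#   rest = subsample_idxs(idxs, num=3, take_rest=True, seed=0)
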
def threshold_at_recall(y_pred, y_true, global_recall=60):
    """Calculate the model threshold to use to achieve a desired global_recall
    level (in percent). Assumes that y_true is a vector of the true binary labels."""
    # Scores at or above the (100 - global_recall)th percentile of the
    # positive-class scores capture roughly global_recall% of the positives.
    return np.percentile(y_pred[y_true == 1], 100 - global_recall)
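
# Usage sketch (hypothetical scores): with global_recall=50, the threshold is
# the median score among the true positives.
#
#   y_pred = np.array([0.1, 0.9, 0.4, 0.8])
#   y_true = np.array([0, 1, 1, 1])
#   threshold_at_recall(y_pred, y_true, global_recall=50)  # -> 0.8
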
def numel(obj):
    """Return the number of elements in a Tensor or the length of a list."""
    if torch.is_tensor(obj):
        return obj.numel()
    elif isinstance(obj, list):
        return len(obj)
    else:
        raise TypeError(f"Invalid type for numel: {type(obj)}")