Source code for graphslim.condensation.utils

from graphslim.utils import *
import torch_geometric
import math
from numpy.linalg import eigh
from scipy.sparse.linalg import eigsh
from collections import Counter
from torch_geometric.data import Data
from torch_geometric.utils import convert
import networkx as nx



[docs]
def match_loss(gw_syn, gw_real, args, device):
    """
    Computes the loss between synthetic and real gradients based on the specified distance metric.

    Parameters
    ----------
    gw_syn : list of torch.Tensor
        List of synthetic gradients for different model parameters.
    gw_real : list of torch.Tensor
        List of real gradients for different model parameters.
    args : Namespace
        Arguments object containing hyperparameters for training and model.
    device : torch.device
        Device (CPU or GPU) on which computations are performed.

    Returns
    -------
    torch.Tensor
        The computed distance (loss) between synthetic and real gradients.
    """
    dis = torch.tensor(0.0).to(device)

    if args.dis_metric == 'ours':

        for ig in range(len(gw_real)):
            gwr = gw_real[ig]
            gws = gw_syn[ig]
            dis += distance_wb(gwr, gws)

    elif args.dis_metric == 'mse':
        gw_real_vec = []
        gw_syn_vec = []
        for ig in range(len(gw_real)):
            gw_real_vec.append(gw_real[ig].reshape((-1)))
            gw_syn_vec.append(gw_syn[ig].reshape((-1)))
        gw_real_vec = torch.cat(gw_real_vec, dim=0)
        gw_syn_vec = torch.cat(gw_syn_vec, dim=0)
        dis = torch.sum((gw_syn_vec - gw_real_vec) ** 2)

    elif args.dis_metric == 'cos':
        gw_real_vec = []
        gw_syn_vec = []
        for ig in range(len(gw_real)):
            gw_real_vec.append(gw_real[ig].reshape((-1)))
            gw_syn_vec.append(gw_syn[ig].reshape((-1)))
        gw_real_vec = torch.cat(gw_real_vec, dim=0)
        gw_syn_vec = torch.cat(gw_syn_vec, dim=0)
        dis = 1 - torch.sum(gw_real_vec * gw_syn_vec, dim=-1) / (
                torch.norm(gw_real_vec, dim=-1) * torch.norm(gw_syn_vec, dim=-1) + 0.000001)

    else:
        exit('DC error: unknown distance function')

    return dis




[docs]
def distance_wb(gwr, gws):
    """
    Computes the distance between two tensors representing gradients using cosine similarity.

    Parameters
    ----------
    gwr : torch.Tensor
        The real gradient tensor.
    gws : torch.Tensor
        The synthetic gradient tensor.

    Returns
    -------
    torch.Tensor
        The computed distance between the real and synthetic gradients.
    """
    shape = gwr.shape

    if len(gwr.shape) == 2:
        gwr = gwr.T
        gws = gws.T

    if len(shape) == 4:  # conv, out*in*h*w
        gwr = gwr.reshape(shape[0], shape[1] * shape[2] * shape[3])
        gws = gws.reshape(shape[0], shape[1] * shape[2] * shape[3])
    elif len(shape) == 3:  # layernorm, C*h*w
        gwr = gwr.reshape(shape[0], shape[1] * shape[2])
        gws = gws.reshape(shape[0], shape[1] * shape[2])
    elif len(shape) == 2:  # linear, out*in
        tmp = 'do nothing'
    elif len(shape) == 1:  # batchnorm/instancenorm, C; groupnorm x, bias
        gwr = gwr.reshape(1, shape[0])
        gws = gws.reshape(1, shape[0])
        return 0

    dis_weight = torch.sum(
        1 - torch.sum(gwr * gws, dim=-1) / (torch.norm(gwr, dim=-1) * torch.norm(gws, dim=-1) + 0.000001))
    dis = dis_weight
    return dis



# GCSNTK utils

[docs]
def sub_E(idx, A):
    """
    Generates a sparse adjacency matrix of the subgraph defined by the given indices.

    Parameters
    ----------
    idx : torch.Tensor
        A tensor containing the indices of the nodes that define the subgraph.
    A : torch.Tensor
        The original adjacency matrix of the graph.

    Returns
    -------
    torch.sparse_coo_tensor
        The sparse adjacency matrix of the subgraph.
    """
    n = A.shape[0]
    n_neig = len(idx)
    operator = torch.zeros([n, n_neig])
    operator[idx, range(n_neig)] = 1
    sub_A = torch.matmul(torch.matmul(operator.t(), A), operator)

    ind = torch.where(sub_A != 0)
    inds = torch.cat([ind[0], ind[1]]).reshape(2, len(ind[0]))
    values = torch.ones(len(ind[0]))
    sub_E = torch.sparse_coo_tensor(inds, values, torch.Size([n_neig, n_neig])).to(A.device)

    return sub_E




[docs]
def update_E(x_s, neig):
    """
    Update the adjacency matrix based on the features of the nodes and the average number of neighbors.

    Parameters
    ----------
    x_s : torch.Tensor
        A tensor containing the feature vectors of the nodes.
    neig : float
        The average number of neighbors each node should have.

    Returns
    -------
    torch.sparse_coo_tensor
        The sparse adjacency matrix based on the updated similarities.
    """
    n = x_s.shape[0]
    K = torch.empty(n, n)
    A = torch.zeros(n * n)

    for i in range(n):
        for j in range(i, n):
            K[i, j] = torch.norm(x_s[i] - x_s[j])
            K[j, i] = K[i, j]

    edge = int(n + torch.round(torch.tensor(neig * n / 2)))
    if (edge % 2) != 0:
        edge += 1
    else:
        pass

    Simil = torch.flatten(K)
    _, indices = torch.sort(Simil)
    A[indices[0:edge]] = 1
    A = A.reshape(n, n)
    ind = torch.where(A == 1)

    ind = torch.cat([ind[0], ind[1]]).reshape(2, edge)
    values = torch.ones(edge)
    E = torch.sparse_coo_tensor(ind, values, torch.Size([n, n])).to(x_s.device)

    return E



# utils for GCSNTK

[docs]
def normalize_data(data):
    """
    Normalize the input data using mean and standard deviation.

    Parameters
    ----------
    data : torch.Tensor
        The data to be normalized. Each column represents a feature, and normalization is applied to each feature independently.

    Returns
    -------
    torch.Tensor
        The normalized data where each feature has zero mean and unit variance.
    """
    mean = data.mean(dim=0)
    std = data.std(dim=0)
    std[std == 0] = 1
    normalized_data = (data - mean) / std
    return normalized_data




[docs]
def GCF(adj, x, k=1):
    """
    Apply Graph Convolution Filter (GCF) to features using the adjacency matrix.

    Parameters
    ----------
    adj : torch.Tensor
        Adjacency matrix of the graph. It must include self-loops. Shape: (N, N), where N is the number of nodes.

    x : torch.Tensor
        Node features. Shape: (N, F), where F is the number of features for each node.

    k : int, optional
        Number of hops (or layers) to apply the filter. Default is 1.

    Returns
    -------
    torch.Tensor
        Filtered features after applying the graph convolution. Shape: (N, F).
    """
    D = torch.sum(adj, dim=1)
    D = torch.pow(D, -0.5)
    D = torch.diag(D)

    filter = torch.matmul(torch.matmul(D, adj), D)
    for i in range(k):
        x = torch.matmul(filter, x)
    return x



# geom

[docs]
def neighborhood_difficulty_measurer(data, adj, label):
    """
    Measure the difficulty of neighborhoods in the graph based on the label distribution.

    Parameters
    ----------
    data : Data
        PyG Data object containing node features and labels.

    adj : torch.Tensor
        Sparse adjacency matrix of the graph. The shape is (N, N) where N is the number of nodes.

    label : torch.Tensor
        Tensor containing the label of each node. Shape: (N,)

    Returns
    -------
    torch.Tensor
        Difficulty scores for each node. Higher scores indicate more difficult neighborhoods.
    """
    edge_index = adj.coalesce().indices()
    edge_value = adj.coalesce().values()

    neighbor_label, _ = torch_geometric.utils.add_self_loops(edge_index)  # [[1, 1, 1, 1],[2, 3, 4, 5]]

    neighbor_label[1] = label[neighbor_label[1]]  # [[1, 1, 1, 1],[40, 20, 19, 21]]

    neighbor_label = torch.transpose(neighbor_label, 0, 1)  # [[1, 40], [1, 20], [1, 19], [1, 21]]

    index, count = torch.unique(neighbor_label, sorted=True, return_counts=True, dim=0)

    neighbor_class = torch.sparse_coo_tensor(index.T, count)
    neighbor_class = neighbor_class.to_dense().float()

    neighbor_class = neighbor_class[data.idx_train]
    neighbor_class = F.normalize(neighbor_class, 1.0, 1)
    neighbor_entropy = -1 * neighbor_class * torch.log(neighbor_class + torch.exp(torch.tensor(-20)))  # 防止log里面是0出现异常
    local_difficulty = neighbor_entropy.sum(1)

    return local_difficulty




[docs]
def difficulty_measurer(data, adj, label):
    """
    Measure the difficulty of nodes in the graph based on their neighborhood label distribution.

    Parameters
    ----------
    data : Data
        PyG Data object containing node features and labels.

    adj : torch.Tensor
        Sparse adjacency matrix of the graph. The shape is (N, N) where N is the number of nodes.

    label : torch.Tensor
        Tensor containing the label of each node. Shape: (N,)

    Returns
    -------
    torch.Tensor
        Difficulty scores for each node. Higher scores indicate more difficult nodes.
    """
    local_difficulty = neighborhood_difficulty_measurer(data, adj, label)
    # global_difficulty = feature_difficulty_measurer(data, label, embedding)
    node_difficulty = local_difficulty
    return node_difficulty




[docs]
def sort_training_nodes(data, adj, label):
    """
    Sort training nodes based on their difficulty measured by neighborhood label distribution.

    Parameters
    ----------
    data : Data
        PyG Data object containing node features and labels.

    adj : torch.Tensor
        Sparse adjacency matrix of the graph (shape: N x N) with self-loops.

    label : torch.Tensor
        Tensor containing the label of each node (shape: N,).

    Returns
    -------
    numpy.ndarray
        Indices of the training nodes sorted by their difficulty, from easiest to hardest.
    """
    node_difficulty = difficulty_measurer(data, adj, label)
    _, indices = torch.sort(node_difficulty)
    indices = indices.cpu().numpy()
    sorted_trainset = data.idx_train[indices]
    return sorted_trainset




[docs]
def neighborhood_difficulty_measurer_in(data, adj, label):
    """
    Measure the difficulty of each node in a graph based on the entropy of neighbor labels.

    Parameters
    ----------
    data : Data
        PyG Data object containing node features and labels.

    adj : torch.Tensor
        Sparse adjacency matrix of the graph (shape: N x N) with self-loops.

    label : torch.Tensor
        Tensor containing the label of each node (shape: N,).

    Returns
    -------
    torch.Tensor
        Tensor of local difficulty scores for each node.
    """
    edge_index = adj.coalesce().indices()
    edge_value = adj.coalesce().values()

    neighbor_label, _ = torch_geometric.utils.add_self_loops(edge_index)  # [[1, 1, 1, 1],[2, 3, 4, 5]]

    neighbor_label[1] = label[neighbor_label[1]]  # [[1, 1, 1, 1],[40, 20, 19, 21]]

    neighbor_label = torch.transpose(neighbor_label, 0, 1)  # [[1, 40], [1, 20], [1, 19], [1, 21]]

    index, count = torch.unique(neighbor_label, sorted=True, return_counts=True, dim=0)

    neighbor_class = torch.sparse_coo_tensor(index.T, count)
    neighbor_class = neighbor_class.to_dense().float()

    neighbor_class = F.normalize(neighbor_class, 1.0, 1)
    neighbor_entropy = -1 * neighbor_class * torch.log(neighbor_class + torch.exp(torch.tensor(-20)))  # 防止log里面是0出现异常
    local_difficulty = neighbor_entropy.sum(1)

    return local_difficulty




[docs]
def difficulty_measurer_in(data, adj, label):
    """
    Measure the difficulty of each node in a graph based on local entropy of neighbor labels.

    Parameters
    ----------
    data : Data
        PyG Data object containing node features and labels.

    adj : torch.Tensor
        Sparse adjacency matrix of the graph (shape: N x N) with self-loops.

    label : torch.Tensor
        Tensor containing the label of each node (shape: N,).

    Returns
    -------
    torch.Tensor
        Tensor of local difficulty scores for each node.
    """
    local_difficulty = neighborhood_difficulty_measurer_in(data, adj, label)
    # global_difficulty = feature_difficulty_measurer(data, label, embedding)
    node_difficulty = local_difficulty
    return node_difficulty




[docs]
def sort_training_nodes_in(data, adj, label):
    """
    Sort training nodes based on their difficulty scores in ascending order.

    Parameters
    ----------
    data : Data
        PyG Data object containing node features and labels.

    adj : torch.Tensor
        Sparse adjacency matrix of the graph (shape: N x N) with self-loops.

    label : torch.Tensor
        Tensor containing the label of each node (shape: N,).

    Returns
    -------
    numpy.ndarray
        Indices of training nodes sorted by difficulty scores.
    """
    node_difficulty = difficulty_measurer_in(data, adj, label)
    _, indices = torch.sort(node_difficulty)
    indices = indices.cpu().numpy()
    return indices




[docs]
def training_scheduler(lam, t, T, scheduler='geom'):
    """
    Adjust the value of a parameter based on the chosen scheduling strategy.

    Parameters
    ----------
    lam : float
        The initial value or a baseline value for the parameter (0 <= lam <= 1).

    t : int
        The current training iteration or epoch.

    T : int
        The total number of training iterations or epochs.

    scheduler : str, optional
        The type of scheduling strategy to use. Options are 'linear', 'root', or 'geom'.
        Default is 'geom'.

    Returns
    -------
    float
        The adjusted value of the parameter at iteration `t` based on the scheduling strategy.
    """
    if scheduler == 'linear':
        return min(1, lam + (1 - lam) * t / T)
    elif scheduler == 'root':
        return min(1, math.sqrt(lam ** 2 + (1 - lam ** 2) * t / T))
    elif scheduler == 'geom':
        return min(1, 2 ** (math.log2(lam) - math.log2(lam) * t / T))




[docs]
def get_largest_cc(adj, num_nodes, data_name):
    mx = (adj).tocoo()
    edge_index = torch.LongTensor(np.vstack([mx.row, mx.col]))
    data = Data(edge_index=edge_index, num_nodes=num_nodes)
    nx_graph = convert.to_networkx(data, to_undirected=True)

    largest_cc = max(nx.connected_components(nx_graph), key=len)
    idx_lcc = list(largest_cc)
    largest_cc_graph = nx_graph.subgraph(largest_cc)

    if len(idx_lcc) == num_nodes:
        adj_lcc = adj
        adj_norm_lcc = gcn_normalize_adj(adj_lcc)
    else:
        adj_lcc = nx.to_scipy_sparse_array(largest_cc_graph)
        adj_norm_lcc = gcn_normalize_adj(adj_lcc)

    return idx_lcc, adj_norm_lcc, adj_lcc



[docs]
def get_syn_eigen(real_eigenvals, real_eigenvecs, eigen_k, ratio, step=1):
    k1 = math.ceil(eigen_k * ratio)
    k2 = eigen_k - k1
    print("k1:", k1, ",", "k2:", k2)
    k1_end = (k1 - 1) * step + 1
    eigen_sum = real_eigenvals.shape[0]
    k2_end = eigen_sum - (k2 - 1) * step - 1
    k1_list = range(0, k1_end, step)
    k2_list = range(k2_end, eigen_sum, step)
    eigenvals = torch.cat(
        [real_eigenvals[k1_list], real_eigenvals[k2_list]]
    )
    eigenvecs = torch.cat(
        [real_eigenvecs[:, k1_list], real_eigenvecs[:, k2_list]], dim=1,
    )

    return eigenvals, eigenvecs




[docs]
def get_subspace_embed(eigenvecs, x):
    x_trans = eigenvecs.T @ x  # kd
    u_unsqueeze = (eigenvecs.T).unsqueeze(2)  # kn1
    x_trans_unsqueeze = x_trans.unsqueeze(1)  # k1d
    sub_embed = torch.bmm(u_unsqueeze, x_trans_unsqueeze)  # kn1 @ k1d = knd
    return x_trans, sub_embed




[docs]
def get_subspace_covariance_matrix(eigenvecs, x):
    x_trans = eigenvecs.T @ x  # kd
    x_trans = F.normalize(input=x_trans, p=2, dim=1)
    x_trans_unsqueeze = x_trans.unsqueeze(1)  # k1d
    co_matrix = torch.bmm(x_trans_unsqueeze.permute(0, 2, 1), x_trans_unsqueeze)  # kd1 @ k1d = kdd
    return co_matrix




[docs]
def get_embed_sum(eigenvals, eigenvecs, x):
    x_trans = eigenvecs.T @ x  # kd
    x_trans = torch.diag(1 - eigenvals) @ x_trans  # kd
    embed_sum = eigenvecs @ x_trans  # nk @ kd = nd
    return embed_sum




[docs]
def get_embed_mean(embed_sum, label):
    class_matrix = F.one_hot(label).float()  # nc
    class_matrix = class_matrix.T  # cn
    embed_sum = class_matrix @ embed_sum  # cd
    mean_weight = (1 / class_matrix.sum(1)).unsqueeze(-1)  # c1
    embed_mean = mean_weight * embed_sum
    embed_mean = F.normalize(input=embed_mean, p=2, dim=1)
    return embed_mean




[docs]
def get_eigh(laplacian_matrix, data_name, save=True):
    dir = os.path.join("./data", data_name)
    if not os.path.isdir(dir):
        os.makedirs(dir)

    val_file_name = "eigenvalues.npy"
    vec_file_name = "eigenvectors.npy"

    eigvals_path = os.path.join(dir, val_file_name)
    eigvecs_path = os.path.join(dir, vec_file_name)

    if os.path.exists(eigvals_path) and os.path.exists(eigvecs_path):
        eigenvalues = np.load(eigvals_path)
        eigenvectors = np.load(eigvecs_path)
    else:
        if data_name in ['ogbn-arxiv', 'flickr', 'reddit', 'twitch-gamer']:
            eigenvalues, eigenvectors = eigsh(A=laplacian_matrix, k=1000, which="SA", tol=1e-5)
        else:
            if sp.issparse(laplacian_matrix):
                laplacian_matrix = laplacian_matrix.todense()
            eigenvalues, eigenvectors = eigh(laplacian_matrix)

        if save:
            np.save(eigvals_path, eigenvalues)
            np.save(eigvecs_path, eigenvectors)

    if data_name in ['ogbn-arxiv', 'flickr', 'reddit', 'twitch-gamer']:
        val_file_name = "eigenvalues_la.npy"
        vec_file_name = "eigenvectors_la.npy"
        eigvals_path = os.path.join(dir, val_file_name)
        eigvecs_path = os.path.join(dir, vec_file_name)

        if os.path.exists(eigvals_path) and os.path.exists(eigvecs_path):
            eigenvalues_la = np.load(eigvals_path)
            eigenvectors_la = np.load(eigvecs_path)
        else:
            eigenvalues_la, eigenvectors_la = eigsh(A=laplacian_matrix, k=1000, which="LA", tol=1e-5)
            if save:
                np.save(eigvals_path, eigenvalues_la)
                np.save(eigvecs_path, eigenvectors_la)

        eigenvalues = np.hstack([eigenvalues, eigenvalues_la])
        eigenvectors = np.hstack([eigenvectors, eigenvectors_la])

    idx = np.argsort(eigenvalues)
    eigenvalues = eigenvalues[idx[:]]
    eigenvectors = eigenvectors[:, idx[:]]

    return eigenvalues, eigenvectors




[docs]
def load_eigen(dataset,load_path):
    dir = os.path.join(load_path, dataset) 
    val_file_name = "eigenvalues.npy"
    vec_file_name = "eigenvectors.npy"

    eigvals_path = os.path.join(dir, val_file_name)
    eigvecs_path = os.path.join(dir, vec_file_name)
    eigenvalues = np.load(eigvals_path)
    eigenvectors = np.load(eigvecs_path)

    if dataset in ['ogbn-arxiv', 'flickr', 'reddit', 'twitch-gamer']:
        val_file_name = "eigenvalues_la.npy"
        vec_file_name = "eigenvectors_la.npy"
        eigvals_path = os.path.join(dir, val_file_name)
        eigvecs_path = os.path.join(dir, vec_file_name)
        eigenvalues_la = np.load(eigvals_path)
        eigenvectors_la = np.load(eigvecs_path)

        eigenvalues = np.hstack([eigenvalues, eigenvalues_la])
        eigenvectors = np.hstack([eigenvectors, eigenvectors_la])

    idx = np.argsort(eigenvalues)
    eigenvalues = eigenvalues[idx[:]]
    eigenvectors = eigenvectors[:, idx[:]]

    return eigenvalues, eigenvectors




[docs]
def get_train_lcc(idx_lcc, idx_train, y_full, num_nodes, num_classes):
    idx_train_lcc = list(set(idx_train).intersection(set(idx_lcc)))
    y_full = y_full.cpu().numpy()
    if len(idx_lcc) == num_nodes:
        idx_map = idx_train
    else:
        y_train = y_full[idx_train]
        y_train_lcc = y_full[idx_train_lcc]

        y_lcc_idx = list((set(range(num_nodes)) - set(idx_train)).intersection(set(idx_lcc)))
        y_lcc_ = y_full[y_lcc_idx]
        counter_train = Counter(y_train)
        counter_train_lcc = Counter(y_train_lcc)
        idx = np.arange(len(y_lcc_))
        for c in range(num_classes):
            num_c = counter_train[c] - counter_train_lcc[c]
            if num_c > 0:
                idx_c = list(idx[y_lcc_ == c])
                idx_c = np.array(y_lcc_idx)[idx_c]
                idx_train_lcc += list(np.random.permutation(idx_c)[:num_c])
        idx_map = [idx_lcc.index(i) for i in idx_train_lcc]

    return idx_train_lcc, idx_map