Source code for graphslim.condensation.utils

from graphslim.utils import *
import torch_geometric
import math
from numpy.linalg import eigh
from scipy.sparse.linalg import eigsh
from collections import Counter
from torch_geometric.data import Data
from torch_geometric.utils import convert
import networkx as nx


[docs] def match_loss(gw_syn, gw_real, args, device): """ Computes the loss between synthetic and real gradients based on the specified distance metric. Parameters ---------- gw_syn : list of torch.Tensor List of synthetic gradients for different model parameters. gw_real : list of torch.Tensor List of real gradients for different model parameters. args : Namespace Arguments object containing hyperparameters for training and model. device : torch.device Device (CPU or GPU) on which computations are performed. Returns ------- torch.Tensor The computed distance (loss) between synthetic and real gradients. """ dis = torch.tensor(0.0).to(device) if args.dis_metric == 'ours': for ig in range(len(gw_real)): gwr = gw_real[ig] gws = gw_syn[ig] dis += distance_wb(gwr, gws) elif args.dis_metric == 'mse': gw_real_vec = [] gw_syn_vec = [] for ig in range(len(gw_real)): gw_real_vec.append(gw_real[ig].reshape((-1))) gw_syn_vec.append(gw_syn[ig].reshape((-1))) gw_real_vec = torch.cat(gw_real_vec, dim=0) gw_syn_vec = torch.cat(gw_syn_vec, dim=0) dis = torch.sum((gw_syn_vec - gw_real_vec) ** 2) elif args.dis_metric == 'cos': gw_real_vec = [] gw_syn_vec = [] for ig in range(len(gw_real)): gw_real_vec.append(gw_real[ig].reshape((-1))) gw_syn_vec.append(gw_syn[ig].reshape((-1))) gw_real_vec = torch.cat(gw_real_vec, dim=0) gw_syn_vec = torch.cat(gw_syn_vec, dim=0) dis = 1 - torch.sum(gw_real_vec * gw_syn_vec, dim=-1) / ( torch.norm(gw_real_vec, dim=-1) * torch.norm(gw_syn_vec, dim=-1) + 0.000001) else: exit('DC error: unknown distance function') return dis
[docs] def distance_wb(gwr, gws): """ Computes the distance between two tensors representing gradients using cosine similarity. Parameters ---------- gwr : torch.Tensor The real gradient tensor. gws : torch.Tensor The synthetic gradient tensor. Returns ------- torch.Tensor The computed distance between the real and synthetic gradients. """ shape = gwr.shape if len(gwr.shape) == 2: gwr = gwr.T gws = gws.T if len(shape) == 4: # conv, out*in*h*w gwr = gwr.reshape(shape[0], shape[1] * shape[2] * shape[3]) gws = gws.reshape(shape[0], shape[1] * shape[2] * shape[3]) elif len(shape) == 3: # layernorm, C*h*w gwr = gwr.reshape(shape[0], shape[1] * shape[2]) gws = gws.reshape(shape[0], shape[1] * shape[2]) elif len(shape) == 2: # linear, out*in tmp = 'do nothing' elif len(shape) == 1: # batchnorm/instancenorm, C; groupnorm x, bias gwr = gwr.reshape(1, shape[0]) gws = gws.reshape(1, shape[0]) return 0 dis_weight = torch.sum( 1 - torch.sum(gwr * gws, dim=-1) / (torch.norm(gwr, dim=-1) * torch.norm(gws, dim=-1) + 0.000001)) dis = dis_weight return dis
# GCSNTK utils
[docs] def sub_E(idx, A): """ Generates a sparse adjacency matrix of the subgraph defined by the given indices. Parameters ---------- idx : torch.Tensor A tensor containing the indices of the nodes that define the subgraph. A : torch.Tensor The original adjacency matrix of the graph. Returns ------- torch.sparse_coo_tensor The sparse adjacency matrix of the subgraph. """ n = A.shape[0] n_neig = len(idx) operator = torch.zeros([n, n_neig]) operator[idx, range(n_neig)] = 1 sub_A = torch.matmul(torch.matmul(operator.t(), A), operator) ind = torch.where(sub_A != 0) inds = torch.cat([ind[0], ind[1]]).reshape(2, len(ind[0])) values = torch.ones(len(ind[0])) sub_E = torch.sparse_coo_tensor(inds, values, torch.Size([n_neig, n_neig])).to(A.device) return sub_E
[docs] def update_E(x_s, neig): """ Update the adjacency matrix based on the features of the nodes and the average number of neighbors. Parameters ---------- x_s : torch.Tensor A tensor containing the feature vectors of the nodes. neig : float The average number of neighbors each node should have. Returns ------- torch.sparse_coo_tensor The sparse adjacency matrix based on the updated similarities. """ n = x_s.shape[0] K = torch.empty(n, n) A = torch.zeros(n * n) for i in range(n): for j in range(i, n): K[i, j] = torch.norm(x_s[i] - x_s[j]) K[j, i] = K[i, j] edge = int(n + torch.round(torch.tensor(neig * n / 2))) if (edge % 2) != 0: edge += 1 else: pass Simil = torch.flatten(K) _, indices = torch.sort(Simil) A[indices[0:edge]] = 1 A = A.reshape(n, n) ind = torch.where(A == 1) ind = torch.cat([ind[0], ind[1]]).reshape(2, edge) values = torch.ones(edge) E = torch.sparse_coo_tensor(ind, values, torch.Size([n, n])).to(x_s.device) return E
# utils for GCSNTK
[docs] def normalize_data(data): """ Normalize the input data using mean and standard deviation. Parameters ---------- data : torch.Tensor The data to be normalized. Each column represents a feature, and normalization is applied to each feature independently. Returns ------- torch.Tensor The normalized data where each feature has zero mean and unit variance. """ mean = data.mean(dim=0) std = data.std(dim=0) std[std == 0] = 1 normalized_data = (data - mean) / std return normalized_data
[docs] def GCF(adj, x, k=1): """ Apply Graph Convolution Filter (GCF) to features using the adjacency matrix. Parameters ---------- adj : torch.Tensor Adjacency matrix of the graph. It must include self-loops. Shape: (N, N), where N is the number of nodes. x : torch.Tensor Node features. Shape: (N, F), where F is the number of features for each node. k : int, optional Number of hops (or layers) to apply the filter. Default is 1. Returns ------- torch.Tensor Filtered features after applying the graph convolution. Shape: (N, F). """ D = torch.sum(adj, dim=1) D = torch.pow(D, -0.5) D = torch.diag(D) filter = torch.matmul(torch.matmul(D, adj), D) for i in range(k): x = torch.matmul(filter, x) return x
# geom
[docs] def neighborhood_difficulty_measurer(data, adj, label): """ Measure the difficulty of neighborhoods in the graph based on the label distribution. Parameters ---------- data : Data PyG Data object containing node features and labels. adj : torch.Tensor Sparse adjacency matrix of the graph. The shape is (N, N) where N is the number of nodes. label : torch.Tensor Tensor containing the label of each node. Shape: (N,) Returns ------- torch.Tensor Difficulty scores for each node. Higher scores indicate more difficult neighborhoods. """ edge_index = adj.coalesce().indices() edge_value = adj.coalesce().values() neighbor_label, _ = torch_geometric.utils.add_self_loops(edge_index) # [[1, 1, 1, 1],[2, 3, 4, 5]] neighbor_label[1] = label[neighbor_label[1]] # [[1, 1, 1, 1],[40, 20, 19, 21]] neighbor_label = torch.transpose(neighbor_label, 0, 1) # [[1, 40], [1, 20], [1, 19], [1, 21]] index, count = torch.unique(neighbor_label, sorted=True, return_counts=True, dim=0) neighbor_class = torch.sparse_coo_tensor(index.T, count) neighbor_class = neighbor_class.to_dense().float() neighbor_class = neighbor_class[data.idx_train] neighbor_class = F.normalize(neighbor_class, 1.0, 1) neighbor_entropy = -1 * neighbor_class * torch.log(neighbor_class + torch.exp(torch.tensor(-20))) # 防止log里面是0出现异常 local_difficulty = neighbor_entropy.sum(1) return local_difficulty
[docs] def difficulty_measurer(data, adj, label): """ Measure the difficulty of nodes in the graph based on their neighborhood label distribution. Parameters ---------- data : Data PyG Data object containing node features and labels. adj : torch.Tensor Sparse adjacency matrix of the graph. The shape is (N, N) where N is the number of nodes. label : torch.Tensor Tensor containing the label of each node. Shape: (N,) Returns ------- torch.Tensor Difficulty scores for each node. Higher scores indicate more difficult nodes. """ local_difficulty = neighborhood_difficulty_measurer(data, adj, label) # global_difficulty = feature_difficulty_measurer(data, label, embedding) node_difficulty = local_difficulty return node_difficulty
[docs] def sort_training_nodes(data, adj, label): """ Sort training nodes based on their difficulty measured by neighborhood label distribution. Parameters ---------- data : Data PyG Data object containing node features and labels. adj : torch.Tensor Sparse adjacency matrix of the graph (shape: N x N) with self-loops. label : torch.Tensor Tensor containing the label of each node (shape: N,). Returns ------- numpy.ndarray Indices of the training nodes sorted by their difficulty, from easiest to hardest. """ node_difficulty = difficulty_measurer(data, adj, label) _, indices = torch.sort(node_difficulty) indices = indices.cpu().numpy() sorted_trainset = data.idx_train[indices] return sorted_trainset
[docs] def neighborhood_difficulty_measurer_in(data, adj, label): """ Measure the difficulty of each node in a graph based on the entropy of neighbor labels. Parameters ---------- data : Data PyG Data object containing node features and labels. adj : torch.Tensor Sparse adjacency matrix of the graph (shape: N x N) with self-loops. label : torch.Tensor Tensor containing the label of each node (shape: N,). Returns ------- torch.Tensor Tensor of local difficulty scores for each node. """ edge_index = adj.coalesce().indices() edge_value = adj.coalesce().values() neighbor_label, _ = torch_geometric.utils.add_self_loops(edge_index) # [[1, 1, 1, 1],[2, 3, 4, 5]] neighbor_label[1] = label[neighbor_label[1]] # [[1, 1, 1, 1],[40, 20, 19, 21]] neighbor_label = torch.transpose(neighbor_label, 0, 1) # [[1, 40], [1, 20], [1, 19], [1, 21]] index, count = torch.unique(neighbor_label, sorted=True, return_counts=True, dim=0) neighbor_class = torch.sparse_coo_tensor(index.T, count) neighbor_class = neighbor_class.to_dense().float() neighbor_class = F.normalize(neighbor_class, 1.0, 1) neighbor_entropy = -1 * neighbor_class * torch.log(neighbor_class + torch.exp(torch.tensor(-20))) # 防止log里面是0出现异常 local_difficulty = neighbor_entropy.sum(1) return local_difficulty
[docs] def difficulty_measurer_in(data, adj, label): """ Measure the difficulty of each node in a graph based on local entropy of neighbor labels. Parameters ---------- data : Data PyG Data object containing node features and labels. adj : torch.Tensor Sparse adjacency matrix of the graph (shape: N x N) with self-loops. label : torch.Tensor Tensor containing the label of each node (shape: N,). Returns ------- torch.Tensor Tensor of local difficulty scores for each node. """ local_difficulty = neighborhood_difficulty_measurer_in(data, adj, label) # global_difficulty = feature_difficulty_measurer(data, label, embedding) node_difficulty = local_difficulty return node_difficulty
[docs] def sort_training_nodes_in(data, adj, label): """ Sort training nodes based on their difficulty scores in ascending order. Parameters ---------- data : Data PyG Data object containing node features and labels. adj : torch.Tensor Sparse adjacency matrix of the graph (shape: N x N) with self-loops. label : torch.Tensor Tensor containing the label of each node (shape: N,). Returns ------- numpy.ndarray Indices of training nodes sorted by difficulty scores. """ node_difficulty = difficulty_measurer_in(data, adj, label) _, indices = torch.sort(node_difficulty) indices = indices.cpu().numpy() return indices
[docs] def training_scheduler(lam, t, T, scheduler='geom'): """ Adjust the value of a parameter based on the chosen scheduling strategy. Parameters ---------- lam : float The initial value or a baseline value for the parameter (0 <= lam <= 1). t : int The current training iteration or epoch. T : int The total number of training iterations or epochs. scheduler : str, optional The type of scheduling strategy to use. Options are 'linear', 'root', or 'geom'. Default is 'geom'. Returns ------- float The adjusted value of the parameter at iteration `t` based on the scheduling strategy. """ if scheduler == 'linear': return min(1, lam + (1 - lam) * t / T) elif scheduler == 'root': return min(1, math.sqrt(lam ** 2 + (1 - lam ** 2) * t / T)) elif scheduler == 'geom': return min(1, 2 ** (math.log2(lam) - math.log2(lam) * t / T))
[docs] def get_largest_cc(adj, num_nodes, data_name): mx = (adj).tocoo() edge_index = torch.LongTensor(np.vstack([mx.row, mx.col])) data = Data(edge_index=edge_index, num_nodes=num_nodes) nx_graph = convert.to_networkx(data, to_undirected=True) largest_cc = max(nx.connected_components(nx_graph), key=len) idx_lcc = list(largest_cc) largest_cc_graph = nx_graph.subgraph(largest_cc) if len(idx_lcc) == num_nodes: adj_lcc = adj adj_norm_lcc = gcn_normalize_adj(adj_lcc) else: adj_lcc = nx.to_scipy_sparse_array(largest_cc_graph) adj_norm_lcc = gcn_normalize_adj(adj_lcc) return idx_lcc, adj_norm_lcc, adj_lcc
[docs] def get_syn_eigen(real_eigenvals, real_eigenvecs, eigen_k, ratio, step=1): k1 = math.ceil(eigen_k * ratio) k2 = eigen_k - k1 print("k1:", k1, ",", "k2:", k2) k1_end = (k1 - 1) * step + 1 eigen_sum = real_eigenvals.shape[0] k2_end = eigen_sum - (k2 - 1) * step - 1 k1_list = range(0, k1_end, step) k2_list = range(k2_end, eigen_sum, step) eigenvals = torch.cat( [real_eigenvals[k1_list], real_eigenvals[k2_list]] ) eigenvecs = torch.cat( [real_eigenvecs[:, k1_list], real_eigenvecs[:, k2_list]], dim=1, ) return eigenvals, eigenvecs
[docs] def get_subspace_embed(eigenvecs, x): x_trans = eigenvecs.T @ x # kd u_unsqueeze = (eigenvecs.T).unsqueeze(2) # kn1 x_trans_unsqueeze = x_trans.unsqueeze(1) # k1d sub_embed = torch.bmm(u_unsqueeze, x_trans_unsqueeze) # kn1 @ k1d = knd return x_trans, sub_embed
[docs] def get_subspace_covariance_matrix(eigenvecs, x): x_trans = eigenvecs.T @ x # kd x_trans = F.normalize(input=x_trans, p=2, dim=1) x_trans_unsqueeze = x_trans.unsqueeze(1) # k1d co_matrix = torch.bmm(x_trans_unsqueeze.permute(0, 2, 1), x_trans_unsqueeze) # kd1 @ k1d = kdd return co_matrix
[docs] def get_embed_sum(eigenvals, eigenvecs, x): x_trans = eigenvecs.T @ x # kd x_trans = torch.diag(1 - eigenvals) @ x_trans # kd embed_sum = eigenvecs @ x_trans # nk @ kd = nd return embed_sum
[docs] def get_embed_mean(embed_sum, label): class_matrix = F.one_hot(label).float() # nc class_matrix = class_matrix.T # cn embed_sum = class_matrix @ embed_sum # cd mean_weight = (1 / class_matrix.sum(1)).unsqueeze(-1) # c1 embed_mean = mean_weight * embed_sum embed_mean = F.normalize(input=embed_mean, p=2, dim=1) return embed_mean
[docs] def get_eigh(laplacian_matrix, data_name, save=True): dir = os.path.join("./data", data_name) if not os.path.isdir(dir): os.makedirs(dir) val_file_name = "eigenvalues.npy" vec_file_name = "eigenvectors.npy" eigvals_path = os.path.join(dir, val_file_name) eigvecs_path = os.path.join(dir, vec_file_name) if os.path.exists(eigvals_path) and os.path.exists(eigvecs_path): eigenvalues = np.load(eigvals_path) eigenvectors = np.load(eigvecs_path) else: if data_name in ['ogbn-arxiv', 'flickr', 'reddit', 'twitch-gamer']: eigenvalues, eigenvectors = eigsh(A=laplacian_matrix, k=1000, which="SA", tol=1e-5) else: if sp.issparse(laplacian_matrix): laplacian_matrix = laplacian_matrix.todense() eigenvalues, eigenvectors = eigh(laplacian_matrix) if save: np.save(eigvals_path, eigenvalues) np.save(eigvecs_path, eigenvectors) if data_name in ['ogbn-arxiv', 'flickr', 'reddit', 'twitch-gamer']: val_file_name = "eigenvalues_la.npy" vec_file_name = "eigenvectors_la.npy" eigvals_path = os.path.join(dir, val_file_name) eigvecs_path = os.path.join(dir, vec_file_name) if os.path.exists(eigvals_path) and os.path.exists(eigvecs_path): eigenvalues_la = np.load(eigvals_path) eigenvectors_la = np.load(eigvecs_path) else: eigenvalues_la, eigenvectors_la = eigsh(A=laplacian_matrix, k=1000, which="LA", tol=1e-5) if save: np.save(eigvals_path, eigenvalues_la) np.save(eigvecs_path, eigenvectors_la) eigenvalues = np.hstack([eigenvalues, eigenvalues_la]) eigenvectors = np.hstack([eigenvectors, eigenvectors_la]) idx = np.argsort(eigenvalues) eigenvalues = eigenvalues[idx[:]] eigenvectors = eigenvectors[:, idx[:]] return eigenvalues, eigenvectors
[docs] def load_eigen(dataset,load_path): dir = os.path.join(load_path, dataset) val_file_name = "eigenvalues.npy" vec_file_name = "eigenvectors.npy" eigvals_path = os.path.join(dir, val_file_name) eigvecs_path = os.path.join(dir, vec_file_name) eigenvalues = np.load(eigvals_path) eigenvectors = np.load(eigvecs_path) if dataset in ['ogbn-arxiv', 'flickr', 'reddit', 'twitch-gamer']: val_file_name = "eigenvalues_la.npy" vec_file_name = "eigenvectors_la.npy" eigvals_path = os.path.join(dir, val_file_name) eigvecs_path = os.path.join(dir, vec_file_name) eigenvalues_la = np.load(eigvals_path) eigenvectors_la = np.load(eigvecs_path) eigenvalues = np.hstack([eigenvalues, eigenvalues_la]) eigenvectors = np.hstack([eigenvectors, eigenvectors_la]) idx = np.argsort(eigenvalues) eigenvalues = eigenvalues[idx[:]] eigenvectors = eigenvectors[:, idx[:]] return eigenvalues, eigenvectors
[docs] def get_train_lcc(idx_lcc, idx_train, y_full, num_nodes, num_classes): idx_train_lcc = list(set(idx_train).intersection(set(idx_lcc))) y_full = y_full.cpu().numpy() if len(idx_lcc) == num_nodes: idx_map = idx_train else: y_train = y_full[idx_train] y_train_lcc = y_full[idx_train_lcc] y_lcc_idx = list((set(range(num_nodes)) - set(idx_train)).intersection(set(idx_lcc))) y_lcc_ = y_full[y_lcc_idx] counter_train = Counter(y_train) counter_train_lcc = Counter(y_train_lcc) idx = np.arange(len(y_lcc_)) for c in range(num_classes): num_c = counter_train[c] - counter_train_lcc[c] if num_c > 0: idx_c = list(idx[y_lcc_ == c]) idx_c = np.array(y_lcc_idx)[idx_c] idx_train_lcc += list(np.random.permutation(idx_c)[:num_c]) idx_map = [idx_lcc.index(i) for i in idx_train_lcc] return idx_train_lcc, idx_map