|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
|
import json |
|
|
from pathlib import Path |
|
|
from typing import Dict, List, Tuple |
|
|
|
|
|
import torch |
|
|
import torch.nn.functional as F |
|
|
from torch_geometric.datasets import Planetoid |
|
|
import torch_geometric.transforms as T |
|
|
|
|
|
def load_cora(normalize=True):
    """Load the Cora graph from the Planetoid dataset collection.

    Args:
        normalize: When truthy, ``T.NormalizeFeatures()`` is attached as the
            dataset transform; otherwise no transform is used.

    Returns:
        A ``(data, num_classes)`` tuple: the dataset's first (index 0) graph
        object and the dataset's ``num_classes`` attribute.
    """
    feature_transform = T.NormalizeFeatures() if normalize else None
    dataset = Planetoid(root="/tmp/Cora", name="Cora", transform=feature_transform)
    graph = dataset[0]
    return graph, dataset.num_classes
|
|
|
|
|
def read_seed_json(path: str, num_nodes: int) -> torch.Tensor:
    """Read a seed-cluster JSON file into a per-node cluster-id vector.

    The file must hold an object with a ``"clusters"`` list; every entry has a
    ``"members"`` list of node ids and an optional ``"cluster_id"``. An entry
    without an explicit id receives one more than the largest id seen so far
    (0 for the very first entry).

    Args:
        path: Location of the JSON file.
        num_nodes: Length of the returned vector.

    Returns:
        A ``torch.long`` tensor of shape ``[num_nodes]``. Nodes that appear in
        no cluster stay at ``-1``; member ids outside ``[0, num_nodes)`` are
        ignored; a node listed more than once keeps the id of the last
        cluster that mentions it.
    """
    payload = json.loads(Path(path).read_text())

    node_to_cluster: Dict[int, int] = {}
    next_default_id = 0
    for entry in payload["clusters"]:
        label = int(entry.get("cluster_id", next_default_id))
        if label + 1 > next_default_id:
            next_default_id = label + 1
        # Later clusters overwrite earlier ones for nodes listed twice.
        node_to_cluster.update((int(node), label) for node in entry["members"])

    labels = torch.full((num_nodes,), -1, dtype=torch.long)
    for node, label in node_to_cluster.items():
        if node < 0 or node >= num_nodes:
            continue  # silently drop ids that do not exist in the graph
        labels[node] = label
    return labels
|
|
|
|
|
def fix_uncovered_nodes(cluster_id: torch.Tensor) -> torch.Tensor:
    """Give every uncovered node (negative id) its own fresh singleton cluster.

    New ids start at ``max(cluster_id) + 1`` (or 0 when no node is covered)
    and are handed out in increasing node-index order.

    Args:
        cluster_id: 1-D tensor of cluster ids; negative entries mark nodes
            that belong to no cluster.

    Returns:
        The same tensor object, modified in place, with no negative entries.
    """
    uncovered = cluster_id < 0
    num_uncovered = int(uncovered.sum().item())
    if num_uncovered == 0:
        # Nothing to fix (this also covers the empty tensor).
        return cluster_id

    has_covered = bool((~uncovered).any())
    first_new_id = int(cluster_id.max().item()) + 1 if has_covered else 0

    # Boolean-mask assignment fills positions in ascending index order, which
    # matches handing out consecutive ids while scanning nodes 0..N-1.
    cluster_id[uncovered] = torch.arange(
        first_new_id,
        first_new_id + num_uncovered,
        dtype=cluster_id.dtype,
        device=cluster_id.device,
    )
    return cluster_id
|
|
|
|
|
def prototypes_from_partition(X: torch.Tensor, cluster_id: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    """Average the feature rows of ``X`` within each cluster.

    Args:
        X: Feature matrix; row ``i`` belongs to cluster ``cluster_id[i]``.
        cluster_id: Non-negative integer cluster id per row of ``X``.

    Returns:
        ``(protos, sizes)`` where ``protos`` has one row per id in
        ``[0, max(cluster_id)]`` holding that cluster's mean feature vector,
        and ``sizes`` holds the member counts cast to ``X.dtype``. Counts are
        clamped to at least 1, so an id with no members reports size 1 and an
        all-zero prototype.
    """
    num_clusters = int(cluster_id.max().item() + 1)
    feat_dim = X.size(1)

    member_counts = torch.bincount(cluster_id, minlength=num_clusters).to(X.device)

    feature_sums = torch.zeros(num_clusters, feat_dim, device=X.device, dtype=X.dtype)
    feature_sums.index_add_(0, cluster_id, X)

    # Clamp before dividing so ids without members do not divide by zero.
    safe_counts = member_counts.clamp_min(1).to(X.dtype)
    protos = feature_sums / safe_counts.unsqueeze(1)
    return protos, safe_counts
|
|
|
|
|
def weighted_kmeans(protos: torch.Tensor, weights: torch.Tensor, target_K: int, iters: int = 30, seed: int = 0) -> torch.Tensor:
    """Group seed prototypes into at most ``target_K`` meta-clusters.

    Centers are seeded k-means++ style (first center drawn proportionally to
    ``weights``, later ones proportionally to ``weights * squared distance to
    the nearest chosen center``) and then refined with weighted Lloyd
    iterations.

    Args:
        protos: ``[K0, F]`` seed prototypes.
        weights: ``[K0]`` positive weights (e.g. cluster sizes), same dtype
            as ``protos``.
        target_K: Requested number of meta-clusters; capped at ``K0``.
        iters: Maximum number of Lloyd iterations. With ``iters <= 0`` every
            prototype is assigned to meta-cluster 0.
        seed: Value passed to ``torch.manual_seed`` (this reseeds the global
            torch RNG).

    Returns:
        ``[K0]`` ``torch.long`` tensor of meta-cluster ids in
        ``[0, min(target_K, K0))``. Some ids may end up unused.

    Raises:
        ValueError: If there are no prototypes or ``target_K < 1``.
    """
    num_protos, feat_dim = protos.shape
    if num_protos == 0:
        raise ValueError("weighted_kmeans requires at least one prototype.")
    if target_K < 1:
        raise ValueError(f"target_K must be >= 1, got {target_K}.")

    torch.manual_seed(seed)
    target_K = min(target_K, num_protos)

    # --- k-means++ style seeding -------------------------------------------
    centers = torch.empty(target_K, feat_dim, device=protos.device, dtype=protos.dtype)

    first_prob = (weights / weights.sum()).clamp(min=1e-12)
    first_idx = torch.multinomial(first_prob, 1).item()
    centers[0] = protos[first_idx]

    # Squared distance from every prototype to its nearest chosen center.
    dist2 = (protos - centers[0:1]).pow(2).sum(dim=1)
    for k in range(1, target_K):
        # The clamp keeps the distribution valid even when all remaining
        # prototypes coincide with an already chosen center.
        prob = (weights * dist2).clamp(min=1e-12)
        prob = prob / prob.sum()
        idx = torch.multinomial(prob, 1).item()
        centers[k] = protos[idx]
        dist2 = torch.minimum(dist2, (protos - centers[k:k + 1]).pow(2).sum(dim=1))

    # --- weighted Lloyd iterations -----------------------------------------
    assign = torch.zeros(num_protos, dtype=torch.long, device=protos.device)
    weighted_protos = protos * weights.unsqueeze(1)  # loop-invariant
    previous = None
    for _ in range(iters):
        # cdist yields a [K0, target_K] matrix directly; broadcasting the
        # difference would materialize a [K0, target_K, F] tensor. The argmin
        # of the distance equals the argmin of the squared distance.
        dists = torch.cdist(protos, centers, compute_mode="donot_use_mm_for_euclid_dist")
        assign = dists.argmin(dim=1)

        # Unchanged assignments reproduce the same centers, so every further
        # iteration would be a no-op.
        if previous is not None and torch.equal(assign, previous):
            break
        previous = assign

        new_centers = torch.zeros_like(centers)
        mass = torch.zeros(target_K, device=protos.device, dtype=protos.dtype)
        new_centers.index_add_(0, assign, weighted_protos)
        mass.index_add_(0, assign, weights)

        occupied = mass > 0
        new_centers[occupied] = new_centers[occupied] / mass[occupied].unsqueeze(1).clamp_min(1e-12)

        # Meta-clusters that received no prototype keep their old center.
        centers = torch.where(occupied.unsqueeze(1), new_centers, centers)
    return assign
|
|
|
|
|
def majority_vote_upper_bound(cluster_id: torch.Tensor, y: torch.Tensor) -> float:
    """Accuracy reached when every cluster predicts its most frequent label.

    Args:
        cluster_id: 1-D tensor with one cluster id per node. Nodes with a
            negative id never count as correct but still count in the
            denominator.
        y: 1-D tensor of labels, aligned with ``cluster_id``.

    Returns:
        The sum of per-cluster majority-label counts divided by ``y.size(0)``,
        or ``0.0`` when there are no nodes.
    """
    total = y.size(0)
    if total == 0:
        return 0.0

    valid = cluster_id >= 0
    if not bool(valid.any()):
        return 0.0

    # Re-index clusters and labels densely so the contingency table only has
    # rows/columns for values that actually occur.
    clusters, cluster_idx = torch.unique(cluster_id[valid], return_inverse=True)
    classes, class_idx = torch.unique(y[valid], return_inverse=True)
    num_clusters = clusters.numel()
    num_classes = classes.numel()

    # One bincount over the flattened (cluster, class) pair replaces a
    # per-cluster boolean mask over all nodes.
    joint = cluster_idx * num_classes + class_idx
    table = torch.bincount(joint, minlength=num_clusters * num_classes)
    table = table.view(num_clusters, num_classes)

    correct = int(table.max(dim=1).values.sum().item())
    return correct / total
|
|
|
|
|
def cluster_size_stats(cluster_id: torch.Tensor) -> str:
    """Summarize the cluster-size distribution as a one-line string.

    Args:
        cluster_id: 1-D tensor of non-negative cluster ids, one per node.

    Returns:
        A string reporting the number of ids in ``[0, max(cluster_id)]``
        (ids without members count as size-0 clusters), the fraction of
        clusters with exactly one member, and the mean and median size.
    """
    num_ids = int(cluster_id.max().item() + 1)
    counts = torch.bincount(cluster_id, minlength=num_ids)
    sizes = counts.to(torch.float)

    K = sizes.numel()
    mean = sizes.mean().item()
    med = sizes.median().item()
    singletons = sizes.eq(1).float().mean().item()
    return f"K={K}, singleton_rate={singletons:.3f}, mean_size={mean:.2f}, median_size={med:.2f}"
|
|
|
|
|
def main():
    """Command-line entry point: coarsen a seed clustering of Cora.

    Reads seed clusters from ``--seeds_json``, gives uncovered nodes their own
    singleton clusters, merges the clusters down to the requested number with
    weighted k-means over the cluster mean-feature prototypes, prints size
    statistics and the majority-vote upper bound before and after, and writes
    the result to ``--out_json`` as ``{"clusters": [{"cluster_id", "members"}]}``.

    Raises:
        SystemExit: If neither ``--target_k`` nor ``--k_ratio`` is given, or
            the given value is not positive.
    """
    import math  # only needed here, for the k_ratio ceiling

    ap = argparse.ArgumentParser()
    ap.add_argument("--seeds_json", type=str, required=True)
    ap.add_argument("--out_json", type=str, required=True)
    ap.add_argument("--target_k", type=int, default=None, help="Exact target number of clusters.")
    ap.add_argument("--k_ratio", type=float, default=None, help="Use target_k = ceil(k_ratio * N).")
    ap.add_argument("--iters", type=int, default=30)
    ap.add_argument("--seed", type=int, default=0)
    args = ap.parse_args()

    # Validate the target before doing any expensive work.
    if args.target_k is None and args.k_ratio is None:
        raise SystemExit("Provide either --target_k or --k_ratio.")
    if args.target_k is not None:
        if args.target_k < 1:
            raise SystemExit("--target_k must be a positive integer.")
    elif not (math.isfinite(args.k_ratio) and args.k_ratio > 0):
        raise SystemExit("--k_ratio must be a positive, finite number.")

    data, _ = load_cora(normalize=True)
    N = data.num_nodes

    cluster_id = read_seed_json(args.seeds_json, N)
    if (cluster_id < 0).any():
        print("[warn] Some nodes uncovered by seeds. Assigning unique temp clusters to uncovered nodes.")
        cluster_id = fix_uncovered_nodes(cluster_id)

    print("Before:", cluster_size_stats(cluster_id))
    ub_before = majority_vote_upper_bound(cluster_id, data.y)
    print(f"Majority-vote UB (before) = {ub_before:.3f}")

    # --target_k takes precedence when both options are supplied.
    if args.target_k is not None:
        target_K = int(args.target_k)
    else:
        # The small tolerance keeps float noise (e.g. 271.00000000000006) from
        # rounding up a whole extra cluster; never go below one cluster.
        target_K = max(1, math.ceil(args.k_ratio * N - 1e-9))

    X = data.x.to(torch.float)
    protos, sizes = prototypes_from_partition(X, cluster_id)
    K0 = protos.size(0)
    if target_K >= K0:
        print(f"[info] target_K ({target_K}) >= current K ({K0}); nothing to coarsen. Copying input to output.")
        out_cluster_id = cluster_id
    else:
        assign = weighted_kmeans(protos, sizes.clamp_min(1), target_K, iters=args.iters, seed=args.seed)
        # Map every node through its seed cluster to the meta-cluster id.
        out_cluster_id = assign[cluster_id]

    print("After: ", cluster_size_stats(out_cluster_id))
    ub_after = majority_vote_upper_bound(out_cluster_id, data.y)
    print(f"Majority-vote UB (after)  = {ub_after:.3f}")

    # Collect members in one pass over the nodes; node ids within a cluster
    # come out in ascending order, and ids without members get an empty list.
    K_final = int(out_cluster_id.max().item() + 1)
    members_by_cluster: List[List[int]] = [[] for _ in range(K_final)]
    for node, k in enumerate(out_cluster_id.tolist()):
        members_by_cluster[k].append(node)
    clusters: List[Dict] = [
        {"cluster_id": k, "members": members}
        for k, members in enumerate(members_by_cluster)
    ]

    out = {"clusters": clusters}
    Path(args.out_json).write_text(json.dumps(out))
    print(f"Wrote coarsened seeds to {args.out_json}")
|
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|