#!/usr/bin/env python3
# -*- coding: utf-8 -*-
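"""Build LRMC seeds across multiple levels.

The script invokes the external Java LRMC tool on an edgelist, then coarsens the
graph between levels (each cluster of the previous level becomes one node of the
next level's edgelist), and repeats for the requested number of levels.

Example invocations (paths and the jar passed via --java_opts are illustrative):

    # Single-graph mode
    python 1_build_lrmc_levels.py --input_edgelist edgelist.txt \\
        --out_dir out/lrmc --levels 3 --java_opts "-Xmx16g -cp my.jar"

    # Multi-graph mode
    python 1_build_lrmc_levels.py --graphs_dir graphs --glob "graph_*.txt" \\
        --out_dir out/lrmc --levels 3 --java_opts "-cp my.jar"
"""
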
import argparse
import glob
import json
import os
import re
import shutil
import subprocess
from collections import defaultdict
from typing import Dict, List, Tuple, Iterable, Set


def ensure_dir(p: str):
os.makedirs(p, exist_ok=True)


def read_edgelist(path: str) -> Iterable[Tuple[int, int]]:
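    """Yield undirected edges as (min, max) pairs from a whitespace-separated
    edge list, skipping comments, blank lines, self-loops, and malformed rows."""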
with open(path, 'r') as f:
for line in f:
s = line.strip()
if not s or s.startswith('#'):
continue
parts = s.split()
if len(parts) < 2:
continue
try:
                u, v = int(parts[0]), int(parts[1])
except ValueError:
continue
if u == v:
continue
a, b = (u, v) if u < v else (v, u)
yield a, b


def write_edgelist(path: str, edges: Iterable[Tuple[int, int]]):
with open(path, 'w') as f:
for u, v in edges:
f.write(f"{u} {v}\n")


def parse_seeds(path: str) -> Tuple[Dict[int, int], List[int]]:
"""
Return (node_to_cluster_index, sorted_cluster_ids).
The cluster indices are 0..C-1, sorted by cluster_id.
- On overlapping membership, choose the cluster with higher 'score', then smaller cluster_id.
"""
with open(path, 'r') as f:
js = json.load(f)
clusters = js.get('clusters', [])
# Sort clusters by cluster_id for stable indexing
clusters_sorted = sorted(clusters, key=lambda c: c.get('cluster_id', 0))
cluster_id_list = [c.get('cluster_id', i) for i, c in enumerate(clusters_sorted)]
cluster_id_to_idx = {cid: i for i, cid in enumerate(cluster_id_list)}
# Build node->(best_cluster_idx, best_score, best_cid)
node_choice: Dict[int, Tuple[int, float, int]] = {}
for c in clusters_sorted:
cid = c.get('cluster_id', None)
if cid is None:
continue
idx = cluster_id_to_idx[cid]
members = c.get('members', [])
score = float(c.get('score', 0.0))
for u in members:
prev = node_choice.get(u, None)
if prev is None or (score > prev[1]) or (score == prev[1] and cid < prev[2]):
node_choice[u] = (idx, score, cid)
node_to_cluster = {u: idx for u, (idx, score, cid) in node_choice.items()}
return node_to_cluster, cluster_id_list


def coarsen_edgelist(prev_edgelist: str, seeds_json: str, out_edgelist: str) -> int:
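    """Collapse each edge (u, v) of prev_edgelist into an edge between the clusters
    assigned to u and v by seeds_json, dropping intra-cluster edges and duplicates.
    Returns the number of edges skipped because an endpoint had no cluster."""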
node_to_cluster, cluster_id_list = parse_seeds(seeds_json)
edges_set: Set[Tuple[int, int]] = set()
    missing_edges = 0
for u, v in read_edgelist(prev_edgelist):
cu = node_to_cluster.get(u, None)
cv = node_to_cluster.get(v, None)
if cu is None or cv is None:
            # Endpoint with no cluster assignment: skip this edge and count it as missing
            missing_edges += 1
continue
if cu == cv:
continue
a, b = (cu, cv) if cu < cv else (cv, cu)
edges_set.add((a, b))
write_edgelist(out_edgelist, sorted(edges_set))
    return missing_edges


def run_java(java_exec: str, class_name: str, edgelist_path: str, out_json_path: str,
epsilon: str, java_opts: List[str]) -> None:
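    """Invoke the Java LRMC tool, which is expected to accept the positional
    arguments <edgelist_path> <out_json_path> <epsilon> after the class name."""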
cmd = [java_exec] + java_opts + [class_name, edgelist_path, out_json_path, epsilon]
print("[run]", " ".join(cmd))
subprocess.run(cmd, check=True)


def build_single_graph_levels(args):
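    """Single-graph mode: at each level, run the Java tool on the current edgelist to
    produce stage{lvl}/seeds.json, then (except at the last level) coarsen that edgelist
    into stage{lvl+1}/edgelist_{lvl+1}.txt for the next level."""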
ensure_dir(args.out_dir)
# Stage 0 edgelist is the input; optionally copy for record
stage0_dir = os.path.join(args.out_dir, "stage0")
ensure_dir(stage0_dir)
e0_copy = os.path.join(stage0_dir, "edgelist_0.txt")
if args.copy_inputs:
shutil.copyfile(args.input_edgelist, e0_copy)
prev_edgelist = args.input_edgelist
for lvl in range(args.levels):
stage_dir = os.path.join(args.out_dir, f"stage{lvl}")
ensure_dir(stage_dir)
seeds_out = os.path.join(stage_dir, "seeds.json")
# Run Java to produce seeds at this level
run_java(args.java, args.class_name, prev_edgelist, seeds_out, args.epsilon, args.java_opts)
# Prepare next-level edgelist (unless last level)
if lvl < args.levels - 1:
next_stage_dir = os.path.join(args.out_dir, f"stage{lvl+1}")
ensure_dir(next_stage_dir)
next_edgelist = os.path.join(next_stage_dir, f"edgelist_{lvl+1}.txt")
missing = coarsen_edgelist(prev_edgelist, seeds_out, next_edgelist)
if missing > 0:
print(f"[warn] stage{lvl}: {missing} edges had nodes missing from seeds; skipped.")
prev_edgelist = next_edgelist


def build_multigraph_levels(args):
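    """Multi-graph mode: process every file matching --glob independently. Stage 0 runs
    the Java tool on the original edgelists; each later stage coarsens the previous
    edgelist with the previous seeds into stage{lvl}/graph_{gid}.txt, then runs the Java
    tool again to produce stage{lvl}/graph_{gid}.json."""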
ensure_dir(args.out_dir)
# Enumerate graph files
graph_files = sorted(glob.glob(os.path.join(args.graphs_dir, args.glob)))
if not graph_files:
raise SystemExit(f"No graph files found in {args.graphs_dir} with pattern {args.glob}")
pattern = re.compile(r'(.*?)(\d+)(\.\w+)$') # capture numeric id
def graph_id_from_path(p: str) -> str:
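        """Extract a zero-padded numeric id from the filename, e.g.
        'graph_000042.txt' -> '000042'; fall back to the bare stem if no
        trailing number is found."""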
base = os.path.basename(p)
m = pattern.match(base)
if m:
return m.group(2).zfill(6) # zero-pad to 6 for consistency
# fallback: strip extension
stem = os.path.splitext(base)[0]
m2 = re.search(r'(\d+)$', stem)
return (m2.group(1).zfill(6) if m2 else stem)
# Stage 0: run Java for each graph
prev_stage_edgelists: Dict[str, str] = {}
for lvl in range(args.levels):
stage_dir = os.path.join(args.out_dir, f"stage{lvl}")
ensure_dir(stage_dir)
if lvl == 0:
for gpath in graph_files:
gid = graph_id_from_path(gpath)
seeds_out = os.path.join(stage_dir, f"graph_{gid}.json")
run_java(args.java, args.class_name, gpath, seeds_out, args.epsilon, args.java_opts)
prev_stage_edgelists[gid] = gpath
else:
# For each graph, coarsen previous edgelist using previous seeds, then run Java
for gpath in graph_files:
gid = graph_id_from_path(gpath)
prev_edgelist = prev_stage_edgelists[gid]
prev_seeds = os.path.join(args.out_dir, f"stage{lvl-1}", f"graph_{gid}.json")
next_edgelist = os.path.join(stage_dir, f"graph_{gid}.txt")
missing = coarsen_edgelist(prev_edgelist, prev_seeds, next_edgelist)
if missing > 0:
print(f"[warn] stage{lvl-1} graph_{gid}: {missing} edges had nodes missing from seeds; skipped.")
seeds_out = os.path.join(stage_dir, f"graph_{gid}.json")
run_java(args.java, args.class_name, next_edgelist, seeds_out, args.epsilon, args.java_opts)
prev_stage_edgelists[gid] = next_edgelist


def main():
ap = argparse.ArgumentParser(description="Build LRMC seeds across multiple levels by invoking the Java LRMC tool and coarsening between levels.")
mode = ap.add_mutually_exclusive_group(required=True)
mode.add_argument('--input_edgelist', type=str, help='Single-graph mode: path to original edgelist.txt')
mode.add_argument('--graphs_dir', type=str, help='Multi-graph mode: directory containing per-graph edgelist files (e.g., graph_000000.txt)')
ap.add_argument('--glob', type=str, default='graph_*.txt', help='Multi-graph mode: glob pattern for graph files (default: graph_*.txt)')
ap.add_argument('--out_dir', type=str, required=True, help='Output directory (stages will be created here)')
ap.add_argument('--levels', type=int, required=True, help='Number of levels to build (e.g., 3)')
# Java settings
ap.add_argument('--java', type=str, default='java', help='Java executable (default: java)')
ap.add_argument('--class_name', type=str, default='LRMCGenerateSingleCluster', help='Fully qualified Java class name')
ap.add_argument('--epsilon', type=str, default='1e6', help='Epsilon argument for the Java tool (default: 1e6)')
ap.add_argument('--java_opts', type=str, default='', help='Extra options for java (e.g., "-Xmx16g -cp my.jar")')
ap.add_argument('--copy_inputs', action='store_true', help='Copy original edgelist under stage0 for record (single-graph mode)')
args = ap.parse_args()
# Parse java_opts into a list if provided
args.java_opts = args.java_opts.split() if args.java_opts else []
if args.input_edgelist:
build_single_graph_levels(args)
else:
build_multigraph_levels(args)


if __name__ == '__main__':
main()