#!/usr/bin/env python3 # -*- coding: utf-8 -*- import argparse import glob import json import os import re import shutil import subprocess from collections import defaultdict from typing import Dict, List, Tuple, Iterable, Set def ensure_dir(p: str): os.makedirs(p, exist_ok=True) def read_edgelist(path: str) -> Iterable[Tuple[int, int]]: with open(path, 'r') as f: for line in f: s = line.strip() if not s or s.startswith('#'): continue parts = s.split() if len(parts) < 2: continue try: u = int(parts[0]); v = int(parts[1]) except ValueError: continue if u == v: continue a, b = (u, v) if u < v else (v, u) yield a, b def write_edgelist(path: str, edges: Iterable[Tuple[int, int]]): with open(path, 'w') as f: for u, v in edges: f.write(f"{u} {v}\n") def parse_seeds(path: str) -> Tuple[Dict[int, int], List[int]]: """ Return (node_to_cluster_index, sorted_cluster_ids). The cluster indices are 0..C-1, sorted by cluster_id. - On overlapping membership, choose the cluster with higher 'score', then smaller cluster_id. """ with open(path, 'r') as f: js = json.load(f) clusters = js.get('clusters', []) # Sort clusters by cluster_id for stable indexing clusters_sorted = sorted(clusters, key=lambda c: c.get('cluster_id', 0)) cluster_id_list = [c.get('cluster_id', i) for i, c in enumerate(clusters_sorted)] cluster_id_to_idx = {cid: i for i, cid in enumerate(cluster_id_list)} # Build node->(best_cluster_idx, best_score, best_cid) node_choice: Dict[int, Tuple[int, float, int]] = {} for c in clusters_sorted: cid = c.get('cluster_id', None) if cid is None: continue idx = cluster_id_to_idx[cid] members = c.get('members', []) score = float(c.get('score', 0.0)) for u in members: prev = node_choice.get(u, None) if prev is None or (score > prev[1]) or (score == prev[1] and cid < prev[2]): node_choice[u] = (idx, score, cid) node_to_cluster = {u: idx for u, (idx, score, cid) in node_choice.items()} return node_to_cluster, cluster_id_list def coarsen_edgelist(prev_edgelist: str, seeds_json: str, out_edgelist: str) -> int: node_to_cluster, cluster_id_list = parse_seeds(seeds_json) edges_set: Set[Tuple[int, int]] = set() missing_nodes = 0 for u, v in read_edgelist(prev_edgelist): cu = node_to_cluster.get(u, None) cv = node_to_cluster.get(v, None) if cu is None or cv is None: # If a node isn't present in any cluster JSON, skip or count as missing missing_nodes += 1 continue if cu == cv: continue a, b = (cu, cv) if cu < cv else (cv, cu) edges_set.add((a, b)) write_edgelist(out_edgelist, sorted(edges_set)) return missing_nodes def run_java(java_exec: str, class_name: str, edgelist_path: str, out_json_path: str, epsilon: str, java_opts: List[str]) -> None: cmd = [java_exec] + java_opts + [class_name, edgelist_path, out_json_path, epsilon] print("[run]", " ".join(cmd)) subprocess.run(cmd, check=True) def build_single_graph_levels(args): ensure_dir(args.out_dir) # Stage 0 edgelist is the input; optionally copy for record stage0_dir = os.path.join(args.out_dir, "stage0") ensure_dir(stage0_dir) e0_copy = os.path.join(stage0_dir, "edgelist_0.txt") if args.copy_inputs: shutil.copyfile(args.input_edgelist, e0_copy) prev_edgelist = args.input_edgelist for lvl in range(args.levels): stage_dir = os.path.join(args.out_dir, f"stage{lvl}") ensure_dir(stage_dir) seeds_out = os.path.join(stage_dir, "seeds.json") # Run Java to produce seeds at this level run_java(args.java, args.class_name, prev_edgelist, seeds_out, args.epsilon, args.java_opts) # Prepare next-level edgelist (unless last level) if lvl < args.levels - 1: next_stage_dir = os.path.join(args.out_dir, f"stage{lvl+1}") ensure_dir(next_stage_dir) next_edgelist = os.path.join(next_stage_dir, f"edgelist_{lvl+1}.txt") missing = coarsen_edgelist(prev_edgelist, seeds_out, next_edgelist) if missing > 0: print(f"[warn] stage{lvl}: {missing} edges had nodes missing from seeds; skipped.") prev_edgelist = next_edgelist def build_multigraph_levels(args): ensure_dir(args.out_dir) # Enumerate graph files graph_files = sorted(glob.glob(os.path.join(args.graphs_dir, args.glob))) if not graph_files: raise SystemExit(f"No graph files found in {args.graphs_dir} with pattern {args.glob}") pattern = re.compile(r'(.*?)(\d+)(\.\w+)$') # capture numeric id def graph_id_from_path(p: str) -> str: base = os.path.basename(p) m = pattern.match(base) if m: return m.group(2).zfill(6) # zero-pad to 6 for consistency # fallback: strip extension stem = os.path.splitext(base)[0] m2 = re.search(r'(\d+)$', stem) return (m2.group(1).zfill(6) if m2 else stem) # Stage 0: run Java for each graph prev_stage_edgelists: Dict[str, str] = {} for lvl in range(args.levels): stage_dir = os.path.join(args.out_dir, f"stage{lvl}") ensure_dir(stage_dir) if lvl == 0: for gpath in graph_files: gid = graph_id_from_path(gpath) seeds_out = os.path.join(stage_dir, f"graph_{gid}.json") run_java(args.java, args.class_name, gpath, seeds_out, args.epsilon, args.java_opts) prev_stage_edgelists[gid] = gpath else: # For each graph, coarsen previous edgelist using previous seeds, then run Java for gpath in graph_files: gid = graph_id_from_path(gpath) prev_edgelist = prev_stage_edgelists[gid] prev_seeds = os.path.join(args.out_dir, f"stage{lvl-1}", f"graph_{gid}.json") next_edgelist = os.path.join(stage_dir, f"graph_{gid}.txt") missing = coarsen_edgelist(prev_edgelist, prev_seeds, next_edgelist) if missing > 0: print(f"[warn] stage{lvl-1} graph_{gid}: {missing} edges had nodes missing from seeds; skipped.") seeds_out = os.path.join(stage_dir, f"graph_{gid}.json") run_java(args.java, args.class_name, next_edgelist, seeds_out, args.epsilon, args.java_opts) prev_stage_edgelists[gid] = next_edgelist def main(): ap = argparse.ArgumentParser(description="Build LRMC seeds across multiple levels by invoking the Java LRMC tool and coarsening between levels.") mode = ap.add_mutually_exclusive_group(required=True) mode.add_argument('--input_edgelist', type=str, help='Single-graph mode: path to original edgelist.txt') mode.add_argument('--graphs_dir', type=str, help='Multi-graph mode: directory containing per-graph edgelist files (e.g., graph_000000.txt)') ap.add_argument('--glob', type=str, default='graph_*.txt', help='Multi-graph mode: glob pattern for graph files (default: graph_*.txt)') ap.add_argument('--out_dir', type=str, required=True, help='Output directory (stages will be created here)') ap.add_argument('--levels', type=int, required=True, help='Number of levels to build (e.g., 3)') # Java settings ap.add_argument('--java', type=str, default='java', help='Java executable (default: java)') ap.add_argument('--class_name', type=str, default='LRMCGenerateSingleCluster', help='Fully qualified Java class name') ap.add_argument('--epsilon', type=str, default='1e6', help='Epsilon argument for the Java tool (default: 1e6)') ap.add_argument('--java_opts', type=str, default='', help='Extra options for java (e.g., "-Xmx16g -cp my.jar")') ap.add_argument('--copy_inputs', action='store_true', help='Copy original edgelist under stage0 for record (single-graph mode)') args = ap.parse_args() # Parse java_opts into a list if provided args.java_opts = args.java_opts.split() if args.java_opts else [] if args.input_edgelist: build_single_graph_levels(args) else: build_multigraph_levels(args) if __name__ == '__main__': main()