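"""Build LRMC seeds across multiple coarsening levels.

At each level the script invokes the Java LRMC tool on an edgelist, reads the
cluster seeds it writes as JSON, and (for all but the last level) contracts the
graph by those clusters to produce the next level's edgelist. It supports a
single-graph mode (--input_edgelist) and a multi-graph mode (--graphs_dir).

Illustrative invocation (the script name, paths, and jar below are placeholders):

    python build_levels.py --input_edgelist graph.txt --out_dir runs/demo --levels 3 --java_opts "-cp lrmc.jar"
"""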
import argparse
import glob
import json
import os
import re
import shutil
import subprocess
from typing import Dict, List, Tuple, Iterable, Set


def ensure_dir(p: str):
    os.makedirs(p, exist_ok=True)


def read_edgelist(path: str) -> Iterable[Tuple[int, int]]:
    """Yield undirected edges (u, v) with u < v, skipping comments, self-loops, and malformed lines."""
    with open(path, 'r') as f:
        for line in f:
            s = line.strip()
            if not s or s.startswith('#'):
                continue
            parts = s.split()
            if len(parts) < 2:
                continue
            try:
                u = int(parts[0])
                v = int(parts[1])
            except ValueError:
                continue
            if u == v:
                continue
            a, b = (u, v) if u < v else (v, u)
            yield a, b


def write_edgelist(path: str, edges: Iterable[Tuple[int, int]]):
    with open(path, 'w') as f:
        for u, v in edges:
            f.write(f"{u} {v}\n")
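

# The Java tool is expected to emit a JSON file of the shape consumed by
# parse_seeds() below (field names taken from the accesses in that function;
# the concrete values here are illustrative only):
#
#   {
#     "clusters": [
#       {"cluster_id": 0, "score": 0.9, "members": [1, 2, 3]},
#       {"cluster_id": 1, "score": 0.7, "members": [3, 4, 5]}
#     ]
#   }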
def parse_seeds(path: str) -> Tuple[Dict[int, int], List[int]]:
    """
    Return (node_to_cluster_index, sorted_cluster_ids).

    Cluster indices are 0..C-1, assigned after sorting clusters by cluster_id.
    When a node appears in several clusters, the cluster with the higher
    'score' wins; ties are broken by the smaller cluster_id.
    """
    with open(path, 'r') as f:
        js = json.load(f)
    clusters = js.get('clusters', [])

    clusters_sorted = sorted(clusters, key=lambda c: c.get('cluster_id', 0))
    cluster_id_list = [c.get('cluster_id', i) for i, c in enumerate(clusters_sorted)]
    cluster_id_to_idx = {cid: i for i, cid in enumerate(cluster_id_list)}

    # For each node keep the best (cluster_index, score, cluster_id) seen so far.
    node_choice: Dict[int, Tuple[int, float, int]] = {}
    for c in clusters_sorted:
        cid = c.get('cluster_id', None)
        if cid is None:
            continue
        idx = cluster_id_to_idx[cid]
        members = c.get('members', [])
        score = float(c.get('score', 0.0))
        for u in members:
            prev = node_choice.get(u, None)
            if prev is None or (score > prev[1]) or (score == prev[1] and cid < prev[2]):
                node_choice[u] = (idx, score, cid)

    node_to_cluster = {u: idx for u, (idx, score, cid) in node_choice.items()}
    return node_to_cluster, cluster_id_list


def coarsen_edgelist(prev_edgelist: str, seeds_json: str, out_edgelist: str) -> int:
    """Contract prev_edgelist by the clusters in seeds_json and write the resulting
    cluster-level edgelist to out_edgelist. Returns the number of edges skipped
    because at least one endpoint was not assigned to any cluster."""
    node_to_cluster, _ = parse_seeds(seeds_json)
    edges_set: Set[Tuple[int, int]] = set()
    missing_edges = 0
    for u, v in read_edgelist(prev_edgelist):
        cu = node_to_cluster.get(u, None)
        cv = node_to_cluster.get(v, None)
        if cu is None or cv is None:
            missing_edges += 1
            continue
        if cu == cv:
            # Both endpoints collapse into the same super-node: drop the edge.
            continue
        a, b = (cu, cv) if cu < cv else (cv, cu)
        edges_set.add((a, b))

    write_edgelist(out_edgelist, sorted(edges_set))
    return missing_edges


def run_java(java_exec: str, class_name: str, edgelist_path: str, out_json_path: str,
             epsilon: str, java_opts: List[str]) -> None:
    """Invoke the Java LRMC tool: <java> [java_opts] <class_name> <edgelist> <out_json> <epsilon>."""
    cmd = [java_exec] + java_opts + [class_name, edgelist_path, out_json_path, epsilon]
    print("[run]", " ".join(cmd))
    subprocess.run(cmd, check=True)
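

# Single-graph mode writes one directory per level, e.g. with --levels 3:
#
#   out_dir/stage0/seeds.json        seeds on the original edgelist
#   out_dir/stage0/edgelist_0.txt    copy of the input (only with --copy_inputs)
#   out_dir/stage1/edgelist_1.txt    coarsened graph, plus its seeds.json
#   out_dir/stage2/edgelist_2.txt    coarsened again, plus its seeds.json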
def build_single_graph_levels(args):
    """Run LRMC on a single input edgelist, coarsening between consecutive levels."""
    ensure_dir(args.out_dir)

    stage0_dir = os.path.join(args.out_dir, "stage0")
    ensure_dir(stage0_dir)
    e0_copy = os.path.join(stage0_dir, "edgelist_0.txt")
    if args.copy_inputs:
        shutil.copyfile(args.input_edgelist, e0_copy)

    prev_edgelist = args.input_edgelist
    for lvl in range(args.levels):
        stage_dir = os.path.join(args.out_dir, f"stage{lvl}")
        ensure_dir(stage_dir)
        seeds_out = os.path.join(stage_dir, "seeds.json")

        run_java(args.java, args.class_name, prev_edgelist, seeds_out, args.epsilon, args.java_opts)

        # Coarsen for the next level, except after the final one.
        if lvl < args.levels - 1:
            next_stage_dir = os.path.join(args.out_dir, f"stage{lvl+1}")
            ensure_dir(next_stage_dir)
            next_edgelist = os.path.join(next_stage_dir, f"edgelist_{lvl+1}.txt")
            missing = coarsen_edgelist(prev_edgelist, seeds_out, next_edgelist)
            if missing > 0:
                print(f"[warn] stage{lvl}: {missing} edges had nodes missing from seeds; skipped.")
            prev_edgelist = next_edgelist
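

# Multi-graph mode applies the same per-level scheme to every file matched by
# --graphs_dir/--glob; per graph <id> it writes (illustrative, with --levels 2):
#
#   out_dir/stage0/graph_<id>.json   seeds on the original graph
#   out_dir/stage1/graph_<id>.txt    coarsened edgelist
#   out_dir/stage1/graph_<id>.json   seeds on the coarsened edgelist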
def build_multigraph_levels(args):
    """Run LRMC per graph file in args.graphs_dir, coarsening each graph between levels."""
    ensure_dir(args.out_dir)

    graph_files = sorted(glob.glob(os.path.join(args.graphs_dir, args.glob)))
    if not graph_files:
        raise SystemExit(f"No graph files found in {args.graphs_dir} with pattern {args.glob}")

    pattern = re.compile(r'(.*?)(\d+)(\.\w+)$')

    def graph_id_from_path(p: str) -> str:
        """Extract a zero-padded numeric id from the file name; fall back to the bare stem."""
        base = os.path.basename(p)
        m = pattern.match(base)
        if m:
            return m.group(2).zfill(6)
        stem = os.path.splitext(base)[0]
        m2 = re.search(r'(\d+)$', stem)
        return (m2.group(1).zfill(6) if m2 else stem)

    prev_stage_edgelists: Dict[str, str] = {}
    for lvl in range(args.levels):
        stage_dir = os.path.join(args.out_dir, f"stage{lvl}")
        ensure_dir(stage_dir)

        if lvl == 0:
            # Level 0: run the Java tool directly on each original graph.
            for gpath in graph_files:
                gid = graph_id_from_path(gpath)
                seeds_out = os.path.join(stage_dir, f"graph_{gid}.json")
                run_java(args.java, args.class_name, gpath, seeds_out, args.epsilon, args.java_opts)
                prev_stage_edgelists[gid] = gpath
        else:
            # Later levels: coarsen each graph with the previous level's seeds, then rerun the tool.
            for gpath in graph_files:
                gid = graph_id_from_path(gpath)
                prev_edgelist = prev_stage_edgelists[gid]
                prev_seeds = os.path.join(args.out_dir, f"stage{lvl-1}", f"graph_{gid}.json")
                next_edgelist = os.path.join(stage_dir, f"graph_{gid}.txt")
                missing = coarsen_edgelist(prev_edgelist, prev_seeds, next_edgelist)
                if missing > 0:
                    print(f"[warn] stage{lvl-1} graph_{gid}: {missing} edges had nodes missing from seeds; skipped.")

                seeds_out = os.path.join(stage_dir, f"graph_{gid}.json")
                run_java(args.java, args.class_name, next_edgelist, seeds_out, args.epsilon, args.java_opts)
                prev_stage_edgelists[gid] = next_edgelist


def main():
    ap = argparse.ArgumentParser(description="Build LRMC seeds across multiple levels by invoking the Java LRMC tool and coarsening between levels.")
    mode = ap.add_mutually_exclusive_group(required=True)
    mode.add_argument('--input_edgelist', type=str, help='Single-graph mode: path to original edgelist.txt')
    mode.add_argument('--graphs_dir', type=str, help='Multi-graph mode: directory containing per-graph edgelist files (e.g., graph_000000.txt)')
    ap.add_argument('--glob', type=str, default='graph_*.txt', help='Multi-graph mode: glob pattern for graph files (default: graph_*.txt)')
    ap.add_argument('--out_dir', type=str, required=True, help='Output directory (stages will be created here)')
    ap.add_argument('--levels', type=int, required=True, help='Number of levels to build (e.g., 3)')

    ap.add_argument('--java', type=str, default='java', help='Java executable (default: java)')
    ap.add_argument('--class_name', type=str, default='LRMCGenerateSingleCluster', help='Fully qualified Java class name')
    ap.add_argument('--epsilon', type=str, default='1e6', help='Epsilon argument for the Java tool (default: 1e6)')
    ap.add_argument('--java_opts', type=str, default='', help='Extra options for java (e.g., "-Xmx16g -cp my.jar")')
    ap.add_argument('--copy_inputs', action='store_true', help='Copy original edgelist under stage0 for record (single-graph mode)')
    args = ap.parse_args()

    args.java_opts = args.java_opts.split() if args.java_opts else []

    if args.input_edgelist:
        build_single_graph_levels(args)
    else:
        build_multigraph_levels(args)


if __name__ == '__main__':
    main()