"""
make_edgelists.py

Create a canonical edgelist (or a directory of edgelists).

Usage
-----
    python make_edgelists.py <dataset_name> <edges_out> [--data_root <path>]

Arguments
---------
dataset_name
    The name of the PyG dataset (e.g. "Cora", "TUDatasetName", etc.).
edges_out
    * If the dataset contains a single graph (e.g. Planetoid Cora) – this is
      a file path (`graph.txt`, `edges.txt`, …).
    * If the dataset contains many graphs (e.g. TUDataset) – this is a
      directory path where each graph is written as `graph_000000.txt`,
      `graph_000001.txt`, …
--data_root
    Root directory for PyG datasets (default: ./data).

Examples
--------
    # One‑graph dataset (Planetoid Cora)
    python make_edgelists.py Cora ./cora_edges.txt

    # Many‑graph dataset (TUDataset Facebook)
    python make_edgelists.py Facebook ./facebook_edgelists
"""
from __future__ import annotations

import argparse
from pathlib import Path
from typing import Iterable, Tuple, Set


# -------------------------------------------------------------
def canonical_edges(edge_index) -> Set[Tuple[int, int]]:
    """Return the set of undirected ``(u, v)`` pairs with ``u <= v``.

    Parameters
    ----------
    edge_index
        A 2 x E structure whose first row holds source node ids and whose
        second row holds target node ids. Anything exposing ``tolist()``
        (such as a tensor or array) is converted first; a plain pair of
        sequences is used as-is.

    Returns
    -------
    set of (int, int)
        One entry per undirected edge; ``(a, b)`` and ``(b, a)`` collapse
        into a single pair ordered smallest-first.
    """
    rows = edge_index.tolist() if hasattr(edge_index, "tolist") else edge_index
    sources, targets = rows

    seen: Set[Tuple[int, int]] = set()
    for u, v in zip(sources, targets):
        u, v = int(u), int(v)
        # Order each pair so both directions of an edge map to the same key.
        if u > v:
            u, v = v, u
        # NOTE(review): self-loops are kept as (u, u); confirm whether they
        # should be dropped for downstream consumers.
        seen.add((u, v))
    return seen


def write_edges(out_file: Path, edges: Iterable[Tuple[int, int]]) -> None:
    """Write `u v` per line to `out_file`.

    Parent directories are created when missing, and the edges are written
    in sorted order so the output is deterministic.
    """
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with out_file.open("w", encoding="utf-8") as f:
        f.writelines(f"{u} {v}\n" for u, v in sorted(edges))


def process_planetoid_dataset(root: Path, name: str, out_dir: Path | str):
    """Planetoid datasets contain a single graph.

    Parameters
    ----------
    root
        Root directory handed to the Planetoid loader.
    name
        Planetoid dataset name.
    out_dir
        Destination. When it is an existing directory the edgelist goes to
        ``graph_000000.txt`` inside it; otherwise it is used as the output
        file path itself.
    """
    from torch_geometric.datasets import Planetoid

    ds = Planetoid(root=str(root), name=name)
    data = ds[0]  # the only graph
    edges = canonical_edges(data.edge_index)

    # Accept plain strings as well as Path objects.
    out_path = Path(out_dir)
    out_file = out_path / "graph_000000.txt" if out_path.is_dir() else out_path
    write_edges(out_file, edges)
    # No output to stdout – the edgelist(s) are written to disk


def process_tudataset(root: Path, name: str, out_dir: Path):
    """TUDataset may contain many graphs – write each to <out_dir>/graph_XXXXXX.txt.

    Parameters
    ----------
    root
        Root directory handed to the TUDataset loader.
    name
        TUDataset name.
    out_dir
        Directory that receives one edgelist per graph; created if missing.
    """
    from torch_geometric.datasets import TUDataset

    ds = TUDataset(root=str(root), name=name)
    out_dir.mkdir(parents=True, exist_ok=True)
    for i, data in enumerate(ds):
        edges = canonical_edges(data.edge_index)
        out_file = out_dir / f"graph_{i:06d}.txt"
        write_edges(out_file, edges)


def _detect_dataset_type(root: Path, name: str) -> str:
    """Return "Planetoid" if `name` loads as a Planetoid dataset, else "TUDataset"."""
    # Imported outside the ``try`` so a missing torch_geometric install is
    # reported directly instead of being mistaken for "not a Planetoid".
    from torch_geometric.datasets import Planetoid

    try:
        Planetoid(root=str(root), name=name)
    except Exception:  # pragma: no cover – normal branch failure
        return "TUDataset"
    return "Planetoid"


def main() -> None:
    """Parse the command line and write the requested edgelist(s) to disk.

    Raises
    ------
    ValueError
        If a multi-graph dataset is requested but the output path is an
        existing regular file.
    """
    parser = argparse.ArgumentParser(
        # __doc__ is None under ``python -OO``; fall back to an empty string.
        description=(__doc__ or "").strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--data_root", default="./data", help="Root directory for PyG datasets"
    )
    parser.add_argument("dataset_name", help="PyG dataset name (e.g. Cora)")
    parser.add_argument(
        "edges_out",
        help=(
            "File path (for single‑graph datasets) or directory "
            "(for multi‑graph datasets) to write the canonical edgelist(s)"
        ),
    )
    args = parser.parse_args()

    root = Path(args.data_root)
    out_path = Path(args.edges_out)

    # We try to guess whether the requested dataset is a Planetoid or TUDataset.
    # If it can be loaded as a Planetoid we use that; otherwise we fall back to TUDataset.
    dataset_type = _detect_dataset_type(root, args.dataset_name)

    # Dispatch
    if dataset_type == "Planetoid":
        process_planetoid_dataset(root, args.dataset_name, out_path)
        return

    # TUDataset: validate the destination before any dataset is fetched.
    if out_path.is_file():
        raise ValueError(
            "For multi‑graph datasets (e.g. TUDataset) the output must be a directory"
        )
    process_tudataset(root, args.dataset_name, out_path)


if __name__ == "__main__":
    main()