"""
make_edgelists.py

Create a canonical edgelist (or a directory of edgelists).

Usage
-----
    python make_edgelists.py [--data_root <root>] <dataset_name> <edges_out>

Arguments
---------
dataset_name
    The name of a PyG Planetoid or TUDataset dataset (e.g. "Cora", "MUTAG").
edges_out
    * If the dataset contains a single graph (e.g. Planetoid Cora) – this is a
      file path (`graph.txt`, `edges.txt`, …).
    * If the dataset contains many graphs (e.g. TUDataset) – this is a
      directory path where each graph is written as
      `graph_000000.txt`, `graph_000001.txt`, …

Examples
--------
# One‑graph dataset (Planetoid Cora)
python make_edgelists.py Cora ./cora_edges.txt

# Many‑graph dataset (TUDataset Facebook)
python make_edgelists.py Facebook ./facebook_edgelists
"""

from __future__ import annotations

import argparse
from pathlib import Path
from typing import Iterable, Tuple, Set

# -------------------------------------------------------------

def canonical_edges(edge_index) -> Set[Tuple[int, int]]:
    """Collapse an edge index into a set of unique undirected edges.

    ``edge_index`` must support ``.t().tolist()`` and yield ``[u, v]`` pairs
    (presumably a ``2 x E`` PyG edge tensor -- confirm against callers).
    Self-loops are discarded, and every remaining pair is stored exactly once
    with the smaller endpoint first.
    """
    endpoint_pairs = edge_index.t().tolist()
    return {
        # Order the endpoints so (a, b) and (b, a) map to the same tuple.
        (dst, src) if src > dst else (src, dst)
        for src, dst in endpoint_pairs
        if src != dst
    }


def write_edges(out_file: Path, edges: Iterable[Tuple[int, int]]) -> None:
    """Dump ``edges`` to ``out_file`` as one ``u v`` line per edge, sorted.

    Missing parent directories are created first; an existing file at
    ``out_file`` is overwritten.
    """
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with out_file.open("w") as sink:
        # Sorting makes the output independent of the iterable's order.
        sink.writelines(f"{a} {b}\n" for a, b in sorted(edges))


def process_planetoid_dataset(root: Path, name: str, out_dir: Path | str) -> None:
    """Write the canonical edgelist of a single-graph Planetoid dataset.

    Only the first graph (``ds[0]``) is exported; Planetoid datasets are
    expected to hold exactly one graph.

    Parameters
    ----------
    root
        Directory handed to PyG as the dataset root.
    name
        Dataset name, forwarded unchanged to ``Planetoid``.
    out_dir
        Destination as a ``Path`` or string. If it is an existing directory
        the edgelist goes to ``<out_dir>/graph_000000.txt``; otherwise the
        value itself is used as the output file path.
    """
    # Function-scope import: torch_geometric is only loaded when this runs.
    from torch_geometric.datasets import Planetoid

    # Normalise once so string destinations behave exactly like Path ones
    # (previously a str fell through and failed later on `.parent`).
    destination = Path(out_dir)

    ds = Planetoid(root=str(root), name=name)
    data = ds[0]  # the only graph
    edges = canonical_edges(data.edge_index)

    if destination.is_dir():
        out_file = destination / "graph_000000.txt"
    else:
        out_file = destination

    # Nothing is printed; the edgelist is only written to disk.
    write_edges(out_file, edges)


def process_tudataset(root: Path, name: str, out_dir: Path):
    """Export every graph of a TUDataset to ``<out_dir>/graph_XXXXXX.txt``.

    ``root`` and ``name`` are forwarded to ``TUDataset``. ``out_dir`` is
    created (including parents) if needed, and graph ``i`` is written to a
    file whose name carries ``i`` zero-padded to six digits.
    """
    from torch_geometric.datasets import TUDataset

    graphs = TUDataset(root=str(root), name=name)
    out_dir.mkdir(parents=True, exist_ok=True)

    for index, graph in enumerate(graphs):
        target = out_dir / f"graph_{index:06d}.txt"
        write_edges(target, canonical_edges(graph.edge_index))


def main() -> None:
    """Parse the command line and export the requested dataset's edgelist(s).

    The dataset family is chosen from the dataset name: the Planetoid
    citation graphs go through ``process_planetoid_dataset`` and write a
    single edgelist; every other name is treated as a TUDataset and goes
    through ``process_tudataset``, writing one file per graph.

    Raises
    ------
    ValueError
        If a multi-graph dataset is requested but ``edges_out`` points at an
        existing regular file instead of a directory.
    """
    # Names documented for PyG's Planetoid loader, matched case-insensitively.
    planetoid_names = frozenset({"cora", "citeseer", "pubmed"})

    parser = argparse.ArgumentParser(
        # `__doc__` is None under `python -OO`; fall back to an empty string.
        description=(__doc__ or "").strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--data_root", default="./data", help="Root directory for PyG datasets"
    )
    parser.add_argument("dataset_name", help="PyG dataset name (e.g. Cora)")
    parser.add_argument(
        "edges_out",
        help=(
            "File path (for single‑graph datasets) or directory "
            "(for multi‑graph datasets) to write the canonical edgelist(s)"
        ),
    )
    args = parser.parse_args()

    root = Path(args.data_root)
    out_path = Path(args.edges_out)

    # Dispatch on the name instead of trial-constructing Planetoid inside a
    # blanket `except Exception`: that probe attempted a download for every
    # non-Planetoid name, rerouted genuine Planetoid failures to TUDataset,
    # and built each dataset twice.
    if args.dataset_name.lower() in planetoid_names:
        process_planetoid_dataset(root, args.dataset_name, out_path)
        return

    # Validate the destination before any dataset download starts.
    if out_path.is_file():
        raise ValueError(
            "For multi‑graph datasets (e.g. TUDataset) the output must be a directory"
        )
    process_tudataset(root, args.dataset_name, out_path)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()