|
|
""" |
|
|
make_edgelists.py |
|
|
|
|
|
Create a canonical edgelist (or a directory of edgelists). |
|
|
|
|
|
Usage |
|
|
----- |
|
|
python make_edgelists.py [--data_root <root>] <dataset_name> <edges_out> |
|
|
|
|
|
Arguments |
|
|
--------- |
|
|
dataset_name |
|
|
    The name of the PyG dataset: a Planetoid name (e.g. "Cora") or a TUDataset name (e.g. "MUTAG").
|
|
edges_out |
|
|
* If the dataset contains a single graph (e.g. Planetoid Cora) – this is a |
|
|
file path (`graph.txt`, `edges.txt`, …). |
|
|
* If the dataset contains many graphs (e.g. TUDataset) – this is a |
|
|
directory path where each graph is written as |
|
|
`graph_000000.txt`, `graph_000001.txt`, … |
|
|
|
|
|
Examples |
|
|
-------- |
|
|
# One‑graph dataset (Planetoid Cora) |
|
|
python make_edgelists.py Cora ./cora_edges.txt |
|
|
|
|
|
# Many‑graph dataset (TUDataset Facebook) |
|
|
python make_edgelists.py Facebook ./facebook_edgelists |
|
|
""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
import argparse |
|
|
from pathlib import Path |
|
|
from typing import Iterable, Tuple, Set |
|
|
|
|
|
|
|
|
|
|
|
def canonical_edges(edge_index) -> Set[Tuple[int, int]]:
    """Collapse a directed edge index into a set of undirected edges.

    ``edge_index`` must support ``.t().tolist()`` and yield ``[u, v]`` pairs.
    Self-loops (``u == v``) are discarded, and every remaining pair is stored
    as ``(smaller, larger)`` so both directions of an edge map to one entry.
    """
    pairs = edge_index.t().tolist()
    # min/max puts the smaller endpoint first without an explicit swap.
    return {(min(a, b), max(a, b)) for a, b in pairs if a != b}
|
|
|
|
|
|
|
|
def write_edges(out_file: Path, edges: Iterable[Tuple[int, int]]) -> None:
    """Dump ``edges`` to ``out_file`` in ascending order, one ``u v`` pair per line.

    Missing parent directories are created first. The file is opened in
    write mode, so any existing content is replaced.
    """
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with out_file.open("w") as handle:
        # The generator is consumed inside the `with`, so sorting happens
        # after the file has been opened.
        handle.writelines(f"{u} {v}\n" for u, v in sorted(edges))
|
|
|
|
|
|
|
|
def process_planetoid_dataset(root: Path, name: str, out_dir: Path | str) -> None:
    """Write the single graph of a Planetoid dataset as a canonical edgelist.

    Args:
        root: Directory handed to ``Planetoid`` as its dataset root.
        name: Planetoid dataset name (e.g. "Cora").
        out_dir: Output location. If it is an existing directory, the
            edgelist is written to ``graph_000000.txt`` inside it; otherwise
            it is used as the output file path itself.
    """
    # Function-scope import: torch_geometric is only needed when this runs.
    from torch_geometric.datasets import Planetoid

    ds = Planetoid(root=str(root), name=name)
    # Only the first element of the dataset is exported.
    edges = canonical_edges(ds[0].edge_index)

    # Normalise to Path up front: a plain string would otherwise be passed
    # straight to write_edges, which relies on `.parent` and `.open`.
    target = Path(out_dir)
    if target.is_dir():
        target = target / "graph_000000.txt"

    write_edges(target, edges)
|
|
|
|
|
|
|
|
|
|
|
def process_tudataset(root: Path, name: str, out_dir: Path):
    """Export every graph of a TUDataset as its own canonical edgelist file.

    The graph at position ``i`` (in dataset iteration order) is written to
    ``<out_dir>/graph_XXXXXX.txt``, where ``XXXXXX`` is ``i`` zero-padded to
    six digits. ``out_dir`` is created if it does not exist yet.
    """
    from torch_geometric.datasets import TUDataset

    dataset = TUDataset(root=str(root), name=name)
    # Create the directory even if the dataset turns out to hold no graphs.
    out_dir.mkdir(parents=True, exist_ok=True)

    for i, graph in enumerate(dataset):
        write_edges(out_dir / f"graph_{i:06d}.txt", canonical_edges(graph.edge_index))
|
|
|
|
|
|
|
|
# Names accepted by torch_geometric's Planetoid loader, lower-cased for a
# case-insensitive comparison.
_PLANETOID_NAMES = frozenset({"cora", "citeseer", "pubmed"})


def main() -> None:
    """Parse the command line and export the requested dataset's edgelist(s).

    A dataset whose name matches a Planetoid dataset (case-insensitively) is
    exported as a single edgelist via ``process_planetoid_dataset``. Any other
    name is treated as a TUDataset and exported as one file per graph via
    ``process_tudataset``.

    Raises:
        ValueError: If a multi-graph dataset is requested but ``edges_out``
            points at an existing regular file.
    """
    parser = argparse.ArgumentParser(
        # __doc__ is None when docstrings are stripped (python -OO).
        description=(__doc__ or "").strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--data_root", default="./data", help="Root directory for PyG datasets"
    )
    parser.add_argument("dataset_name", help="PyG dataset name (e.g. Cora)")
    parser.add_argument(
        "edges_out",
        help=(
            "File path (for single‑graph datasets) or directory "
            "(for multi‑graph datasets) to write the canonical edgelist(s)"
        ),
    )
    args = parser.parse_args()

    root = Path(args.data_root)
    out_path = Path(args.edges_out)

    # Decide the dataset family from the name instead of constructing a
    # Planetoid and catching whatever it raises: that probe hid real errors,
    # attempted a pointless download for every TUDataset name, and built each
    # dataset twice.
    if args.dataset_name.lower() in _PLANETOID_NAMES:
        process_planetoid_dataset(root, args.dataset_name, out_path)
        return

    # Validate the destination before any dataset is loaded.
    if out_path.is_file():
        raise ValueError(
            "For multi‑graph datasets (e.g. TUDataset) the output must be a directory"
        )
    process_tudataset(root, args.dataset_name, out_path)
|
|
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|