clique / src /export_edgelist.py
qingy2024's picture
Upload folder using huggingface_hub
f74dd01 verified
"""
make_edgelists.py
Create a canonical edgelist (or a directory of edgelists).
Usage
-----
python make_edgelists.py [--data_root <root>] <dataset_name> <edges_out>
Arguments
---------
dataset_name
The name of the PyG dataset (e.g. "Cora", "TUDatasetName", etc.).
edges_out
* If the dataset contains a single graph (e.g. Planetoid Cora) – this is a
file path (`graph.txt`, `edges.txt`, …).
* If the dataset contains many graphs (e.g. TUDataset) – this is a
directory path where each graph is written as
`graph_000000.txt`, `graph_000001.txt`, …
Examples
--------
# One‑graph dataset (Planetoid Cora)
python make_edgelists.py Cora ./cora_edges.txt
# Many‑graph dataset (TUDataset Facebook)
python make_edgelists.py Facebook ./facebook_edgelists
"""
from __future__ import annotations
import argparse
from pathlib import Path
from typing import Iterable, Tuple, Set
# -------------------------------------------------------------
def canonical_edges(edge_index) -> Set[Tuple[int, int]]:
    """Collapse an edge index into a set of undirected ``(u, v)`` pairs.

    ``edge_index`` is transposed and converted to a list, so each of its
    columns is read as one ``(u, v)`` pair. Self-loops (``u == v``) are
    dropped, and every remaining pair is stored with the smaller endpoint
    first so that both directions of an edge map to the same tuple.
    """
    pairs = edge_index.t().tolist()
    return {
        (a, b) if a < b else (b, a)
        for a, b in pairs
        if a != b
    }
def write_edges(out_file: Path, edges: Iterable[Tuple[int, int]]) -> None:
    """Dump ``edges`` to ``out_file`` as sorted ``u v`` lines.

    The parent directory of ``out_file`` is created when missing, and any
    existing file at that path is overwritten.
    """
    out_file.parent.mkdir(parents=True, exist_ok=True)
    with out_file.open("w") as handle:
        # Sorting happens after the file is opened, exactly as before.
        handle.writelines(f"{a} {b}\n" for a, b in sorted(edges))
def process_planetoid_dataset(root: Path, name: str, out_dir: Path | str):
    """Export the single graph of a Planetoid dataset as a canonical edgelist.

    Parameters
    ----------
    root
        Directory passed to ``Planetoid`` as its ``root``.
    name
        Name of the Planetoid dataset to load.
    out_dir
        Output location. If it is an existing directory the edgelist is
        written to ``<out_dir>/graph_000000.txt``; otherwise the path is
        used as the output file itself. Plain strings are accepted and
        converted to ``Path``.

    Nothing is printed to stdout; the edgelist is only written to disk.
    """
    from torch_geometric.datasets import Planetoid

    # Normalise once so a ``str`` argument behaves like a ``Path`` instead of
    # failing later when ``write_edges`` accesses ``.parent``.
    target = Path(out_dir)

    ds = Planetoid(root=str(root), name=name)
    data = ds[0]  # the only graph
    edges = canonical_edges(data.edge_index)

    out_file = target / "graph_000000.txt" if target.is_dir() else target
    write_edges(out_file, edges)
def process_tudataset(root: Path, name: str, out_dir: Path):
    """Export every graph of a TUDataset to ``<out_dir>/graph_XXXXXX.txt``.

    The dataset is loaded from ``root``, ``out_dir`` is created if needed,
    and each graph is written as a canonical edgelist to a file named after
    its zero-padded, six-digit position in the dataset.
    """
    from torch_geometric.datasets import TUDataset

    dataset = TUDataset(root=str(root), name=name)
    # Create the directory up front so it exists even for an empty dataset.
    out_dir.mkdir(parents=True, exist_ok=True)

    for position, graph in enumerate(dataset):
        undirected = canonical_edges(graph.edge_index)
        write_edges(out_dir / f"graph_{position:06d}.txt", undirected)
# Dataset names documented for PyG's ``Planetoid`` loader, lower-cased so the
# command-line spelling can be matched case-insensitively.
_PLANETOID_NAMES = frozenset({"cora", "citeseer", "pubmed"})


def main() -> None:
    """Parse the command line and export the requested dataset's edgelist(s).

    Names listed in ``_PLANETOID_NAMES`` are exported as a single graph via
    ``process_planetoid_dataset``; every other name is treated as a
    TUDataset and exported graph-by-graph via ``process_tudataset``.

    Raises
    ------
    ValueError
        If a TUDataset is requested but ``edges_out`` is an existing file.
    """
    parser = argparse.ArgumentParser(
        # ``__doc__`` is None when docstrings are stripped (``python -OO``).
        description=(__doc__ or "").strip(),
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--data_root", default="./data", help="Root directory for PyG datasets"
    )
    parser.add_argument("dataset_name", help="PyG dataset name (e.g. Cora)")
    parser.add_argument(
        "edges_out",
        help=(
            "File path (for single‑graph datasets) or directory "
            "(for multi‑graph datasets) to write the canonical edgelist(s)"
        ),
    )
    args = parser.parse_args()

    root = Path(args.data_root)
    out_path = Path(args.edges_out)

    # Decide by name rather than by trial-loading the dataset as Planetoid
    # inside a blanket ``except``: probing triggers a load attempt for every
    # non-Planetoid name and hides the real error when a Planetoid load fails.
    if args.dataset_name.lower() in _PLANETOID_NAMES:
        process_planetoid_dataset(root, args.dataset_name, out_path)
        return

    # TUDataset: one file per graph, so the target must not be a regular file.
    if out_path.is_file():
        raise ValueError(
            "For multi‑graph datasets (e.g. TUDataset) the output must be a directory"
        )
    process_tudataset(root, args.dataset_name, out_path)
if __name__ == "__main__":
main()