#!/usr/bin/env bash
# Run the MegaBlocks ROCm kernel test suite against the staged build tree.
set -euo pipefail

# Resolve the kernel directory from this script's location and work from there.
KERNEL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
cd "$KERNEL_DIR"
export KERNEL_DIR

# Determine the build variant (e.g. torch*-rocm64-*), preferring the helper in
# kernels.utils and falling back to scanning build/ directly.
detect_variant() {
  python - <<'PY'
import os
import pathlib

root = pathlib.Path(os.environ["KERNEL_DIR"])
build_dir = root / "build"
variant = None

try:
    from kernels.utils import build_variant as _build_variant
except Exception:
    _build_variant = None

if _build_variant is not None:
    try:
        variant = _build_variant()
    except Exception:
        variant = None

if variant is None:
    # glob() returns a generator, which is always truthy; materialize each
    # pattern so the CUDA fallback is actually reachable when no ROCm build
    # directory exists.
    candidates = sorted(build_dir.glob("torch*-rocm64-*")) or sorted(build_dir.glob("torch*-cu*"))
    if candidates:
        variant = candidates[0].name

if variant is None:
    raise SystemExit("Could not determine MegaBlocks build variant. Run build.py first.")

print(variant)
PY
}

VARIANT=$(detect_variant)
STAGED_DIR="$KERNEL_DIR/build/$VARIANT"

# Locate the staged extension library under either of the layouts build.py may
# produce.
find_staged_lib() {
  local base="$1"
  local candidates=(
    "$base/_megablocks_rocm.so"
    "$base/megablocks/_megablocks_rocm.so"
  )
  for path in "${candidates[@]}"; do
    if [[ -f "$path" ]]; then
      echo "$path"
      return 0
    fi
  done
  return 1
}

# Rebuild once if the staged extension is missing, then re-detect the variant.
STAGED_LIB=$(find_staged_lib "$STAGED_DIR") || true
if [[ -z "${STAGED_LIB:-}" ]]; then
  echo "Staged ROCm extension not found under $STAGED_DIR; rebuilding kernels..."
  python build.py
  VARIANT=$(detect_variant)
  STAGED_DIR="$KERNEL_DIR/build/$VARIANT"
  STAGED_LIB=$(find_staged_lib "$STAGED_DIR") || true
  if [[ -z "${STAGED_LIB:-}" ]]; then
    echo "ERROR: build.py completed but no extension was found under $STAGED_DIR" >&2
    exit 1
  fi
fi

# Prepend the staged tree to PYTHONPATH. Avoid a trailing ':' when PYTHONPATH
# is unset, since an empty entry makes Python add the current directory.
export PYTHONPATH="$STAGED_DIR${PYTHONPATH:+:$PYTHONPATH}"
echo "Using MegaBlocks build variant: $VARIANT"

# Count visible accelerators via torch (covers both HIP and CUDA backends).
declare -i GPU_COUNT
GPU_COUNT=$(python - <<'PY'
import torch

print(torch.cuda.device_count() if torch.cuda.is_available() else 0)
PY
)

if (( GPU_COUNT == 0 )); then
  echo "ERROR: No HIP/CUDA GPUs detected. Tests require at least one visible accelerator." >&2
  exit 1
fi
echo "Detected $GPU_COUNT visible GPU(s)."

log() {
  echo
  echo "==> $1"
}

# Run one labelled pytest invocation, echoing the exact command via set -x.
# Under set -e a failing test aborts the whole script (fail fast); the braced
# `set +x` keeps its own trace line out of the output.
run_pytest() {
  local label="$1"
  shift
  log "$label"
  set -x
  "$@"
  { set +x; } 2>/dev/null || true
}

# Environment blocks for the three visibility configurations. Both HIP_ and
# CUDA_VISIBLE_DEVICES are set so the same script works on ROCm and CUDA.
SINGLE_GPU_ENV=(HIP_VISIBLE_DEVICES=0 CUDA_VISIBLE_DEVICES=0 WORLD_SIZE=1)
MULTI2_GPU_ENV=(HIP_VISIBLE_DEVICES=0,1 CUDA_VISIBLE_DEVICES=0,1 WORLD_SIZE=2)
MULTI8_GPU_ENV=(HIP_VISIBLE_DEVICES=$(seq -s, 0 7) CUDA_VISIBLE_DEVICES=$(seq -s, 0 7) WORLD_SIZE=8)

SINGLE_TESTS=(
  "test_mb_moe.py"
  "test_mb_moe_shared_expert.py"
  "layer_test.py"
  "test_gg.py"
  "ops_test.py"
)

for test in "${SINGLE_TESTS[@]}"; do
  run_pytest "Single-GPU pytest ${test}" \
    env "${SINGLE_GPU_ENV[@]}" python -m pytest "tests/${test}" -q
done
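
# Hypothetical convenience, not part of the original flow: append ad-hoc
# single-GPU test files via an EXTRA_SINGLE_TESTS variable (an assumed name,
# as is the example file below) without editing SINGLE_TESTS, e.g.
#   EXTRA_SINGLE_TESTS="soft_moe_test.py" bash <this script>
# Word splitting on the unquoted expansion is intended; the loop is a no-op
# when the variable is unset or empty.
for test in ${EXTRA_SINGLE_TESTS:-}; do
  run_pytest "Single-GPU pytest ${test} (extra)" \
    env "${SINGLE_GPU_ENV[@]}" python -m pytest "tests/${test}" -q
done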

if (( GPU_COUNT >= 2 )); then
  run_pytest "Distributed layer smoke (2 GPUs)" \
    env "${MULTI2_GPU_ENV[@]}" python -m pytest "tests/parallel_layer_test.py::test_megablocks_moe_mlp_functionality" -q
else
  log "Skipping 2-GPU distributed layer test (requires >=2 GPUs, detected ${GPU_COUNT})."
fi

# The shared-expert multi tests are parametrized by world size; the [1] cases
# run everywhere, the [8] cases only on a full 8-GPU node. Node ids are
# single-quoted so the brackets are not treated as globs.
run_pytest "Shared expert functionality (world_size=1)" \
  env "${SINGLE_GPU_ENV[@]}" python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_functionality[1]' -q

run_pytest "Shared expert weighted sum (world_size=1)" \
  env "${SINGLE_GPU_ENV[@]}" python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_weighted_sum[1]' -q

if (( GPU_COUNT >= 8 )); then
  run_pytest "Shared expert functionality (world_size=8)" \
    env "${MULTI8_GPU_ENV[@]}" python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_functionality[8]' -q
  run_pytest "Shared expert weighted sum (world_size=8)" \
    env "${MULTI8_GPU_ENV[@]}" python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_weighted_sum[8]' -q
else
  log "Skipping 8-GPU shared expert tests (requires >=8 GPUs, detected ${GPU_COUNT})."
fi

echo
echo "All requested tests completed."
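
# Manual reproduction sketch (assumes the staged layout produced above): to
# rerun one parametrized case by hand outside this script, something like
#   PYTHONPATH="build/<variant>" HIP_VISIBLE_DEVICES=0 CUDA_VISIBLE_DEVICES=0 WORLD_SIZE=1 \
#     python -m pytest 'tests/test_mb_moe_shared_expert_multi.py::test_shared_expert_distributed_functionality[1]' -q
# from the kernel directory should match what run_pytest executes.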