quantization / cutlass_w8a8 /scaled_mm_c3x_sm90_int8_dispatch.cuh

Sync with vLLM

0da5bf5 4 months ago

5.55 kB

	#pragma once

	#include "scaled_mm_c3x.cuh"

	/**
	* This file defines Gemm kernel configurations for SM90 (int8) based on the
	* Gemm shape.
	*/

	namespace vllm {

	template <typename InType, typename OutType,
	template <typename, typename, typename> typename Epilogue>
	struct sm90_int8_config_default {
	// For M > 128 and any N
	static_assert(std::is_same<InType, int8_t>());
	using KernelSchedule =
	typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
	using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
	using TileShape = Shape<_128, _128, _128>;
	using ClusterShape = Shape<_2, _1, _1>;
	using Cutlass3xGemm =
	cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
	KernelSchedule, EpilogueSchedule>;
	};

	template <typename InType, typename OutType,
	template <typename, typename, typename> typename Epilogue>
	struct sm90_int8_config_M128 {
	// For M in (64, 128] and any N
	static_assert(std::is_same<InType, int8_t>());
	using KernelSchedule =
	typename cutlass::gemm::KernelTmaWarpSpecializedPingpong;
	using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
	using TileShape = Shape<_64, _128, _128>;
	using ClusterShape = Shape<_2, _1, _1>;
	using Cutlass3xGemm =
	cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
	KernelSchedule, EpilogueSchedule>;
	};

	template <typename InType, typename OutType,
	template <typename, typename, typename> typename Epilogue>
	struct sm90_int8_config_M64 {
	// For M in (32, 64] and any N
	static_assert(std::is_same<InType, int8_t>());
	using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
	using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
	using TileShape = Shape<_64, _64, _256>;
	using ClusterShape = Shape<_1, _1, _1>;
	using Cutlass3xGemm =
	cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
	KernelSchedule, EpilogueSchedule>;
	};

	template <typename InType, typename OutType,
	template <typename, typename, typename> typename Epilogue>
	struct sm90_int8_config_M32_NBig {
	// For M in [1, 32] and N >= 8192
	static_assert(std::is_same<InType, int8_t>());
	using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
	using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
	using TileShape = Shape<_64, _128, _256>;
	using ClusterShape = Shape<_1, _4, _1>;
	using Cutlass3xGemm =
	cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
	KernelSchedule, EpilogueSchedule>;
	};

	template <typename InType, typename OutType,
	template <typename, typename, typename> typename Epilogue>
	struct sm90_int8_config_M32_NSmall {
	// For M in [1, 32] and N < 8192
	static_assert(std::is_same<InType, int8_t>());
	using KernelSchedule = typename cutlass::gemm::KernelTmaWarpSpecialized;
	using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
	using TileShape = Shape<_64, _64, _256>;
	using ClusterShape = Shape<_1, _8, _1>;
	using Cutlass3xGemm =
	cutlass_3x_gemm<InType, OutType, Epilogue, TileShape, ClusterShape,
	KernelSchedule, EpilogueSchedule>;
	};

	template <typename InType, typename OutType,
	template <typename, typename, typename> typename Epilogue,
	typename... EpilogueArgs>
	inline void cutlass_gemm_sm90_int8_dispatch(torch::Tensor& out,
	torch::Tensor const& a,
	torch::Tensor const& b,
	EpilogueArgs&&... args) {
	static_assert(std::is_same<InType, int8_t>());
	TORCH_CHECK(a.dtype() == torch::kInt8);
	TORCH_CHECK(b.dtype() == torch::kInt8);

	using Cutlass3xGemmDefault =
	typename sm90_int8_config_default<InType, OutType,
	Epilogue>::Cutlass3xGemm;
	using Cutlass3xGemmM128 =
	typename sm90_int8_config_M128<InType, OutType, Epilogue>::Cutlass3xGemm;
	using Cutlass3xGemmM64 =
	typename sm90_int8_config_M64<InType, OutType, Epilogue>::Cutlass3xGemm;
	using Cutlass3xGemmM32NBig =
	typename sm90_int8_config_M32_NBig<InType, OutType,
	Epilogue>::Cutlass3xGemm;
	using Cutlass3xGemmM32NSmall =
	typename sm90_int8_config_M32_NSmall<InType, OutType,
	Epilogue>::Cutlass3xGemm;

	uint32_t const n = out.size(1);
	bool const is_small_n = n < 8192;

	uint32_t const m = a.size(0);
	uint32_t const mp2 =
	std::max(static_cast<uint32_t>(32), next_pow_2(m)); // next power of 2

	if (mp2 <= 32) {
	// m in [1, 32]
	if (is_small_n) {
	return cutlass_gemm_caller<Cutlass3xGemmM32NSmall>(
	out, a, b, std::forward<EpilogueArgs>(args)...);
	} else {
	return cutlass_gemm_caller<Cutlass3xGemmM32NBig>(
	out, a, b, std::forward<EpilogueArgs>(args)...);
	}
	} else if (mp2 <= 64) {
	// m in (32, 64]
	return cutlass_gemm_caller<Cutlass3xGemmM64>(
	out, a, b, std::forward<EpilogueArgs>(args)...);
	} else if (mp2 <= 128) {
	// m in (64, 128]
	return cutlass_gemm_caller<Cutlass3xGemmM128>(
	out, a, b, std::forward<EpilogueArgs>(args)...);
	} else {
	// m in (128, inf)
	return cutlass_gemm_caller<Cutlass3xGemmDefault>(
	out, a, b, std::forward<EpilogueArgs>(args)...);
	}
	}

	} // namespace vllm