// paged-attention / paged-attention-metal / float8.metal
//
// EricB HF Staff
//
// Add metal paged attention
//
// ed30f9d 7 days ago
//
// 3.91 kB

	#include <metal_stdlib>
	using namespace metal;

	// Helpers ------------------------------------------------------------
	// Reinterprets the 32 bits of a float as an unsigned integer (no numeric
	// conversion takes place).
	static inline uint as_bits(float x) {
	  const uint raw = as_type<uint>(x);
	  return raw;
	}
	// Reinterprets a 32-bit unsigned integer as a float (no numeric conversion
	// takes place).
	static inline float from_bits(uint b) {
	  const float value = as_type<float>(b);
	  return value;
	}

	// -------------------------------------------------------------------
	// FP8 E4M3 (bias = 7)
	// -------------------------------------------------------------------
	// Decodes one FP8 byte laid out as [sign:1 | exponent:4 | mantissa:3] with an
	// exponent bias of 7 into a float.
	//
	//   exponent == 0    -> zero or sub-normal: (mantissa / 8) * 2^-6
	//   exponent == 0xF  -> NaN when mantissa != 0, otherwise +/-Inf
	//   otherwise        -> (1 + mantissa / 8) * 2^(exponent - 7)
	//
	// NOTE(review): treating the all-ones exponent as Inf/NaN is the IEEE-style
	// interpretation; the "FN" flavour of E4M3 reportedly has no infinities --
	// confirm which layout the callers expect.
	inline float fp8_e4m3_to_float(uchar v) {
	  const bool negative = (v >> 7) != 0;
	  const uint exp_field = (v >> 3) & 0xF;
	  const uint frac_field = v & 0x7;

	  float magnitude;
	  if (exp_field == 0xF) {
	    // NaN is returned as-is; the sign bit is not applied to it.
	    if (frac_field != 0)
	      return NAN;
	    magnitude = INFINITY;
	  } else if (exp_field == 0) {
	    // No implicit leading one. A zero mantissa yields 0 here, so the shared
	    // sign handling below produces +0 / -0.
	    magnitude = ldexp(float(frac_field) / 8.f, 1 - 7);
	  } else {
	    magnitude = ldexp(1.f + float(frac_field) / 8.f, int(exp_field) - 7);
	  }

	  return negative ? -magnitude : magnitude;
	}

	// -------------------------------------------------------------------
	// FP8 E5M2 (bias = 15)
	// -------------------------------------------------------------------
	// Decodes one FP8 byte laid out as [sign:1 | exponent:5 | mantissa:2] with an
	// exponent bias of 15 into a float.
	//
	//   exponent == 0     -> zero or sub-normal: (mantissa / 4) * 2^-14
	//   exponent == 0x1F  -> NaN when mantissa != 0, otherwise +/-Inf
	//   otherwise         -> (1 + mantissa / 4) * 2^(exponent - 15)
	inline float fp8_e5m2_to_float(uchar v) {
	  const bool negative = (v >> 7) != 0;
	  const uint exp_field = (v >> 2) & 0x1F;
	  const uint frac_field = v & 0x3;

	  float magnitude;
	  if (exp_field == 0x1F) {
	    // NaN is returned as-is; the sign bit is not applied to it.
	    if (frac_field != 0)
	      return NAN;
	    magnitude = INFINITY;
	  } else if (exp_field == 0) {
	    // No implicit leading one. A zero mantissa yields 0 here, so the shared
	    // sign handling below produces +0 / -0.
	    magnitude = ldexp(float(frac_field) / 4.f, 1 - 15);
	  } else {
	    magnitude = ldexp(1.f + float(frac_field) / 4.f, int(exp_field) - 15);
	  }

	  return negative ? -magnitude : magnitude;
	}

	// -------------------------------------------------------------------
	// Encoding helpers (round-to-nearest-even, gradual under-flow, sat-to-∞)
	// -------------------------------------------------------------------
	namespace detail {
	// Encodes a float as an 8-bit float laid out as
	// [sign:1 | exponent:EXP_BITS | mantissa:MAN_BITS] with exponent bias BIAS.
	//
	// Behaviour:
	//   * NaN       -> all-ones exponent, mantissa = 1 (sign preserved)
	//   * +/-Inf    -> all-ones exponent, mantissa = 0
	//   * overflow  -> finite values too large for the format saturate to +/-Inf
	//   * normal    -> round-to-nearest-even on the discarded mantissa bits
	//   * underflow -> gradual: values below the smallest normal are rounded
	//                  (nearest-even) to a sub-normal, the smallest normal, or
	//                  +/-0
	//
	// The all-ones exponent is reserved for Inf/NaN, so the largest finite
	// exponent field is (1 << EXP_BITS) - 2.
	template <int EXP_BITS, int MAN_BITS, int BIAS>
	inline uchar fp32_to_fp8(float f) {
	  const uint bits = as_bits(f);
	  const uint sign = (bits >> 31) << 7;   // sign already in its FP8 position
	  const uint mag = bits & 0x7FFFFFFFu;   // |f| as raw binary32 bits
	  const uint inf_bits = ((1u << EXP_BITS) - 1u) << MAN_BITS;

	  // Inf or NaN in the source: exponent field is all ones.
	  if (mag >= 0x7F800000u) {
	    const uint nan_bit = (mag != 0x7F800000u) ? 1u : 0u;
	    return uchar(sign | inf_bits | nan_bit);
	  }

	  const uint m = mag & 0x7FFFFFu;          // 23-bit source mantissa
	  const int e = int(mag >> 23) - 127;      // unbiased source exponent
	  const int EXP_MAX = (1 << EXP_BITS) - 2; // last finite exponent field
	  const int e_fp8 = e + BIAS;              // exponent re-biased for FP8

	  // ---------- Overflow ---------------------------------------------------
	  // Without this guard large finite inputs would reach the sub-normal code
	  // below with a tiny or negative shift count and encode as garbage.
	  if (e_fp8 > EXP_MAX)
	    return uchar(sign | inf_bits);

	  // ---------- Normal path ------------------------------------------------
	  if (e_fp8 >= 1) {
	    const int shift = 23 - MAN_BITS;
	    const uint round = (m >> (shift - 1)) & 1u;
	    const uint sticky = ((m & ((1u << (shift - 1)) - 1u)) != 0u) ? 1u : 0u;

	    // Exponent and truncated mantissa packed together. Adding the rounding
	    // increment to the packed value lets a mantissa carry bump the exponent,
	    // and a carry out of EXP_MAX lands exactly on the Inf encoding.
	    uint enc = (uint(e_fp8) << MAN_BITS) | (m >> shift);
	    enc += round & (sticky | (enc & 1u));
	    return uchar(sign | enc);
	  }

	  // ---------- Sub-normal / under-flow ------------------------------------
	  // Anything below half of the smallest sub-normal rounds to zero. At
	  // e_fp8 == -MAN_BITS the value lies in [half, one) smallest sub-normal and
	  // still needs rounding, so it is not flushed here.
	  if (e_fp8 < -MAN_BITS)
	    return uchar(sign);

	  // Shift the 24-bit significand (implicit 1 restored) so that it is
	  // expressed in units of the smallest sub-normal. rshift is in
	  // [24 - MAN_BITS, 24], so every shift below is well defined.
	  const int rshift = (1 - e_fp8) + (23 - MAN_BITS);
	  const uint full = 0x800000u | m;
	  const uint round = (full >> (rshift - 1)) & 1u;
	  const uint sticky = ((full & ((1u << (rshift - 1)) - 1u)) != 0u) ? 1u : 0u;

	  uint q = full >> rshift;
	  q += round & (sticky | (q & 1u)); // round-to-nearest-even

	  // q may be 0 (rounds to zero), a sub-normal mantissa, or 1 << MAN_BITS.
	  // The last case is the smallest normal (exponent field 1, mantissa 0), so
	  // q must not be masked to MAN_BITS.
	  return uchar(sign | q);
	}
	} // namespace detail

	// Encodes a float as FP8 E4M3 by forwarding to detail::fp32_to_fp8 with
	// 4 exponent bits, 3 mantissa bits and an exponent bias of 7.
	inline uchar float_to_fp8_e4m3(float f) {
	  const uchar encoded = detail::fp32_to_fp8<4, 3, 7>(f);
	  return encoded;
	}
	// Encodes a float as FP8 E5M2 by forwarding to detail::fp32_to_fp8 with
	// 5 exponent bits, 2 mantissa bits and an exponent bias of 15.
	inline uchar float_to_fp8_e5m2(float f) {
	  const uchar encoded = detail::fp32_to_fp8<5, 2, 15>(f);
	  return encoded;
	}