File size: 3,911 Bytes
ed30f9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#include <metal_stdlib>
using namespace metal;

// Helpers ------------------------------------------------------------
// Returns the raw 32-bit pattern of `x` via as_type (a bit-level
// reinterpretation, not a numeric float->int conversion).
static inline uint as_bits(float x) {
  const uint raw = as_type<uint>(x);
  return raw;
}
// Builds a float whose raw 32-bit pattern is `b` via as_type (a bit-level
// reinterpretation, not a numeric int->float conversion).
static inline float from_bits(uint b) {
  const float value = as_type<float>(b);
  return value;
}

// -------------------------------------------------------------------
// FP8 E4M3 (bias = 7)
// -------------------------------------------------------------------
// Decodes one FP8 E4M3 byte (1 sign, 4 exponent, 3 mantissa bits, bias 7)
// into a float.
//
//   exponent field == 0    -> signed zero or sub-normal: (man / 8) * 2^-6
//   exponent field == 0xF  -> mantissa 0 gives +/-Inf, anything else NaN
//   otherwise              -> normal: (1 + man / 8) * 2^(exp - 7)
//
// NOTE(review): the all-ones exponent is treated IEEE-style here (Inf/NaN).
// The "FN" flavour of E4M3 reportedly reuses most of those codes for finite
// values instead -- confirm which variant the callers expect.
inline float fp8_e4m3_to_float(uchar v) {
  const bool negative = (v & 0x80) != 0;
  const uint exp_field = (v >> 3) & 0xF;
  const uint frac_field = v & 0x7;

  switch (exp_field) {
  case 0: {
    // Zero keeps its sign; sub-normals have no implicit leading one.
    if (frac_field == 0)
      return negative ? -0.f : 0.f;
    const float frac = float(frac_field) / 8.f;
    const float magnitude = ldexp(frac, 1 - 7); // fixed exponent 2^(1-bias)
    return negative ? -magnitude : magnitude;
  }
  case 0xF: {
    // Reserved exponent: NaN ignores the sign bit, infinity keeps it.
    if (frac_field != 0)
      return NAN;
    return negative ? -INFINITY : INFINITY;
  }
  default: {
    // Normal number with an implicit leading one.
    const float significand = 1.f + float(frac_field) / 8.f;
    const float magnitude = ldexp(significand, int(exp_field) - 7);
    return negative ? -magnitude : magnitude;
  }
  }
}

// -------------------------------------------------------------------
// FP8 E5M2 (bias = 15)
// -------------------------------------------------------------------
// Decodes one FP8 E5M2 byte (1 sign, 5 exponent, 2 mantissa bits, bias 15)
// into a float.
//
//   exponent field == 0     -> signed zero or sub-normal: (man / 4) * 2^-14
//   exponent field == 0x1F  -> mantissa 0 gives +/-Inf, anything else NaN
//   otherwise               -> normal: (1 + man / 4) * 2^(exp - 15)
inline float fp8_e5m2_to_float(uchar v) {
  const bool negative = (v & 0x80) != 0;
  const uint exp_field = (v >> 2) & 0x1F;
  const uint frac_field = v & 0x3;

  switch (exp_field) {
  case 0: {
    // Zero keeps its sign; sub-normals have no implicit leading one.
    if (frac_field == 0)
      return negative ? -0.f : 0.f;
    const float frac = float(frac_field) / 4.f;
    const float magnitude = ldexp(frac, 1 - 15); // fixed exponent 2^(1-bias)
    return negative ? -magnitude : magnitude;
  }
  case 0x1F: {
    // Reserved exponent: NaN ignores the sign bit, infinity keeps it.
    if (frac_field != 0)
      return NAN;
    return negative ? -INFINITY : INFINITY;
  }
  default: {
    // Normal number with an implicit leading one.
    const float significand = 1.f + float(frac_field) / 4.f;
    const float magnitude = ldexp(significand, int(exp_field) - 15);
    return negative ? -magnitude : magnitude;
  }
  }
}

// -------------------------------------------------------------------
// Encoding helpers (round-to-nearest-even, gradual under-flow, sat-to-∞)
// -------------------------------------------------------------------
namespace detail {
// Encodes a 32-bit float into an 8-bit float with EXP_BITS exponent bits,
// MAN_BITS mantissa bits and exponent bias BIAS. The layout is IEEE-style:
// the all-ones exponent is reserved for Inf (mantissa 0) and NaN (mantissa
// != 0), and exponent 0 holds zero and sub-normals.
//
// Behaviour:
//   * NaN            -> NaN (all-ones exponent, mantissa 1), sign preserved
//   * +/-Inf         -> +/-Inf
//   * too large      -> +/-Inf (saturation), including values that only
//                       overflow after rounding
//   * normal range   -> round-to-nearest-even on the dropped mantissa bits
//   * sub-normal     -> gradual under-flow with round-to-nearest-even; a
//                       value that rounds up past the largest sub-normal
//                       becomes the smallest normal
//   * tiny           -> +/-0 (at or below half the smallest sub-normal;
//                       the exact half is a tie that rounds to even = 0)
//
// Returns the encoded byte: sign in bit 7, then exponent, then mantissa.
template <int EXP_BITS, int MAN_BITS, int BIAS>
inline uchar fp32_to_fp8(float f) {
  const uint bits = as_bits(f);
  const uint sign_bit = (bits >> 31) << 7;
  const uint mag = bits & 0x7FFFFFFFu; // magnitude bits (sign cleared)
  const uint inf_bits = ((1u << EXP_BITS) - 1u) << MAN_BITS;

  // NaN propagates as NaN, Inf stays Inf.
  if (mag >= 0x7F800000u) {
    return uchar(sign_bit | inf_bits | uint(mag != 0x7F800000u));
  }

  const int e = int((mag >> 23) & 0xFF) - 127; // unbiased exponent
  const uint m = mag & 0x7FFFFFu;              // 23-bit mantissa
  const int EXP_MAX = (1 << EXP_BITS) - 2;     // last finite exponent
  int e_fp8 = e + BIAS;

  // ---------- Overflow -----------------------------------------------------
  // Finite input beyond the largest representable binade saturates to Inf.
  // Without this guard such inputs would reach the sub-normal path below
  // with a small or negative shift count and produce garbage.
  if (e_fp8 > EXP_MAX)
    return uchar(sign_bit | inf_bits);

  // ---------- Normal path --------------------------------------------------
  if (e_fp8 >= 1) {
    // round-to-nearest-even on the (23 - MAN_BITS) dropped bits
    const int shift = 23 - MAN_BITS;
    uint mant = m >> shift;
    const uint lsb = mant & 1u;
    const uint round = (m >> (shift - 1)) & 1u;
    const uint sticky = uint((m & ((1u << (shift - 1)) - 1u)) != 0u);
    mant += (round & (sticky | lsb));
    if (mant >> MAN_BITS) { // mantissa overflow -> bump the exponent
      mant = 0;
      ++e_fp8;
      if (e_fp8 > EXP_MAX)
        return uchar(sign_bit | inf_bits); // rounded up to Inf
    }
    return uchar(sign_bit | (uint(e_fp8) << MAN_BITS) |
                 (mant & ((1u << MAN_BITS) - 1u)));
  }

  // ---------- Sub-normal / under-flow -------------------------------------
  // Below half of the smallest sub-normal nothing can round up: +/-0.
  // (float sub-normals and zero also land here because their e is -127.)
  if (e_fp8 < -MAN_BITS)
    return uchar(sign_bit);

  // Shift the 24-bit significand so the result is expressed in units of the
  // smallest sub-normal. Here e_fp8 is in [-MAN_BITS, 0], so rshift is in
  // [24 - MAN_BITS, 24] and every shift below is well defined.
  const int rshift = (1 - e_fp8) + (23 - MAN_BITS);
  const uint sig = 0x800000u | m; // restore the implicit leading one
  uint q = sig >> rshift;
  const uint round = (sig >> (rshift - 1)) & 1u;
  const uint sticky = uint((sig & ((1u << (rshift - 1)) - 1u)) != 0u);
  q += (round & (sticky | (q & 1u))); // round-to-nearest-even

  // q is in [0, 1 << MAN_BITS]. It is deliberately NOT masked: when rounding
  // carries out of the mantissa, q == 1 << MAN_BITS is exactly the encoding
  // of the smallest normal number (exponent field 1, mantissa 0).
  return uchar(sign_bit | q);
}
} // namespace detail

// Encodes `f` as an FP8 E4M3 byte by delegating to detail::fp32_to_fp8
// with 4 exponent bits, 3 mantissa bits and an exponent bias of 7.
inline uchar float_to_fp8_e4m3(float f) {
  const uchar encoded = detail::fp32_to_fp8<4, 3, 7>(f);
  return encoded;
}
// Encodes `f` as an FP8 E5M2 byte by delegating to detail::fp32_to_fp8
// with 5 exponent bits, 2 mantissa bits and an exponent bias of 15.
inline uchar float_to_fp8_e5m2(float f) {
  const uchar encoded = detail::fp32_to_fp8<5, 2, 15>(f);
  return encoded;
}