Spaces:
Running
on
Zero
Running
on
Zero
Create fp8_optimization_utils.py
Browse files- utils/fp8_optimization_utils.py +277 -0
utils/fp8_optimization_utils.py
ADDED
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import torch
|
2 |
+
import torch.nn as nn
|
3 |
+
import torch.nn.functional as F
|
4 |
+
|
5 |
+
from tqdm import tqdm
|
6 |
+
|
7 |
+
|
8 |
+
def calculate_fp8_maxval(exp_bits=4, mantissa_bits=3, sign_bits=1):
    """
    Calculate the maximum finite value representable in an FP8 format.

    Default is E4M3 format (4-bit exponent, 3-bit mantissa, 1-bit sign).

    Two encodings are distinguished so that the result matches the largest
    finite value of the PyTorch dtype the rest of this module casts to:

    * E5M2 (``torch.float8_e5m2``) is IEEE-754 style: the all-ones exponent is
      reserved for inf/NaN, so the largest finite value uses the second-highest
      exponent with a full mantissa (1.75 * 2**15 = 57344).
    * Every other split is treated like ``torch.float8_e4m3fn``: the all-ones
      exponent is usable and only the all-ones mantissa is reserved for NaN
      (for E4M3: 1.75 * 2**8 = 448).

    Args:
        exp_bits (int): Number of exponent bits
        mantissa_bits (int): Number of mantissa bits
        sign_bits (int): Number of sign bits (0 or 1)

    Returns:
        float: Maximum finite value representable in the FP8 format

    Raises:
        AssertionError: If the bit counts do not add up to 8
    """
    assert exp_bits + mantissa_bits + sign_bits == 8, "Total bits must be 8"

    # Exponent bias
    bias = 2 ** (exp_bits - 1) - 1

    if exp_bits == 5 and mantissa_bits == 2:
        # IEEE-style E5M2: top exponent code is inf/NaN, mantissa may be all ones.
        max_exponent = 2**exp_bits - 2 - bias
        mantissa_max = 2.0 - 2.0 ** -mantissa_bits
    else:
        # "FN"-style: top exponent code is usable, but the all-ones mantissa
        # at that exponent is NaN, so the lowest mantissa bit must stay clear.
        max_exponent = 2**exp_bits - 1 - bias
        mantissa_max = 1.0 + sum(2.0 ** -(i + 1) for i in range(mantissa_bits - 1))

    return mantissa_max * (2**max_exponent)
|
35 |
+
|
36 |
+
|
37 |
+
def quantize_tensor_to_fp8(tensor, scale, exp_bits=4, mantissa_bits=3, sign_bits=1, max_value=None, min_value=None):
    """
    Round a tensor onto the FP8 value grid (quantize and dequantize).

    The tensor is divided by ``scale``, clamped to ``[min_value, max_value]``
    and each element is rounded to the nearest multiple of the step size that
    belongs to its exponent. No cast to an FP8 dtype happens here; the caller
    does that.

    Args:
        tensor (torch.Tensor): Tensor to quantize
        scale (float or torch.Tensor): Scale factor the tensor is divided by
        exp_bits (int): Number of exponent bits
        mantissa_bits (int): Number of mantissa bits
        sign_bits (int): Number of sign bits
        max_value (float, optional): Upper clamp bound. If None, it is computed
            with calculate_fp8_maxval for the given bit layout.
        min_value (float, optional): Lower clamp bound. If None, it defaults to
            ``-max_value`` for signed formats and ``0.0`` for unsigned ones.

    Returns:
        tuple: (quantized_tensor, scale_factor), where scale_factor is the
        ``scale`` argument returned unchanged
    """
    # Create scaled tensor
    scaled_tensor = tensor / scale

    # Calculate FP8 parameters
    bias = 2 ** (exp_bits - 1) - 1

    if max_value is None:
        max_value = calculate_fp8_maxval(exp_bits, mantissa_bits, sign_bits)
    if min_value is None:
        # Derived independently of max_value: torch.clamp treats min=None as
        # "no lower bound", which would let negative values leave the FP8 range
        # when a caller passes only max_value.
        min_value = -max_value if sign_bits > 0 else 0.0

    # Clamp tensor to range
    clamped_tensor = torch.clamp(scaled_tensor, min_value, max_value)

    # Quantization process
    abs_values = torch.abs(clamped_tensor)
    nonzero_mask = abs_values > 0

    # Biased exponent of each element; zeros keep 0 to avoid log2(0)
    log_scales = torch.zeros_like(clamped_tensor)
    if nonzero_mask.any():
        log_scales[nonzero_mask] = torch.floor(torch.log2(abs_values[nonzero_mask]) + bias).detach()

    # Biased exponents below 1 all share the step size of exponent 1
    log_scales = torch.clamp(log_scales, min=1.0)
    quant_factor = 2.0 ** (log_scales - mantissa_bits - bias)

    # Quantize and dequantize
    quantized = torch.round(clamped_tensor / quant_factor) * quant_factor

    return quantized, scale
|
82 |
+
|
83 |
+
|
84 |
+
def optimize_state_dict_with_fp8(
    state_dict, calc_device, target_layer_keys=None, exclude_layer_keys=None, exp_bits=4, mantissa_bits=3, move_to_device=False
):
    """
    Convert weight tensors in a state dict to FP8 with a per-tensor scale.

    Tensors are selected purely by key name: a key is processed when it ends
    with ".weight", matches one of ``target_layer_keys`` (or that argument is
    None) and matches none of ``exclude_layer_keys``. The module type is not
    checked here, so non-Linear weights must be excluded by the caller's
    patterns. For every processed key ``<path>.weight`` the FP8 tensor is
    stored under the same key and the scale under ``<path>.scale_weight``.

    Args:
        state_dict (dict): State dict to optimize, replaced in-place
        calc_device (str): Device to quantize tensors on (None: keep each tensor on its own device)
        target_layer_keys (list, optional): Layer key patterns to target (None for all ".weight" keys)
        exclude_layer_keys (list, optional): Layer key patterns to exclude
        exp_bits (int): Number of exponent bits
        mantissa_bits (int): Number of mantissa bits
        move_to_device (bool): Move optimized tensors to the calculating device

    Returns:
        dict: FP8 optimized state dict (the same object as ``state_dict``)

    Raises:
        ValueError: If the bit layout is neither E4M3 nor E5M2
    """
    if exp_bits == 4 and mantissa_bits == 3:
        fp8_dtype = torch.float8_e4m3fn
    elif exp_bits == 5 and mantissa_bits == 2:
        fp8_dtype = torch.float8_e5m2
    else:
        raise ValueError(f"Unsupported FP8 format: E{exp_bits}M{mantissa_bits}")

    # Calculate FP8 max value
    max_value = calculate_fp8_maxval(exp_bits, mantissa_bits)
    min_value = -max_value  # this function supports only signed FP8

    weight_suffix = ".weight"

    # Enumerate target keys first so the dict can be extended while processing
    target_state_dict_keys = [
        key
        for key, value in state_dict.items()
        if key.endswith(weight_suffix)
        and (target_layer_keys is None or any(pattern in key for pattern in target_layer_keys))
        and not (exclude_layer_keys is not None and any(pattern in key for pattern in exclude_layer_keys))
        and isinstance(value, torch.Tensor)
    ]

    # Only a CUDA calculation device has a CUDA allocator cache worth releasing
    free_cuda_cache = calc_device is not None and torch.device(calc_device).type == "cuda"

    optimized_count = 0

    # Process each key
    for key in tqdm(target_state_dict_keys):
        value = state_dict[key]

        # Save original device and dtype
        original_device = value.device
        original_dtype = value.dtype

        # Move to calculation device
        if calc_device is not None:
            value = value.to(calc_device)

        # Per-tensor scale mapping the largest magnitude onto the FP8 maximum
        scale = value.abs().max() / max_value
        # An all-zero tensor gives scale 0, and tensor / scale would then be
        # 0 / 0 = NaN; any positive scale represents zeros exactly, so use 1.
        scale = torch.where(scale > 0, scale, torch.ones_like(scale))

        # Quantize weight to FP8
        quantized_weight, _ = quantize_tensor_to_fp8(value, scale, exp_bits, mantissa_bits, 1, max_value, min_value)
        quantized_weight = quantized_weight.to(fp8_dtype)

        if not move_to_device:
            quantized_weight = quantized_weight.to(original_device)

        # Replace only the trailing ".weight"; str.replace would also rewrite
        # any earlier ".weight" inside the module path.
        scale_key = key[: -len(weight_suffix)] + ".scale_weight"
        scale_tensor = scale.reshape(1).to(device=quantized_weight.device, dtype=original_dtype)

        # Original key keeps the (now FP8) weight, new key holds its scale
        state_dict[key] = quantized_weight
        state_dict[scale_key] = scale_tensor

        optimized_count += 1

        if free_cuda_cache:
            # free memory on calculation device
            torch.cuda.empty_cache()

    print(f"Number of optimized Linear layers: {optimized_count}")
    return state_dict
|
168 |
+
|
169 |
+
|
170 |
+
def fp8_linear_forward_patch(self: nn.Linear, x, use_scaled_mm=False, max_value=None):
    """
    Forward pass for a Linear layer whose weight is stored in FP8 with a scale.

    The layer must carry a ``scale_weight`` tensor next to ``weight``. Without
    ``use_scaled_mm`` the weight is dequantized (cast to the dtype of
    ``scale_weight`` and multiplied by it) and a regular linear is computed.
    With ``use_scaled_mm`` the input is cast to float8_e5m2 and multiplied via
    ``torch._scaled_mm``.

    Args:
        self: Linear layer instance
        x (torch.Tensor): Input tensor
        use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series)
        max_value (float): Maximum value for FP8 quantization. If None, no quantization is applied for input tensor.

    Returns:
        torch.Tensor: Result of linear transformation
    """
    if not use_scaled_mm:
        # Dequantize path: rebuild the weight in the scale's dtype.
        compute_dtype = self.scale_weight.dtype
        full_weight = self.weight.to(compute_dtype) * self.scale_weight
        # F.linear's bias defaults to None, so one call covers both cases.
        return F.linear(x, full_weight, self.bias)

    in_dtype = x.dtype
    scale_dtype = self.scale_weight.dtype
    assert self.weight.dtype == torch.float8_e4m3fn, "Only FP8 E4M3FN format is supported"
    assert x.ndim == 3, "Input tensor must be 3D (batch_size, seq_len, hidden_dim)"

    if max_value is not None:
        # Per-call input scale, then round the input onto the E5M2 grid.
        # This seems to consume a lot of memory.
        scale_x = (torch.max(torch.abs(x.flatten())) / max_value).to(torch.float32)
        x, _ = quantize_tensor_to_fp8(x, scale_x, 5, 2, 1, max_value, -max_value)
    else:
        # No input quantization: identity scale
        scale_x = torch.tensor(1.0, dtype=torch.float32, device=x.device)

    batch, seq_len = x.shape[0], x.shape[1]
    x_2d = x.reshape(-1, x.shape[2]).to(torch.float8_e5m2)

    mm_kwargs = {"scale_a": scale_x, "scale_b": self.scale_weight.to(torch.float32)}
    if self.bias is not None:
        # float32 is not supported with bias in scaled_mm, so the output uses
        # the dtype of scale_weight in this case.
        mm_kwargs["out_dtype"] = scale_dtype
        mm_kwargs["bias"] = self.bias
    else:
        mm_kwargs["out_dtype"] = in_dtype

    # NOTE(review): torch._scaled_mm is a private API; this code expects it to
    # return a single tensor - confirm against the installed torch version.
    out = torch._scaled_mm(x_2d, self.weight.t(), **mm_kwargs)

    return out.reshape(batch, seq_len, -1).to(in_dtype)
|
227 |
+
|
228 |
+
|
229 |
+
def apply_fp8_monkey_patch(model, optimized_state_dict, use_scaled_mm=False):
    """
    Patch the Linear layers of a model so they can run with FP8 weights.

    A layer is patched when it is an ``nn.Linear`` and the state dict holds a
    ``<module path>.scale_weight`` entry for it. Each patched layer receives a
    placeholder ``scale_weight`` buffer (so the state dict can be loaded
    afterwards) and an instance-level ``forward`` that delegates to
    fp8_linear_forward_patch.

    Args:
        model (nn.Module): Model instance to patch
        optimized_state_dict (dict): FP8 optimized state dict
        use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series)

    Returns:
        nn.Module: The patched model (same instance, modified in-place)
    """
    # Input tensors are not quantized; None disables that step in the forward patch.
    max_value = None

    # Module paths of FP8-optimized layers, derived from their scale keys
    suffix = ".scale_weight"
    fp8_module_paths = {key[: -len(suffix)] for key in optimized_state_dict.keys() if key.endswith(suffix)}

    # Replacement forward; bound to each patched module below.
    def new_forward(self, x):
        return fp8_linear_forward_patch(self, x, use_scaled_mm, max_value)

    patched_count = 0
    for name, module in model.named_modules():
        if name not in fp8_module_paths or not isinstance(module, nn.Linear):
            continue

        # Placeholder buffer so load_state_dict finds a slot for the scale
        module.register_buffer("scale_weight", torch.tensor(1.0, dtype=module.weight.dtype))

        # Bind as a method on this instance only; the class stays untouched
        module.forward = new_forward.__get__(module, type(module))

        patched_count += 1

    print(f"Number of monkey-patched Linear layers: {patched_count}")
    return model
|