Fabrice-TIERCELIN committed on
Commit
81b2c10
·
verified ·
1 Parent(s): 3cb3fb8

Create fp8_optimization_utils.py

Browse files
Files changed (1) hide show
  1. utils/fp8_optimization_utils.py +277 -0
utils/fp8_optimization_utils.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from tqdm import tqdm
6
+
7
+
8
def calculate_fp8_maxval(exp_bits=4, mantissa_bits=3, sign_bits=1):
    """
    Calculate the maximum representable (finite) value in FP8 format.
    Default is E4M3 format (4-bit exponent, 3-bit mantissa, 1-bit sign).

    Two conventions are handled:

    * E5M2 reserves the all-ones exponent code for inf/NaN, so the largest
      finite value uses the second-highest exponent with a full mantissa
      (57344.0).
    * Every other layout (including the default E4M3) is treated like the
      "fn" variant: the all-ones exponent is usable and only the all-ones
      mantissa pattern at that exponent is reserved (448.0 for E4M3).

    Args:
        exp_bits (int): Number of exponent bits
        mantissa_bits (int): Number of mantissa bits
        sign_bits (int): Number of sign bits (0 or 1)

    Returns:
        float: Maximum value representable in FP8 format

    Raises:
        AssertionError: If the bit counts do not add up to 8.
    """
    assert exp_bits + mantissa_bits + sign_bits == 8, "Total bits must be 8"

    # Calculate exponent bias
    bias = 2 ** (exp_bits - 1) - 1

    if exp_bits == 5 and mantissa_bits == 2:
        # The top exponent code encodes inf/NaN in E5M2, so the largest finite
        # number sits one exponent lower and may use every mantissa bit.
        mantissa_max = 2.0 - 2.0**-mantissa_bits
        return mantissa_max * (2 ** (2**exp_bits - 2 - bias))

    # "fn" style: the top exponent is usable, but the all-ones mantissa at that
    # exponent is reserved, hence one mantissa bit fewer in the sum.
    mantissa_max = 1.0 + sum(2.0 ** -(i + 1) for i in range(mantissa_bits - 1))

    # Calculate maximum value
    return mantissa_max * (2 ** (2**exp_bits - 1 - bias))
35
+
36
+
37
def quantize_tensor_to_fp8(tensor, scale, exp_bits=4, mantissa_bits=3, sign_bits=1, max_value=None, min_value=None):
    """
    Quantize a tensor to FP8 format.

    The tensor is divided by ``scale``, clamped to the FP8 range and rounded
    onto the FP8 value grid. The result keeps the input's dtype and stays in
    the scaled domain (it is NOT multiplied back by ``scale``), so the caller
    can cast it to an FP8 dtype and keep ``scale`` for dequantization.

    Args:
        tensor (torch.Tensor): Tensor to quantize
        scale (float or torch.Tensor): Scale factor
        exp_bits (int): Number of exponent bits
        mantissa_bits (int): Number of mantissa bits
        sign_bits (int): Number of sign bits
        max_value (float, optional): Upper clamp bound. If None, both bounds are
            derived from the bit layout (any ``min_value`` passed is ignored).
        min_value (float, optional): Lower clamp bound. If None while
            ``max_value`` is given, defaults to ``-max_value`` (or 0.0 when
            ``sign_bits`` is 0).

    Returns:
        tuple: (quantized_tensor, scale_factor)
    """
    # Create scaled tensor
    scaled_tensor = tensor / scale

    # Calculate FP8 parameters
    bias = 2 ** (exp_bits - 1) - 1

    if max_value is None:
        # Calculate max and min values
        max_value = calculate_fp8_maxval(exp_bits, mantissa_bits, sign_bits)
        min_value = -max_value if sign_bits > 0 else 0.0
    elif min_value is None:
        # Only the upper bound was supplied: mirror it so that negative values
        # cannot escape the representable range.
        min_value = -max_value if sign_bits > 0 else 0.0

    # Clamp tensor to range
    clamped_tensor = torch.clamp(scaled_tensor, min_value, max_value)

    # Quantization process
    abs_values = torch.abs(clamped_tensor)
    nonzero_mask = abs_values > 0

    # Biased exponent of every non-zero element (log2 is undefined at zero,
    # so zeros keep the placeholder 0 and are handled by the clamp below)
    log_scales = torch.zeros_like(clamped_tensor)
    if nonzero_mask.any():
        log_scales[nonzero_mask] = torch.floor(torch.log2(abs_values[nonzero_mask]) + bias).detach()

    # Biased exponents below 1 all share the step size of exponent 1
    log_scales = torch.clamp(log_scales, min=1.0)
    # Spacing between adjacent representable values at each element's exponent
    quant_factor = 2.0 ** (log_scales - mantissa_bits - bias)

    # Snap each value to the nearest multiple of its step size
    quantized = torch.round(clamped_tensor / quant_factor) * quant_factor

    return quantized, scale
82
+
83
+
84
def optimize_state_dict_with_fp8(
    state_dict, calc_device, target_layer_keys=None, exclude_layer_keys=None, exp_bits=4, mantissa_bits=3, move_to_device=False
):
    """
    Optimize Linear layer weights in a model's state dict to FP8 format.

    Every tensor whose key ends with ".weight", matches ``target_layer_keys``
    and does not match ``exclude_layer_keys`` is replaced by its FP8 version,
    and a companion "<prefix>.scale_weight" entry (shape [1], original dtype)
    holding the per-tensor scale is added. Selection is purely by key name, so
    callers must use the patterns to keep non-Linear ".weight" entries out.

    Args:
        state_dict (dict): State dict to optimize, replaced in-place
        calc_device (str): Device to quantize tensors on (None to stay on the tensor's device)
        target_layer_keys (list, optional): Layer key patterns to target (None for all Linear layers)
        exclude_layer_keys (list, optional): Layer key patterns to exclude
        exp_bits (int): Number of exponent bits
        mantissa_bits (int): Number of mantissa bits
        move_to_device (bool): Move optimized tensors to the calculating device

    Returns:
        dict: FP8 optimized state dict

    Raises:
        ValueError: If the exponent/mantissa split is neither E4M3 nor E5M2.
    """
    if exp_bits == 4 and mantissa_bits == 3:
        fp8_dtype = torch.float8_e4m3fn
    elif exp_bits == 5 and mantissa_bits == 2:
        fp8_dtype = torch.float8_e5m2
    else:
        raise ValueError(f"Unsupported FP8 format: E{exp_bits}M{mantissa_bits}")

    # Calculate FP8 max value
    max_value = calculate_fp8_maxval(exp_bits, mantissa_bits)
    min_value = -max_value  # this function supports only signed FP8

    weight_suffix = ".weight"
    optimized_count = 0

    # Enumerate target keys up front: the dict is mutated while processing
    target_state_dict_keys = []
    for key in state_dict.keys():
        # Check if it's a weight key and matches target patterns
        is_target = (target_layer_keys is None or any(pattern in key for pattern in target_layer_keys)) and key.endswith(
            weight_suffix
        )
        is_excluded = exclude_layer_keys is not None and any(pattern in key for pattern in exclude_layer_keys)
        is_target = is_target and not is_excluded

        if is_target and isinstance(state_dict[key], torch.Tensor):
            target_state_dict_keys.append(key)

    # Process each key
    for key in tqdm(target_state_dict_keys):
        value = state_dict[key]

        # Save original device and dtype
        original_device = value.device
        original_dtype = value.dtype

        # Move to calculation device
        if calc_device is not None:
            value = value.to(calc_device)

        # Per-tensor scale mapping the largest magnitude onto the FP8 maximum
        scale = torch.max(torch.abs(value.flatten())) / max_value
        # An all-zero weight would give scale == 0 and turn the tensor into NaN
        # (0 / 0); any non-zero scale represents zeros exactly, so use 1.
        scale = torch.where(scale == 0, torch.ones_like(scale), scale)

        # Quantize weight to FP8
        quantized_weight, _ = quantize_tensor_to_fp8(value, scale, exp_bits, mantissa_bits, 1, max_value, min_value)

        # Add to state dict using original key for weight and new key for scale.
        # Only the trailing ".weight" is swapped: str.replace would also rewrite
        # any earlier ".weight" occurrence inside the module path.
        fp8_key = key  # Maintain original key
        scale_key = key[: -len(weight_suffix)] + ".scale_weight"

        quantized_weight = quantized_weight.to(fp8_dtype)

        if not move_to_device:
            quantized_weight = quantized_weight.to(original_device)

        scale_tensor = torch.tensor([scale], dtype=original_dtype, device=quantized_weight.device)

        state_dict[fp8_key] = quantized_weight
        state_dict[scale_key] = scale_tensor

        optimized_count += 1

        # Free cached memory on the calculation device; the CUDA allocator
        # cache is only relevant when the work was actually done on CUDA.
        if calc_device is not None and torch.device(calc_device).type == "cuda":
            torch.cuda.empty_cache()

    print(f"Number of optimized Linear layers: {optimized_count}")
    return state_dict
168
+
169
+
170
def fp8_linear_forward_patch(self: nn.Linear, x, use_scaled_mm=False, max_value=None):
    """
    Forward pass used in place of ``nn.Linear.forward`` for FP8-weight layers.

    The layer is expected to carry a ``scale_weight`` attribute next to its
    FP8 ``weight``. Without ``use_scaled_mm`` the weight is dequantized
    (cast to the scale's dtype and multiplied by the scale) and a regular
    linear transformation is applied. With ``use_scaled_mm`` the matmul is
    done by ``torch._scaled_mm`` on FP8 operands.

    Args:
        self: Linear layer instance
        x (torch.Tensor): Input tensor (must be 3D when use_scaled_mm is True)
        use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series)
        max_value (float): Maximum value for FP8 quantization. If None, no quantization is applied for input tensor.

    Returns:
        torch.Tensor: Result of linear transformation
    """
    if not use_scaled_mm:
        # Plain path: restore the weight, then let F.linear handle an optional bias
        scale_dtype = self.scale_weight.dtype
        restored_weight = self.weight.to(scale_dtype) * self.scale_weight
        return F.linear(x, restored_weight, self.bias)

    input_dtype = x.dtype
    scale_dtype = self.scale_weight.dtype
    fp8_weight_dtype = self.weight.dtype
    activation_dtype = torch.float8_e5m2
    assert fp8_weight_dtype == torch.float8_e4m3fn, "Only FP8 E4M3FN format is supported"
    assert x.ndim == 3, "Input tensor must be 3D (batch_size, seq_len, hidden_dim)"

    if max_value is not None:
        # Per-tensor scale for the activations, then snap them onto the E5M2 grid
        # (this seems to consume a lot of memory)
        scale_x = (torch.max(torch.abs(x.flatten())) / max_value).to(torch.float32)
        x, _ = quantize_tensor_to_fp8(x, scale_x, 5, 2, 1, max_value, -max_value)
    else:
        # Activations are only cast, not rescaled
        scale_x = torch.tensor(1.0, dtype=torch.float32, device=x.device)

    batch_size, seq_len = x.shape[0], x.shape[1]
    flat_x = x.reshape(-1, x.shape[2]).to(activation_dtype)

    weight_t = self.weight.t()
    scale_w = self.scale_weight.to(torch.float32)

    # float32 is not supported with bias in scaled_mm, so the scale's dtype is
    # used for the output whenever a bias is present
    out_dtype = scale_dtype if self.bias is not None else input_dtype
    o = torch._scaled_mm(flat_x, weight_t, out_dtype=out_dtype, bias=self.bias, scale_a=scale_x, scale_b=scale_w)

    return o.reshape(batch_size, seq_len, -1).to(input_dtype)
227
+
228
+
229
def apply_fp8_monkey_patch(model, optimized_state_dict, use_scaled_mm=False):
    """
    Patch the Linear layers of a model so they can run with FP8 weights.

    A layer is patched when it is an ``nn.Linear`` and the state dict contains
    a "<module name>.scale_weight" entry for it. Each patched layer receives a
    placeholder ``scale_weight`` buffer (so that the state dict can be loaded
    afterwards) and an instance-level ``forward`` that delegates to
    ``fp8_linear_forward_patch``.

    Args:
        model (nn.Module): Model instance to patch
        optimized_state_dict (dict): FP8 optimized state dict
        use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series)

    Returns:
        nn.Module: The patched model (same instance, modified in-place)
    """
    # Input tensors are not quantized by the patched forward
    max_value = None

    # Module paths of every FP8-optimized layer, derived from the scale keys
    suffix = ".scale_weight"
    fp8_module_names = {key[: -len(suffix)] for key in optimized_state_dict.keys() if key.endswith(suffix)}

    # Replacement forward; bound to each patched module individually below
    def new_forward(self, x):
        return fp8_linear_forward_patch(self, x, use_scaled_mm, max_value)

    patched_count = 0
    for name, module in model.named_modules():
        if not (isinstance(module, nn.Linear) and name in fp8_module_names):
            continue

        # Placeholder buffer so that loading the state dict finds a slot for the scale
        module.register_buffer("scale_weight", torch.tensor(1.0, dtype=module.weight.dtype))

        # Bind the replacement as this instance's forward method
        module.forward = new_forward.__get__(module, type(module))
        patched_count += 1

    print(f"Number of monkey-patched Linear layers: {patched_count}")
    return model