AppleSwing
committed on
Commit
•
c2dbb45
1
Parent(s):
ae99472
Fix bugs in quantization
Browse files
src/backend/hflm_with_measurement.py
CHANGED
@@ -315,6 +315,15 @@ class HFLMWithMeasurement(HFLM):
|
|
315 |
generation_kwargs.pop("is_gsm8k")
|
316 |
|
317 |
context_length = context.shape[1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
318 |
|
319 |
if not is_gsm8k:
|
320 |
# build stopping criteria
|
@@ -356,8 +365,6 @@ class HFLMWithMeasurement(HFLM):
|
|
356 |
|
357 |
model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
|
358 |
model_size_param = get_model_size(model_info=model_info, precision=self.precision)
|
359 |
-
|
360 |
-
model_config = self.model.config
|
361 |
|
362 |
n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else model_config.num_layers
|
363 |
d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
|
|
|
315 |
generation_kwargs.pop("is_gsm8k")
|
316 |
|
317 |
context_length = context.shape[1]
|
318 |
+
model_config = self.model.config
|
319 |
+
|
320 |
+
if not self.precision:
|
321 |
+
if model_config.quantization_config._load_in_4bit:
|
322 |
+
self.precision = "4bit"
|
323 |
+
elif model_config.quantization_config._load_in_8bit:
|
324 |
+
self.precision = "8bit"
|
325 |
+
else:
|
326 |
+
raise ValueError("Unknown precision")
|
327 |
|
328 |
if not is_gsm8k:
|
329 |
# build stopping criteria
|
|
|
365 |
|
366 |
model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
|
367 |
model_size_param = get_model_size(model_info=model_info, precision=self.precision)
|
|
|
|
|
368 |
|
369 |
n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else model_config.num_layers
|
370 |
d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
|
src/backend/tasks/measurement_task_utils.py
CHANGED
@@ -12,6 +12,9 @@ def process_results_decorator(func):
|
|
12 |
end_to_end_time = sum([r[1] for r in results]) / len(results)
|
13 |
prefilling_time = sum([r[2] for r in results]) / len(results)
|
14 |
decoding_throughput = sum([r[3] for r in results]) / len(results)
|
|
|
|
|
|
|
15 |
# print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
|
16 |
|
17 |
# Now call the original process_results with the processed results
|
@@ -19,6 +22,8 @@ def process_results_decorator(func):
|
|
19 |
result_dict["end_to_end_time"] = end_to_end_time
|
20 |
result_dict["prefilling_time"] = prefilling_time
|
21 |
result_dict["decoding_throughput"] = decoding_throughput
|
|
|
|
|
22 |
return result_dict
|
23 |
return wrapper
|
24 |
|
@@ -30,6 +35,8 @@ def aggregation_decorator(func):
|
|
30 |
aggregation_list["end_to_end_time"] = mean
|
31 |
aggregation_list["prefilling_time"] = mean
|
32 |
aggregation_list["decoding_throughput"] = mean
|
|
|
|
|
33 |
return aggregation_list
|
34 |
return wrapper
|
35 |
|
@@ -41,6 +48,8 @@ def higher_is_better_decorator(func):
|
|
41 |
higher_is_better_dict["end_to_end_time"] = False
|
42 |
higher_is_better_dict["prefilling_time"] = False
|
43 |
higher_is_better_dict["decoding_throughput"] = True
|
|
|
|
|
44 |
return higher_is_better_dict
|
45 |
return wrapper
|
46 |
|
|
|
12 |
end_to_end_time = sum([r[1] for r in results]) / len(results)
|
13 |
prefilling_time = sum([r[2] for r in results]) / len(results)
|
14 |
decoding_throughput = sum([r[3] for r in results]) / len(results)
|
15 |
+
mfu = sum([r[4] for r in results]) / len(results)
|
16 |
+
mbu = sum([r[5] for r in results]) / len(results)
|
17 |
+
|
18 |
# print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
|
19 |
|
20 |
# Now call the original process_results with the processed results
|
|
|
22 |
result_dict["end_to_end_time"] = end_to_end_time
|
23 |
result_dict["prefilling_time"] = prefilling_time
|
24 |
result_dict["decoding_throughput"] = decoding_throughput
|
25 |
+
result_dict["mfu"] = mfu
|
26 |
+
result_dict["mbu"] = mbu
|
27 |
return result_dict
|
28 |
return wrapper
|
29 |
|
|
|
35 |
aggregation_list["end_to_end_time"] = mean
|
36 |
aggregation_list["prefilling_time"] = mean
|
37 |
aggregation_list["decoding_throughput"] = mean
|
38 |
+
aggregation_list["mfu"] = mean
|
39 |
+
aggregation_list["mbu"] = mean
|
40 |
return aggregation_list
|
41 |
return wrapper
|
42 |
|
|
|
48 |
higher_is_better_dict["end_to_end_time"] = False
|
49 |
higher_is_better_dict["prefilling_time"] = False
|
50 |
higher_is_better_dict["decoding_throughput"] = True
|
51 |
+
higher_is_better_dict["mfu"] = True
|
52 |
+
higher_is_better_dict["mbu"] = True
|
53 |
return higher_is_better_dict
|
54 |
return wrapper
|
55 |
|
src/utils.py
CHANGED
@@ -98,7 +98,8 @@ def parse_nvidia_smi():
|
|
98 |
gpu_stats = []
|
99 |
|
100 |
gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
|
101 |
-
gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
|
|
|
102 |
|
103 |
gpu_name = ""
|
104 |
for index in gpu_indices:
|
@@ -110,7 +111,7 @@ def parse_nvidia_smi():
|
|
110 |
name_match = gpu_name_pattern.search(line)
|
111 |
gpu_info = {}
|
112 |
if name_match:
|
113 |
-
gpu_name = name_match.
|
114 |
if match:
|
115 |
temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
|
116 |
gpu_info.update({
|
|
|
98 |
gpu_stats = []
|
99 |
|
100 |
gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
|
101 |
+
# gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
|
102 |
+
gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
|
103 |
|
104 |
gpu_name = ""
|
105 |
for index in gpu_indices:
|
|
|
111 |
name_match = gpu_name_pattern.search(line)
|
112 |
gpu_info = {}
|
113 |
if name_match:
|
114 |
+
gpu_name = ''.join(filter(None, name_match.groups())).strip()
|
115 |
if match:
|
116 |
temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
|
117 |
gpu_info.update({
|