Spaces:
Runtime error
Runtime error
Commit
·
c2dbb45
1
Parent(s):
ae99472
Fix bugs in quantization
Browse files
src/backend/hflm_with_measurement.py
CHANGED
|
@@ -315,6 +315,15 @@ class HFLMWithMeasurement(HFLM):
|
|
| 315 |
generation_kwargs.pop("is_gsm8k")
|
| 316 |
|
| 317 |
context_length = context.shape[1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
|
| 319 |
if not is_gsm8k:
|
| 320 |
# build stopping criteria
|
|
@@ -356,8 +365,6 @@ class HFLMWithMeasurement(HFLM):
|
|
| 356 |
|
| 357 |
model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
|
| 358 |
model_size_param = get_model_size(model_info=model_info, precision=self.precision)
|
| 359 |
-
|
| 360 |
-
model_config = self.model.config
|
| 361 |
|
| 362 |
n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else model_config.num_layers
|
| 363 |
d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
|
|
|
|
| 315 |
generation_kwargs.pop("is_gsm8k")
|
| 316 |
|
| 317 |
context_length = context.shape[1]
|
| 318 |
+
model_config = self.model.config
|
| 319 |
+
|
| 320 |
+
if not self.precision:
|
| 321 |
+
if model_config.quantization_config._load_in_4bit:
|
| 322 |
+
self.precision = "4bit"
|
| 323 |
+
elif model_config.quantization_config._load_in_8bit:
|
| 324 |
+
self.precision = "8bit"
|
| 325 |
+
else:
|
| 326 |
+
raise ValueError("Unknown precision")
|
| 327 |
|
| 328 |
if not is_gsm8k:
|
| 329 |
# build stopping criteria
|
|
|
|
| 365 |
|
| 366 |
model_info = API.model_info(repo_id=self.pretrained, revision=self.revision)
|
| 367 |
model_size_param = get_model_size(model_info=model_info, precision=self.precision)
|
|
|
|
|
|
|
| 368 |
|
| 369 |
n_layers = model_config.num_hidden_layers if hasattr(model_config, "num_hidden_layers") else model_config.num_layers
|
| 370 |
d_model = model_config.hidden_size if hasattr(model_config, "hidden_size") else model_config.d_model
|
src/backend/tasks/measurement_task_utils.py
CHANGED
|
@@ -12,6 +12,9 @@ def process_results_decorator(func):
|
|
| 12 |
end_to_end_time = sum([r[1] for r in results]) / len(results)
|
| 13 |
prefilling_time = sum([r[2] for r in results]) / len(results)
|
| 14 |
decoding_throughput = sum([r[3] for r in results]) / len(results)
|
|
|
|
|
|
|
|
|
|
| 15 |
# print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
|
| 16 |
|
| 17 |
# Now call the original process_results with the processed results
|
|
@@ -19,6 +22,8 @@ def process_results_decorator(func):
|
|
| 19 |
result_dict["end_to_end_time"] = end_to_end_time
|
| 20 |
result_dict["prefilling_time"] = prefilling_time
|
| 21 |
result_dict["decoding_throughput"] = decoding_throughput
|
|
|
|
|
|
|
| 22 |
return result_dict
|
| 23 |
return wrapper
|
| 24 |
|
|
@@ -30,6 +35,8 @@ def aggregation_decorator(func):
|
|
| 30 |
aggregation_list["end_to_end_time"] = mean
|
| 31 |
aggregation_list["prefilling_time"] = mean
|
| 32 |
aggregation_list["decoding_throughput"] = mean
|
|
|
|
|
|
|
| 33 |
return aggregation_list
|
| 34 |
return wrapper
|
| 35 |
|
|
@@ -41,6 +48,8 @@ def higher_is_better_decorator(func):
|
|
| 41 |
higher_is_better_dict["end_to_end_time"] = False
|
| 42 |
higher_is_better_dict["prefilling_time"] = False
|
| 43 |
higher_is_better_dict["decoding_throughput"] = True
|
|
|
|
|
|
|
| 44 |
return higher_is_better_dict
|
| 45 |
return wrapper
|
| 46 |
|
|
|
|
| 12 |
end_to_end_time = sum([r[1] for r in results]) / len(results)
|
| 13 |
prefilling_time = sum([r[2] for r in results]) / len(results)
|
| 14 |
decoding_throughput = sum([r[3] for r in results]) / len(results)
|
| 15 |
+
mfu = sum([r[4] for r in results]) / len(results)
|
| 16 |
+
mbu = sum([r[5] for r in results]) / len(results)
|
| 17 |
+
|
| 18 |
# print(f"end_to_end_time: {end_to_end_time}, prefilling_time: {prefilling_time}, decoding_throughput: {decoding_throughput}")
|
| 19 |
|
| 20 |
# Now call the original process_results with the processed results
|
|
|
|
| 22 |
result_dict["end_to_end_time"] = end_to_end_time
|
| 23 |
result_dict["prefilling_time"] = prefilling_time
|
| 24 |
result_dict["decoding_throughput"] = decoding_throughput
|
| 25 |
+
result_dict["mfu"] = mfu
|
| 26 |
+
result_dict["mbu"] = mbu
|
| 27 |
return result_dict
|
| 28 |
return wrapper
|
| 29 |
|
|
|
|
| 35 |
aggregation_list["end_to_end_time"] = mean
|
| 36 |
aggregation_list["prefilling_time"] = mean
|
| 37 |
aggregation_list["decoding_throughput"] = mean
|
| 38 |
+
aggregation_list["mfu"] = mean
|
| 39 |
+
aggregation_list["mbu"] = mean
|
| 40 |
return aggregation_list
|
| 41 |
return wrapper
|
| 42 |
|
|
|
|
| 48 |
higher_is_better_dict["end_to_end_time"] = False
|
| 49 |
higher_is_better_dict["prefilling_time"] = False
|
| 50 |
higher_is_better_dict["decoding_throughput"] = True
|
| 51 |
+
higher_is_better_dict["mfu"] = True
|
| 52 |
+
higher_is_better_dict["mbu"] = True
|
| 53 |
return higher_is_better_dict
|
| 54 |
return wrapper
|
| 55 |
|
src/utils.py
CHANGED
|
@@ -98,7 +98,8 @@ def parse_nvidia_smi():
|
|
| 98 |
gpu_stats = []
|
| 99 |
|
| 100 |
gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
|
| 101 |
-
gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
|
|
|
|
| 102 |
|
| 103 |
gpu_name = ""
|
| 104 |
for index in gpu_indices:
|
|
@@ -110,7 +111,7 @@ def parse_nvidia_smi():
|
|
| 110 |
name_match = gpu_name_pattern.search(line)
|
| 111 |
gpu_info = {}
|
| 112 |
if name_match:
|
| 113 |
-
gpu_name = name_match.
|
| 114 |
if match:
|
| 115 |
temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
|
| 116 |
gpu_info.update({
|
|
|
|
| 98 |
gpu_stats = []
|
| 99 |
|
| 100 |
gpu_info_pattern = re.compile(r'(\d+)C\s+P\d+\s+(\d+)W / \d+W\s+\|\s+(\d+)MiB / \d+MiB\s+\|\s+(\d+)%')
|
| 101 |
+
# gpu_name_pattern = re.compile(r'NVIDIA\s+([\w\s]+\d+(?:\s*GB)?)')
|
| 102 |
+
gpu_name_pattern = re.compile(r'NVIDIA\s+(RTX\s+)?([A-Z0-9]+)')
|
| 103 |
|
| 104 |
gpu_name = ""
|
| 105 |
for index in gpu_indices:
|
|
|
|
| 111 |
name_match = gpu_name_pattern.search(line)
|
| 112 |
gpu_info = {}
|
| 113 |
if name_match:
|
| 114 |
+
gpu_name = ''.join(filter(None, name_match.groups())).strip()
|
| 115 |
if match:
|
| 116 |
temp, power_usage, mem_usage, gpu_util = map(int, match.groups())
|
| 117 |
gpu_info.update({
|