Spaces:
Running
Running
Jae-Won Chung
committed on
Commit
·
76bf85e
1
Parent(s):
4e4fca8
Leaderboard tweaks
Browse files- app.py +49 -20
- data/diffusion/image-to-video/models.json +1 -1
app.py
CHANGED
|
@@ -229,6 +229,13 @@ class LLMChatTableManager(LLMTableManager):
|
|
| 229 |
def get_detail_text(self, detail_mode: bool) -> str:
|
| 230 |
if detail_mode:
|
| 231 |
text = """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
Columns
|
| 233 |
- **Model**: The name of the model.
|
| 234 |
- **Params (B)**: Number of parameters in the model.
|
|
@@ -242,10 +249,6 @@ class LLMChatTableManager(LLMTableManager):
|
|
| 242 |
- **Avg BS**: Average batch size of the serving engine over time.
|
| 243 |
- **Max BS**: Maximum batch size configuration of the serving engine.
|
| 244 |
|
| 245 |
-
**TPOT (Time Per Output Token)** is the time between each token generated by LLMs as part of their response.
|
| 246 |
-
An average TPOT of 0.20 seconds roughly corresponds to a person reading at 240 words per minute and assuming one word is 1.3 tokens on average.
|
| 247 |
-
You can tweak the TPOT slider to adjust the target average TPOT for the models.
|
| 248 |
-
|
| 249 |
For more detailed information, please take a look at the **About** tab.
|
| 250 |
"""
|
| 251 |
else:
|
|
@@ -290,6 +293,13 @@ class LLMCodeTableManager(LLMTableManager):
|
|
| 290 |
def get_detail_text(self, detail_mode: bool) -> str:
|
| 291 |
if detail_mode:
|
| 292 |
text = """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
Columns
|
| 294 |
- **Model**: The name of the model.
|
| 295 |
- **Params (B)**: Number of parameters in the model.
|
|
@@ -303,10 +313,6 @@ class LLMCodeTableManager(LLMTableManager):
|
|
| 303 |
- **Avg BS**: Average batch size of the serving engine over time.
|
| 304 |
- **Max BS**: Maximum batch size configuration of the serving engine.
|
| 305 |
|
| 306 |
-
**TPOT (Time Per Output Token)** is the time between each token generated by LLMs as part of their response.
|
| 307 |
-
An average TPOT of 0.20 seconds roughly corresponds to a person reading at 240 words per minute and assuming one word is 1.3 tokens on average.
|
| 308 |
-
You can tweak the TPOT slider to adjust the target average TPOT for the models.
|
| 309 |
-
|
| 310 |
For more detailed information, please take a look at the **About** tab.
|
| 311 |
"""
|
| 312 |
else:
|
|
@@ -350,6 +356,13 @@ class VLMChatTableManager(LLMTableManager):
|
|
| 350 |
def get_detail_text(self, detail_mode: bool) -> str:
|
| 351 |
if detail_mode:
|
| 352 |
text = """
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
Columns
|
| 354 |
- **Model**: The name of the model.
|
| 355 |
- **Params (B)**: Number of parameters in the model.
|
|
@@ -363,10 +376,6 @@ class VLMChatTableManager(LLMTableManager):
|
|
| 363 |
- **Avg BS**: Average batch size of the serving engine over time.
|
| 364 |
- **Max BS**: Maximum batch size configuration of the serving engine.
|
| 365 |
|
| 366 |
-
**TPOT (Time Per Output Token)** is the time between each token generated by LLMs as part of their response.
|
| 367 |
-
An average TPOT of 0.20 seconds roughly corresponds to a person reading at 240 words per minute and assuming one word is 1.3 tokens on average.
|
| 368 |
-
You can tweak the TPOT slider to adjust the target average TPOT for the models.
|
| 369 |
-
|
| 370 |
For more detailed information, please take a look at the **About** tab.
|
| 371 |
"""
|
| 372 |
else:
|
|
@@ -499,7 +508,7 @@ class DiffusionTableManager(TableManager):
|
|
| 499 |
)
|
| 500 |
|
| 501 |
if not detail_mode:
|
| 502 |
-
core_columns = ["Model", "Denoising params", "GPU", "
|
| 503 |
readable_name_mapping = {
|
| 504 |
"Denoising params": "Denoising parameters (Billions)",
|
| 505 |
"GPU": "GPU model",
|
|
@@ -521,7 +530,9 @@ class DiffusionT2ITableManager(DiffusionTableManager):
|
|
| 521 |
|
| 522 |
def get_intro_text(self) -> str:
|
| 523 |
text = """
|
| 524 |
-
<h2>
|
|
|
|
|
|
|
| 525 |
|
| 526 |
<p style="font-size: 16px">
|
| 527 |
Diffusion models generate images that align with input text prompts.
|
|
@@ -537,6 +548,9 @@ class DiffusionT2ITableManager(DiffusionTableManager):
|
|
| 537 |
def get_detail_text(self, detail_mode: bool) -> str:
|
| 538 |
if detail_mode:
|
| 539 |
text = """
|
|
|
|
|
|
|
|
|
|
| 540 |
Columns
|
| 541 |
- **Model**: The name of the model.
|
| 542 |
- **Denoising params**: Number of parameters in the denoising module (e.g., UNet, Transformer).
|
|
@@ -557,6 +571,7 @@ class DiffusionT2ITableManager(DiffusionTableManager):
|
|
| 557 |
- **Denoising parameters (Billions)**: Number of parameters in the diffusion model's (core) denoising module. This part of the model is run repetitively to gradually refine the image.
|
| 558 |
- **GPU model**: Name of the GPU model used for benchmarking.
|
| 559 |
- **Energy per image (Joules)**: Energy consumed for each generated image in Joules.
|
|
|
|
| 560 |
|
| 561 |
Checking "Show more technical details" above the table will reveal more detailed columns.
|
| 562 |
Also, for more detailed information, please take a look at the **About** tab.
|
|
@@ -575,7 +590,9 @@ class DiffusionT2VTableManager(DiffusionTableManager):
|
|
| 575 |
|
| 576 |
def get_intro_text(self) -> str:
|
| 577 |
text = """
|
| 578 |
-
<h2>
|
|
|
|
|
|
|
| 579 |
|
| 580 |
<p style="font-size: 16px">
|
| 581 |
Diffusion models generate videos that align with input text prompts.
|
|
@@ -591,6 +608,9 @@ class DiffusionT2VTableManager(DiffusionTableManager):
|
|
| 591 |
def get_detail_text(self, detail_mode: bool) -> str:
|
| 592 |
if detail_mode:
|
| 593 |
text = """
|
|
|
|
|
|
|
|
|
|
| 594 |
Columns
|
| 595 |
- **Model**: The name of the model.
|
| 596 |
- **Denoising params**: Number of parameters in the denoising module (e.g., UNet, Transformer).
|
|
@@ -612,6 +632,8 @@ class DiffusionT2VTableManager(DiffusionTableManager):
|
|
| 612 |
- **Denoising parameters (Billions)**: Number of parameters in the diffusion model's (core) denoising module. This part of the model is run repetitively to gradually refine the video.
|
| 613 |
- **GPU model**: Name of the GPU model used for benchmarking.
|
| 614 |
- **Energy per video (Joules)**: Energy consumed for each generated video in Joules.
|
|
|
|
|
|
|
| 615 |
|
| 616 |
Checking "Show more technical details" above the table will reveal more detailed columns.
|
| 617 |
Also, for more detailed information, please take a look at the **About** tab.
|
|
@@ -630,7 +652,9 @@ class DiffusionI2VTableManager(DiffusionTableManager):
|
|
| 630 |
|
| 631 |
def get_intro_text(self) -> str:
|
| 632 |
text = """
|
| 633 |
-
<h2>
|
|
|
|
|
|
|
| 634 |
|
| 635 |
<p style="font-size: 16px">
|
| 636 |
Diffusion models generate videos given an input image (and sometimes alongside text).
|
|
@@ -646,6 +670,9 @@ class DiffusionI2VTableManager(DiffusionTableManager):
|
|
| 646 |
def get_detail_text(self, detail_mode: bool) -> str:
|
| 647 |
if detail_mode:
|
| 648 |
text = """
|
|
|
|
|
|
|
|
|
|
| 649 |
Columns
|
| 650 |
- **Model**: The name of the model.
|
| 651 |
- **Denoising params**: Number of parameters in the denoising module (e.g., UNet, Transformer).
|
|
@@ -667,6 +694,8 @@ class DiffusionI2VTableManager(DiffusionTableManager):
|
|
| 667 |
- **Denoising parameters (Billions)**: Number of parameters in the diffusion model's (core) denoising module. This part of the model is run repetitively to gradually refine the video.
|
| 668 |
- **GPU model**: Name of the GPU model used for benchmarking.
|
| 669 |
- **Energy per video (Joules)**: Energy consumed for each generated video in Joules.
|
|
|
|
|
|
|
| 670 |
|
| 671 |
Checking "Show more technical details" above the table will reveal more detailed columns.
|
| 672 |
Also, for more detailed information, please take a look at the **About** tab.
|
|
@@ -674,7 +703,7 @@ class DiffusionI2VTableManager(DiffusionTableManager):
|
|
| 674 |
return text
|
| 675 |
|
| 676 |
def get_benchmark_sliders(self) -> dict[str, tuple[float, float, float, float]]:
|
| 677 |
-
return {"Batch latency (s)": (0.0, 120.0, 1.0,
|
| 678 |
|
| 679 |
|
| 680 |
class LegacyTableManager:
|
|
@@ -718,7 +747,7 @@ class LegacyTableManager:
|
|
| 718 |
self.full_df = df
|
| 719 |
|
| 720 |
# Default view of the table is to only show the first options.
|
| 721 |
-
self.set_filter_get_df(
|
| 722 |
|
| 723 |
def _read_tables(self, data_dir: str) -> pd.DataFrame:
|
| 724 |
"""Read tables."""
|
|
@@ -777,7 +806,7 @@ class LegacyTableManager:
|
|
| 777 |
gr.Dropdown.update(choices=["None", *columns]),
|
| 778 |
]
|
| 779 |
|
| 780 |
-
def set_filter_get_df(self,
|
| 781 |
"""Set the current set of filters and return the filtered DataFrame."""
|
| 782 |
# If the filter is empty, we default to the first choice for each key.
|
| 783 |
if not filters:
|
|
@@ -1027,7 +1056,7 @@ def consumed_more_energy_message(energy_a, energy_b):
|
|
| 1027 |
# Colosseum event handlers
|
| 1028 |
def on_load():
|
| 1029 |
"""Intialize the dataframe, shuffle the model preference dropdown choices."""
|
| 1030 |
-
dataframe = global_ltbm.set_filter_get_df(
|
| 1031 |
dataframes = [global_tbm.set_filter_get_df(detail_mode=False) for global_tbm in global_tbms]
|
| 1032 |
return dataframe, *dataframes
|
| 1033 |
|
|
|
|
| 229 |
def get_detail_text(self, detail_mode: bool) -> str:
|
| 230 |
if detail_mode:
|
| 231 |
text = """
|
| 232 |
+
**TPOT (Time Per Output Token)** is the time between each token generated by LLMs as part of their response.
|
| 233 |
+
An average TPOT of 0.20 seconds roughly corresponds to a person reading at 240 words per minute and assuming one word is 1.3 tokens on average.
|
| 234 |
+
You can tweak the TPOT slider to adjust the target average TPOT for the models.
|
| 235 |
+
|
| 236 |
+
Each row corresponds to one model, given a constraint on the maximum average TPOT.
|
| 237 |
+
If more than one GPU type was chosen, the row shows results from the GPU with the lowest energy consumption per request.
|
| 238 |
+
|
| 239 |
Columns
|
| 240 |
- **Model**: The name of the model.
|
| 241 |
- **Params (B)**: Number of parameters in the model.
|
|
|
|
| 249 |
- **Avg BS**: Average batch size of the serving engine over time.
|
| 250 |
- **Max BS**: Maximum batch size configuration of the serving engine.
|
| 251 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
For more detailed information, please take a look at the **About** tab.
|
| 253 |
"""
|
| 254 |
else:
|
|
|
|
| 293 |
def get_detail_text(self, detail_mode: bool) -> str:
|
| 294 |
if detail_mode:
|
| 295 |
text = """
|
| 296 |
+
**TPOT (Time Per Output Token)** is the time between each token generated by LLMs as part of their response.
|
| 297 |
+
An average TPOT of 0.20 seconds roughly corresponds to a person reading at 240 words per minute and assuming one word is 1.3 tokens on average.
|
| 298 |
+
You can tweak the TPOT slider to adjust the target average TPOT for the models.
|
| 299 |
+
|
| 300 |
+
Each row corresponds to one model, given a constraint on the maximum average TPOT.
|
| 301 |
+
If more than one GPU type was chosen, the row shows results from the GPU with the lowest energy consumption per request.
|
| 302 |
+
|
| 303 |
Columns
|
| 304 |
- **Model**: The name of the model.
|
| 305 |
- **Params (B)**: Number of parameters in the model.
|
|
|
|
| 313 |
- **Avg BS**: Average batch size of the serving engine over time.
|
| 314 |
- **Max BS**: Maximum batch size configuration of the serving engine.
|
| 315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
For more detailed information, please take a look at the **About** tab.
|
| 317 |
"""
|
| 318 |
else:
|
|
|
|
| 356 |
def get_detail_text(self, detail_mode: bool) -> str:
|
| 357 |
if detail_mode:
|
| 358 |
text = """
|
| 359 |
+
**TPOT (Time Per Output Token)** is the time between each token generated by LLMs as part of their response.
|
| 360 |
+
An average TPOT of 0.20 seconds roughly corresponds to a person reading at 240 words per minute and assuming one word is 1.3 tokens on average.
|
| 361 |
+
You can tweak the TPOT slider to adjust the target average TPOT for the models.
|
| 362 |
+
|
| 363 |
+
Each row corresponds to one model, given a constraint on the maximum average TPOT.
|
| 364 |
+
If more than one GPU type was chosen, the row shows results from the GPU with the lowest energy consumption per request.
|
| 365 |
+
|
| 366 |
Columns
|
| 367 |
- **Model**: The name of the model.
|
| 368 |
- **Params (B)**: Number of parameters in the model.
|
|
|
|
| 376 |
- **Avg BS**: Average batch size of the serving engine over time.
|
| 377 |
- **Max BS**: Maximum batch size configuration of the serving engine.
|
| 378 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 379 |
For more detailed information, please take a look at the **About** tab.
|
| 380 |
"""
|
| 381 |
else:
|
|
|
|
| 508 |
)
|
| 509 |
|
| 510 |
if not detail_mode:
|
| 511 |
+
core_columns = ["Model", "Denoising params", "GPU", "Resolution", "Frames", self.energy_col]
|
| 512 |
readable_name_mapping = {
|
| 513 |
"Denoising params": "Denoising parameters (Billions)",
|
| 514 |
"GPU": "GPU model",
|
|
|
|
| 530 |
|
| 531 |
def get_intro_text(self) -> str:
|
| 532 |
text = """
|
| 533 |
+
<h2>How much energy do GenAI models consume?</h2>
|
| 534 |
+
|
| 535 |
+
<h3>Diffusion text-to-image generation</h3>
|
| 536 |
|
| 537 |
<p style="font-size: 16px">
|
| 538 |
Diffusion models generate images that align with input text prompts.
|
|
|
|
| 548 |
def get_detail_text(self, detail_mode: bool) -> str:
|
| 549 |
if detail_mode:
|
| 550 |
text = """
|
| 551 |
+
Each row corresponds to one model, given a constraint on the maximum computation time for the whole batch.
|
| 552 |
+
If more than one GPU type was chosen, the row shows results from the GPU with the lowest energy consumption per image.
|
| 553 |
+
|
| 554 |
Columns
|
| 555 |
- **Model**: The name of the model.
|
| 556 |
- **Denoising params**: Number of parameters in the denoising module (e.g., UNet, Transformer).
|
|
|
|
| 571 |
- **Denoising parameters (Billions)**: Number of parameters in the diffusion model's (core) denoising module. This part of the model is run repetitively to gradually refine the image.
|
| 572 |
- **GPU model**: Name of the GPU model used for benchmarking.
|
| 573 |
- **Energy per image (Joules)**: Energy consumed for each generated image in Joules.
|
| 574 |
+
- **Resolution**: Resolution of the generated image.
|
| 575 |
|
| 576 |
Checking "Show more technical details" above the table will reveal more detailed columns.
|
| 577 |
Also, for more detailed information, please take a look at the **About** tab.
|
|
|
|
| 590 |
|
| 591 |
def get_intro_text(self) -> str:
|
| 592 |
text = """
|
| 593 |
+
<h2>How much energy do GenAI models consume?</h2>
|
| 594 |
+
|
| 595 |
+
<h3>Diffusion text-to-video generation</h3>
|
| 596 |
|
| 597 |
<p style="font-size: 16px">
|
| 598 |
Diffusion models generate videos that align with input text prompts.
|
|
|
|
| 608 |
def get_detail_text(self, detail_mode: bool) -> str:
|
| 609 |
if detail_mode:
|
| 610 |
text = """
|
| 611 |
+
Each row corresponds to one model, given a constraint on the maximum computation time for the whole batch.
|
| 612 |
+
If more than one GPU type was chosen, the row shows results from the GPU with the lowest energy consumption per video.
|
| 613 |
+
|
| 614 |
Columns
|
| 615 |
- **Model**: The name of the model.
|
| 616 |
- **Denoising params**: Number of parameters in the denoising module (e.g., UNet, Transformer).
|
|
|
|
| 632 |
- **Denoising parameters (Billions)**: Number of parameters in the diffusion model's (core) denoising module. This part of the model is run repetitively to gradually refine the video.
|
| 633 |
- **GPU model**: Name of the GPU model used for benchmarking.
|
| 634 |
- **Energy per video (Joules)**: Energy consumed for each generated video in Joules.
|
| 635 |
+
- **Frames**: Number of frames in the generated video.
|
| 636 |
+
- **Resolution**: Resolution of the generated video.
|
| 637 |
|
| 638 |
Checking "Show more technical details" above the table will reveal more detailed columns.
|
| 639 |
Also, for more detailed information, please take a look at the **About** tab.
|
|
|
|
| 652 |
|
| 653 |
def get_intro_text(self) -> str:
|
| 654 |
text = """
|
| 655 |
+
<h2>How much energy do GenAI models consume?</h2>
|
| 656 |
+
|
| 657 |
+
<h3>Diffusion image-to-video generation</h3>
|
| 658 |
|
| 659 |
<p style="font-size: 16px">
|
| 660 |
Diffusion models generate videos given an input image (and sometimes alongside text).
|
|
|
|
| 670 |
def get_detail_text(self, detail_mode: bool) -> str:
|
| 671 |
if detail_mode:
|
| 672 |
text = """
|
| 673 |
+
Each row corresponds to one model, given a constraint on the maximum computation time for the whole batch.
|
| 674 |
+
If more than one GPU type was chosen, the row shows results from the GPU with the lowest energy consumption per video.
|
| 675 |
+
|
| 676 |
Columns
|
| 677 |
- **Model**: The name of the model.
|
| 678 |
- **Denoising params**: Number of parameters in the denoising module (e.g., UNet, Transformer).
|
|
|
|
| 694 |
- **Denoising parameters (Billions)**: Number of parameters in the diffusion model's (core) denoising module. This part of the model is run repetitively to gradually refine the video.
|
| 695 |
- **GPU model**: Name of the GPU model used for benchmarking.
|
| 696 |
- **Energy per video (Joules)**: Energy consumed for each generated video in Joules.
|
| 697 |
+
- **Frames**: Number of frames in the generated video.
|
| 698 |
+
- **Resolution**: Resolution of the generated video.
|
| 699 |
|
| 700 |
Checking "Show more technical details" above the table will reveal more detailed columns.
|
| 701 |
Also, for more detailed information, please take a look at the **About** tab.
|
|
|
|
| 703 |
return text
|
| 704 |
|
| 705 |
def get_benchmark_sliders(self) -> dict[str, tuple[float, float, float, float]]:
|
| 706 |
+
return {"Batch latency (s)": (0.0, 120.0, 1.0, 60.0)}
|
| 707 |
|
| 708 |
|
| 709 |
class LegacyTableManager:
|
|
|
|
| 747 |
self.full_df = df
|
| 748 |
|
| 749 |
# Default view of the table is to only show the first options.
|
| 750 |
+
self.set_filter_get_df()
|
| 751 |
|
| 752 |
def _read_tables(self, data_dir: str) -> pd.DataFrame:
|
| 753 |
"""Read tables."""
|
|
|
|
| 806 |
gr.Dropdown.update(choices=["None", *columns]),
|
| 807 |
]
|
| 808 |
|
| 809 |
+
def set_filter_get_df(self, *filters) -> pd.DataFrame:
|
| 810 |
"""Set the current set of filters and return the filtered DataFrame."""
|
| 811 |
# If the filter is empty, we default to the first choice for each key.
|
| 812 |
if not filters:
|
|
|
|
| 1056 |
# Colosseum event handlers
|
| 1057 |
def on_load():
|
| 1058 |
"""Intialize the dataframe, shuffle the model preference dropdown choices."""
|
| 1059 |
+
dataframe = global_ltbm.set_filter_get_df()
|
| 1060 |
dataframes = [global_tbm.set_filter_get_df(detail_mode=False) for global_tbm in global_tbms]
|
| 1061 |
return dataframe, *dataframes
|
| 1062 |
|
data/diffusion/image-to-video/models.json
CHANGED
|
@@ -15,7 +15,7 @@
|
|
| 15 |
},
|
| 16 |
"stabilityai/stable-video-diffusion-img2vid-xt": {
|
| 17 |
"url": "https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt",
|
| 18 |
-
"nickname": "Stable Video Diffusion
|
| 19 |
"total_params": 2.3,
|
| 20 |
"denoising_params": 1.5,
|
| 21 |
"resolution": "1024x576"
|
|
|
|
| 15 |
},
|
| 16 |
"stabilityai/stable-video-diffusion-img2vid-xt": {
|
| 17 |
"url": "https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt",
|
| 18 |
+
"nickname": "Stable Video Diffusion XT",
|
| 19 |
"total_params": 2.3,
|
| 20 |
"denoising_params": 1.5,
|
| 21 |
"resolution": "1024x576"
|