Spaces:

joaogante
/

assisted_generation_benchmarks

Running

App Files Files Community

joaogante committed on May 5, 2023

Commit

39b15f4

1 Parent(s): 7cd710e

working plots

Browse files

Files changed (4) hide show

__pycache__/app.cpython-310.pyc +0 -0
app.py +159 -219
data.csv +65 -0
plt.png +0 -0

__pycache__/app.cpython-310.pyc ADDED Viewed

Binary file (6.36 kB). View file

app.py CHANGED Viewed

@@ -2,280 +2,220 @@ import matplotlib
 matplotlib.use('Agg')
 import functools
 import gradio as gr
 import matplotlib.pyplot as plt
 import seaborn as sns
 import pandas as pd
-# benchmark order: pytorch, tf eager, tf xla; units = ms
-BENCHMARK_DATA = {
-    "Greedy Decoding": {
-        "DistilGPT2": {
-            "T4": [336.22, 3976.23, 115.84],
-            "3090": [158.38, 1835.82, 46.56],
-            "A100": [371.49, 4073.84, 60.94],
-        },
-        "GPT2": {
-            "T4": [607.31, 7140.23, 185.12],
-            "3090": [297.03, 3308.31, 76.68],
-            "A100": [691.75, 7323.60, 110.72],
-        },
-        "OPT-1.3B": {
-            "T4": [1303.41, 15939.07, 1488.15],
-            "3090": [428.33, 7259.43, 468.37],
-            "A100": [1125.00, 16713.63, 384.52],
-        },
-        "GPTJ-6B": {
-            "T4": [0, 0, 0],
-            "3090": [0, 0, 0],
-            "A100": [2664.28, 32783.09, 1440.06],
-        },
-        "T5 Small": {
-            "T4": [99.88, 1527.73, 18.78],
-            "3090": [55.09, 665.70, 9.25],
-            "A100": [124.91, 1642.07, 13.72],
-        },
-        "T5 Base": {
-            "T4": [416.56, 6095.05, 106.12],
-            "3090": [223.00, 2503.28, 46.67],
-            "A100": [550.76, 6504.11, 64.57],
-        },
-        "T5 Large": {
-            "T4": [645.05, 9587.67, 225.17],
-            "3090": [377.74, 4216.41, 97.92],
-            "A100": [944.17, 10572.43, 116.52],
-        },
-        "T5 3B": {
-            "T4": [1493.61, 13629.80, 1494.80],
-            "3090": [694.75, 6316.79, 489.33],
-            "A100": [1801.68, 16707.71, 411.93],
-        },
-    },
-    "Sampling": {
-        "DistilGPT2": {
-            "T4": [617.40, 6078.81, 221.65],
-            "3090": [310.37, 2843.73, 85.44],
-            "A100": [729.05, 7140.05, 121.83],
-        },
-        "GPT2": {
-            "T4": [1205.34, 12256.98, 378.69],
-            "3090": [577.12, 5637.11, 160.02],
-            "A100": [1377.68, 15605.72, 234.47],
-        },
-        "OPT-1.3B": {
-            "T4": [2166.72, 19126.25, 2341.32],
-            "3090": [706.50, 9616.97, 731.58],
-            "A100": [2019.70, 28621.09, 690.36],
-        },
-        "GPTJ-6B": {
-            "T4": [0, 0, 0],
-            "3090": [0, 0, 0],
-            "A100": [5150.35, 70554.07, 2744.49],
-        },
-        "T5 Small": {
-            "T4": [235.93, 3599.47, 41.07],
-            "3090": [100.41, 1093.33, 23.24],
-            "A100": [267.42, 3366.73, 28.53],
-        },
-        "T5 Base": {
-            "T4": [812.59, 7966.73, 196.85],
-            "3090": [407.81, 4904.54, 97.56],
-            "A100": [1033.05, 11521.97, 123.93],
-        },
-        "T5 Large": {
-            "T4": [1114.22, 16433.31, 424.91],
-            "3090": [647.61, 7184.71, 160.97],
-            "A100": [1668.73, 19962.78, 200.75],
-        },
-        "T5 3B": {
-            "T4": [2282.56, 20891.22, 2196.02],
-            "3090": [1011.32, 9735.97, 734.40],
-            "A100": [2769.64, 26440.65, 612.98],
-        },
-    },
-    "Beam Search": {
-        "DistilGPT2": {
-            "T4": [2407.89, 19442.60, 3313.92],
-            "3090": [998.52, 8286.03, 900.28],
-            "A100": [2237.41, 21771.40, 760.47],
-        },
-        "GPT2": {
-            "T4": [3767.43, 34813.93, 5559.42],
-            "3090": [1633.04, 14606.93, 1533.55],
-            "A100": [3705.43, 34586.23, 1295.87],
-        },
-        "OPT-1.3B": {
-            "T4": [16649.82, 78500.33, 21894.31],
-            "3090": [508518, 32822.81, 5762.46],
-            "A100": [5967.32, 78334.56, 4096.38],
-        },
-        "GPTJ-6B": {
-            "T4": [0, 0, 0],
-            "3090": [0, 0, 0],
-            "A100": [15119.10, 134000.40, 10214.17],
-        },
-        "T5 Small": {
-            "T4": [283.64, 25089.12, 1391.66],
-            "3090": [137.38, 10680.28, 486.96],
-            "A100": [329.28, 24747.38, 513.99],
-        },
-        "T5 Base": {
-            "T4": [1383.21, 44809.14, 3920.40],
-            "3090": [723.11, 18657.48, 1258.60],
-            "A100": [2360.85, 45085.07, 1107.58],
-        },
-        "T5 Large": {
-            "T4": [1663.50, 81902.41, 9551.29],
-            "3090": [922.53, 35524.30, 2838.86],
-            "A100": [2168.22, 86890.00, 2373.04],
-        },
-        "T5 3B": {
-            "T4": [0, 0, 0],
-            "3090": [1521.05, 35337.30, 8282.09],
-            "A100": [3162.54, 88453.65, 5585.20],
-        },
-    },
-}
 FIGURE_PATH = "plt.png"
 FIG_DPI = 300
-def get_plot(model_name, plot_eager, generate_type):
-    df = pd.DataFrame(BENCHMARK_DATA[generate_type][model_name])
-    df["framework"] = ["PyTorch", "TF (Eager Execution)", "TF (XLA)"]
-    df = pd.melt(df, id_vars=["framework"], value_vars=["T4", "3090", "A100"])
-    if plot_eager == "No":
-        df = df[df["framework"] != "TF (Eager Execution)"]
     g = sns.catplot(
         data=df,
         kind="bar",
-        x="variable",
-        y="value",
-        hue="framework",
-        palette={"PyTorch": "blue", "TF (Eager Execution)": "orange", "TF (XLA)": "red"},
         alpha=.9,
     )
     g.despine(left=True)
-    g.set_axis_labels("GPU", "Generation time (ms)")
-    g.legend.set_title("Framework")
     # Add the number to the top of each bar
     ax = g.facet_axis(0, 0)
     for i in ax.containers:
-        ax.bar_label(i,)
     plt.savefig(FIGURE_PATH, dpi=FIG_DPI)
     return FIGURE_PATH
 demo = gr.Blocks()
 with demo:
     gr.Markdown(
         """
-        # TensorFlow XLA Text Generation Benchmark
-        Instructions:
-        1. Pick a tab for the type of generation (or for benchmark information);
-        2. Select a model from the dropdown menu;
-        3. Optionally omit results from TensorFlow Eager Execution, if you wish to better compare the performance of
-        PyTorch to TensorFlow with XLA.
         """
     )
     with gr.Tabs():
-        with gr.TabItem("Greedy Decoding"):
-            plot_fn = functools.partial(get_plot, generate_type="Greedy Decoding")
             with gr.Row():
-                with gr.Column():
-                    model_selector = gr.Dropdown(
-                        choices=["DistilGPT2", "GPT2", "OPT-1.3B", "GPTJ-6B", "T5 Small", "T5 Base", "T5 Large", "T5 3B"],
-                        value="T5 Small",
-                        label="Model",
-                        interactive=True,
-                    )
-                    eager_enabler = gr.Radio(
-                        ["Yes", "No"],
-                        value="Yes",
-                        label="Plot TF Eager Execution?",
-                        interactive=True
-                    )
                     gr.Markdown(
                         """
-                        ### Greedy Decoding benchmark parameters
-                        - `max_new_tokens = 64`;
-                        - `pad_to_multiple_of = 64` for Tensorflow XLA models. Others do not pad (input prompts between 2 and 33 tokens).
                         """
                     )
-                plot = gr.Image(value=plot_fn("T5 Small", "Yes"))  # Show plot when the gradio app is initialized
-            model_selector.change(fn=plot_fn, inputs=[model_selector, eager_enabler], outputs=plot)
-            eager_enabler.change(fn=plot_fn, inputs=[model_selector, eager_enabler], outputs=plot)
-        with gr.TabItem("Sampling"):
-            plot_fn = functools.partial(get_plot, generate_type="Sampling")
             with gr.Row():
-                with gr.Column():
-                    model_selector = gr.Dropdown(
-                        choices=["DistilGPT2", "GPT2", "OPT-1.3B", "GPTJ-6B", "T5 Small", "T5 Base", "T5 Large", "T5 3B"],
-                        value="T5 Small",
-                        label="Model",
-                        interactive=True,
-                    )
-                    eager_enabler = gr.Radio(
-                        ["Yes", "No"],
-                        value="Yes",
-                        label="Plot TF Eager Execution?",
-                        interactive=True
-                    )
                     gr.Markdown(
                         """
-                        ### Sampling benchmark parameters
-                        - `max_new_tokens = 128`;
-                        - `temperature = 2.0`;
-                        - `top_k = 50`;
-                        - `pad_to_multiple_of = 64` for Tensorflow XLA models. Others do not pad (input prompts between 2 and 33 tokens).
                         """
                     )
-                plot = gr.Image(value=plot_fn("T5 Small", "Yes"))  # Show plot when the gradio app is initialized
-            model_selector.change(fn=plot_fn, inputs=[model_selector, eager_enabler], outputs=plot)
-            eager_enabler.change(fn=plot_fn, inputs=[model_selector, eager_enabler], outputs=plot)
-        with gr.TabItem("Beam Search"):
-            plot_fn = functools.partial(get_plot, generate_type="Beam Search")
             with gr.Row():
-                with gr.Column():
-                    model_selector = gr.Dropdown(
-                        choices=["DistilGPT2", "GPT2", "OPT-1.3B", "GPTJ-6B", "T5 Small", "T5 Base", "T5 Large", "T5 3B"],
-                        value="T5 Small",
-                        label="Model",
-                        interactive=True,
                     )
-                    eager_enabler = gr.Radio(
-                        ["Yes", "No"],
-                        value="Yes",
-                        label="Plot TF Eager Execution?",
-                        interactive=True
                     )
                     gr.Markdown(
                         """
-                        ### Beam Search benchmark parameters
-                        - `max_new_tokens = 256`;
-                        - `num_beams = 16`;
-                        - `pad_to_multiple_of = 64` for Tensorflow XLA models. Others do not pad (input prompts between 2 and 33 tokens).
                         """
                     )
-                plot = gr.Image(value=plot_fn("T5 Small", "Yes"))  # Show plot when the gradio app is initialized
-            model_selector.change(fn=plot_fn, inputs=[model_selector, eager_enabler], outputs=plot)
-            eager_enabler.change(fn=plot_fn, inputs=[model_selector, eager_enabler], outputs=plot)
         with gr.TabItem("Benchmark Information"):
             gr.Dataframe(
                 headers=["Parameter", "Value"],
                 value=[
-                    ["Transformers Version", "4.21"],
-                    ["TensorFlow Version", "2.9.1"],
-                    ["Pytorch Version", "1.11.0"],
                     ["OS", "22.04 LTS (3090) / Debian 10 (other GPUs)"],
-                    ["CUDA", "11.6 (3090) / 11.3 (others GPUs)"],
-                    ["Number of Runs", "100 (the first run was discarded to ignore compilation time)"],
-                    ["Is there code to reproduce?", "Yes -- https://gist.github.com/gante/f0017e3f13ac11b0c02e4e4db351f52f"],
                 ],
             )

 matplotlib.use('Agg')
 import functools
 import gradio as gr
 import matplotlib.pyplot as plt
 import seaborn as sns
 import pandas as pd
 FIGURE_PATH = "plt.png"
 FIG_DPI = 300
+def get_plot(task, gpu, omit_offload):
+    # slice the dataframe according to the inputs
+    df = pd.read_csv("data.csv")
+    df = df[df["task"] == task]
+    df = df[df["gpu"] == gpu]
+    if omit_offload == "Yes":
+        df = df[df["offload"] == 0]
+    # combine model name and dtype
+    df["model and dtype"] = df['model_name'].str.cat(df[['dtype']], sep=', ')
+    # fuse the two columns to be compared (original and assisted generation)
+    df = df.melt(
+        id_vars=["task", "gpu", "model and dtype", "offload"],
+        value_vars=["Greedy", "Assisted"],
+        var_name="generation_type",
+        value_name="generation_time",
+    )
     g = sns.catplot(
         data=df,
         kind="bar",
+        x="model and dtype",
+        y="generation_time",
+        hue="generation_type",
+        palette={"Greedy": "blue", "Assisted": "orange"},
         alpha=.9,
     )
     g.despine(left=True)
+    g.set_axis_labels("Model size and dtype", "Latency (ms/token)")
+    g.set_xticklabels(fontsize=7)
+    g.set_yticklabels(fontsize=7)
+    g.legend.set_title("Generation Type")
+    plt.setp(g._legend.get_texts(), fontsize='7')  # for legend text
     # Add the number to the top of each bar
     ax = g.facet_axis(0, 0)
     for i in ax.containers:
+        ax.bar_label(i, fontsize=7)
     plt.savefig(FIGURE_PATH, dpi=FIG_DPI)
     return FIGURE_PATH
 demo = gr.Blocks()
 with demo:
     gr.Markdown(
         """
+        # Assisted Generation Benchmark
         """
     )
+    # components shared across tabs
+    omit_offload_fn = functools.partial(
+        gr.Radio, ["Yes", "No"], value="No", label="Omit cases with memory offload?", interactive=True
+    )
+    def gpu_selector_fn(gpu_list):
+        return gr.Dropdown(
+            gpu_list, value=gpu_list[-1], label="GPU", interactive=True
+        )
     with gr.Tabs():
+        with gr.TabItem("OPT: Open Text Generation"):
+            plot_fn = functools.partial(get_plot, "OPT: Open Text Generation")
             with gr.Row():
+                with gr.Column(scale=0.3):
+                    gpu_selector = gpu_selector_fn(["3090", "T4", "T4 *2", "A100 (80GB)"])
+                    omit_offload = omit_offload_fn()
                     gr.Markdown(
                         """
+                        ### Assistant Model
+                        - `facebook/opt-125m`
+                        ### Model Names:
+                        - 1.3B: `facebook/opt-1.3b`
+                        - 6.7B: `facebook/opt-6.7b`
+                        - 30B: `facebook/opt-30b`
+                        - 66B: `facebook/opt-66b`
+                        ### Dataset used as input prompt:
+                        - C4 (en, validation set)
                         """
                     )
+                # Show plot when the gradio app is initialized
+                plot = gr.Image(value=plot_fn("A100 (80GB)", "No"))
+            # Update plot when any of the inputs change
+            plot_inputs = [gpu_selector, omit_offload]
+            gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
+            omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
+        with gr.TabItem("OPT: Summarization"):
+            plot_fn = functools.partial(get_plot, "OPT: Summarization")
             with gr.Row():
+                with gr.Column(scale=0.3):
+                    gpu_selector = gpu_selector_fn(["3090", "T4", "T4 *2", "A100 (80GB)"])
+                    omit_offload = omit_offload_fn()
                     gr.Markdown(
                         """
+                        ### Assistant Model
+                        - `facebook/opt-125m`
+                        ### Model Names:
+                        - 1.3B: `facebook/opt-1.3b`
+                        - 6.7B: `facebook/opt-6.7b`
+                        - 30B: `facebook/opt-30b`
+                        - 66B: `facebook/opt-66b`
+                        ### Dataset used as input prompt:
+                        - CNN Dailymail (3.0.0, validation set)
                         """
                     )
+                # Show plot when the gradio app is initialized
+                plot = gr.Image(value=plot_fn("A100 (80GB)", "No"))
+            # Update plot when any of the inputs change
+            plot_inputs = [gpu_selector, omit_offload]
+            gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
+            omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
+        with gr.TabItem("Whisper: ARS"):
+            plot_fn = functools.partial(get_plot, "Whisper: ARS")
             with gr.Row():
+                with gr.Column(scale=0.3):
+                    gpu_selector = gpu_selector_fn(["3090", "T4"])
+                    omit_offload = omit_offload_fn()
+                    gr.Markdown(
+                        """
+                        ### Assistant Model
+                        - `openai/whisper-tiny`
+                        ### Model Names:
+                        - large-v2: `openai/whisper-large-v2`
+                        ### Dataset used as input prompt:
+                        - Librispeech ARS (clean, validation set)
+                        """
                     )
+                # Show plot when the gradio app is initialized
+                plot = gr.Image(value=plot_fn("T4", "No"))
+            # Update plot when any of the inputs change
+            plot_inputs = [gpu_selector, omit_offload]
+            gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
+            omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
+        with gr.TabItem("CodeGen: Code Generation"):
+            plot_fn = functools.partial(get_plot, "CodeGen: Code Generation")
+            with gr.Row():
+                with gr.Column(scale=0.3):
+                    gpu_selector = gpu_selector_fn(["3090", "T4", "T4 *2", "A100 (80GB)"])
+                    omit_offload = omit_offload_fn()
+                    gr.Markdown(
+                        """
+                        ### Assistant Model
+                        - `Salesforce/codegen-350M-mono`
+                        ### Model Names:
+                        - 2B: `Salesforce/codegen-2B-mono`
+                        - 6B: `Salesforce/codegen-6B-mono`
+                        - 16B: `Salesforce/codegen-16B-mono`
+                        ### Dataset used as input prompt:
+                        - The Stack (python)
+                        """
                     )
+                # Show plot when the gradio app is initialized
+                plot = gr.Image(value=plot_fn("A100 (80GB)", "No"))
+            # Update plot when any of the inputs change
+            plot_inputs = [gpu_selector, omit_offload]
+            gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
+            omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
+        with gr.TabItem("Flan-T5: Summarization"):
+            plot_fn = functools.partial(get_plot, "Flan-T5: Summarization")
+            with gr.Row():
+                with gr.Column(scale=0.3):
+                    gpu_selector = gpu_selector_fn(["3090", "T4", "T4 *2", "A100 (80GB)"])
+                    omit_offload = omit_offload_fn()
                     gr.Markdown(
                         """
+                        ### Assistant Model
+                        - `google/flan-t5-small`
+                        ### Model Names:
+                        - large: `google/flan-t5-large`
+                        - xl: `google/flan-t5-xl`
+                        - xxl: `google/flan-t5-xxl`
+                        - ul2: `google/flan-ul2`
+                        ### Dataset used as input prompt:
+                        - CNN Dailymail (3.0.0, validation set)
                         """
                     )
+                # Show plot when the gradio app is initialized
+                plot = gr.Image(value=plot_fn("A100 (80GB)", "No"))
+            # Update plot when any of the inputs change
+            plot_inputs = [gpu_selector, omit_offload]
+            gpu_selector.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
+            omit_offload.change(fn=plot_fn, inputs=plot_inputs, outputs=plot)
         with gr.TabItem("Benchmark Information"):
             gr.Dataframe(
                 headers=["Parameter", "Value"],
                 value=[
+                    ["Transformers Version", "4.29dev0"],
+                    ["Pytorch Version", "2.0.0"],
                     ["OS", "22.04 LTS (3090) / Debian 10 (other GPUs)"],
+                    ["CUDA", "11.8 (3090) / 11.3 (others GPUs)"],
+                    ["Number of input samples", "20-100 (depending on the model size)"],
+                    ["Is there code to reproduce?", "Yes -- https://github.com/gante/huggingface-demos/tree/main/experiments/faster_generation"],
                 ],
             )

data.csv ADDED Viewed

	@@ -0,0 +1,65 @@

+gpu,task,model_name,dtype,offload,Greedy,Assisted
+3090,OPT: Open Text Generation,1.3B,FP32,0,11.64,10.01
+3090,OPT: Open Text Generation,6.7B,FP32,1,428.47,114.99
+3090,OPT: Open Text Generation,6.7B,FP16,0,19.62,12.44
+3090,OPT: Open Text Generation,6.7B,INT8,0,104.43,40.33
+3090,OPT: Open Text Generation,30B,FP16,1,2616,1099
+3090,OPT: Summarization,1.3B,FP32,0,13.16,10.89
+3090,OPT: Summarization,6.7B,FP32,1,587.8,114.53
+3090,OPT: Summarization,6.7B,FP16,0,25.14,14.56
+3090,OPT: Summarization,30B,FP16,1,2732,331.2
+3090,Whisper: ARS,large-v2,FP32,0,24.81,12.55
+3090,CodeGen: Code Generation,2B,FP32,0,28.90,28.36
+3090,CodeGen: Code Generation,6B,FP32,1,544.11,110.42
+3090,CodeGen: Code Generation,6B,FP16,0,34.36,31.84
+3090,CodeGen: Code Generation,16B,FP16,1,808.69,161.50
+3090,CodeGen: Code Generation,16B,INT8,0,66.69,41.47
+3090,Flan-T5: Summarization,large,FP32,0,21.27,15.76
+3090,Flan-T5: Summarization,xl,FP32,0,25.60,18.94
+3090,Flan-T5: Summarization,xxl,FP32,1,1326.22,580.10
+3090,Flan-T5: Summarization,xxl,FP16,1,52.52,36.07
+3090,Flan-T5: Summarization,xxl,INT8,0,67.13,38.92
+3090,Flan-T5: Summarization,ul2,FP16,1,1185.25,480.11
+T4,OPT: Open Text Generation,1.3B,FP32,0,24.74,22.37
+T4,OPT: Open Text Generation,6.7B,FP32,1,2863.57,733.32
+T4,OPT: Open Text Generation,6.7B,FP16,0,62.04,29.67
+T4,OPT: Open Text Generation,6.7B,INT8,0,180.59,66.12
+T4,OPT: Summarization,1.3B,FP32,0,32.50,26.58
+T4,OPT: Summarization,6.7B,FP16,1,499.00,67.33
+T4,OPT: Summarization,6.7B,INT8,0,182.98,37.89
+T4,Whisper: ARS,large-v2,FP32,0,62.68,40.74
+T4,CodeGen: Code Generation,2B,FP32,0,73.88,67.62
+T4,CodeGen: Code Generation,6B,FP16,1,682.94,135.99
+T4,CodeGen: Code Generation,6B,INT8,0,117.91,72.40
+T4,Flan-T5: Summarization,large,FP32,0,43.67,36.26
+T4,Flan-T5: Summarization,xl,FP16,0,53.54,42.27
+T4,Flan-T5: Summarization,xxl,FP16,1,2814,1177
+T4 *2,OPT: Open Text Generation,6.7B,FP32,0,118.42,55.42
+T4 *2,OPT: Open Text Generation,6.7B,FP16,0,61.30,34.76
+T4 *2,OPT: Summarization,6.7B,FP32,1,1238.59,339.34
+T4 *2,OPT: Summarization,6.7B,FP16,0,94.62,34.37
+T4 *2,CodeGen: Code Generation,6B,FP16,0,116.34,72.09
+T4 *2,CodeGen: Code Generation,6B,INT8,0,119.14,79.01
+T4 *2,CodeGen: Code Generation,16B,FP16,1,1509.05,693.01
+T4 *2,CodeGen: Code Generation,16B,INT8,0,200.79,99.00
+T4 *2,Flan-T5: Summarization,xl,FP32,0,59.27,68.70
+T4 *2,Flan-T5: Summarization,xl,FP16,0,51.59,50.56
+T4 *2,Flan-T5: Summarization,xxl,FP16,1,797.7,534.3
+T4 *2,Flan-T5: Summarization,xxl,INT8,0,243.3,143.38
+A100 (80GB),OPT: Open Text Generation,6.7B,FP32,0,35.34,30.00
+A100 (80GB),OPT: Open Text Generation,30B,FP16,0,54.57,38.27
+A100 (80GB),OPT: Open Text Generation,30B,INT8,0,290.82,135.77
+A100 (80GB),OPT: Open Text Generation,66B,INT8,0,398.49,146.04
+A100 (80GB),OPT: Summarization,6.7B,FP32,0,43.64,27.03
+A100 (80GB),OPT: Summarization,30B,FP16,0,54.94,28.87
+A100 (80GB),OPT: Summarization,30B,INT8,0,291.57,49.42
+A100 (80GB),OPT: Summarization,66B,INT8,0,392.34,82.29
+A100 (80GB),CodeGen: Code Generation,16B,FP32,0,75.56,80.44
+A100 (80GB),CodeGen: Code Generation,16B,FP16,0,70.51,74.79
+A100 (80GB),CodeGen: Code Generation,16B,INT8,0,130.77,90.28
+A100 (80GB),Flan-T5: Summarization,ul2,FP32,0,87.40,59.26
+A100 (80GB),Flan-T5: Summarization,ul2,FP16,0,78.13,42.95
+A100 (80GB),Flan-T5: Summarization,ul2,INT8,0,187.66,81.72

plt.png ADDED Viewed