Commit 8afec35 (verified) by joaogante (HF staff)
Parent(s): 8a1e417

Update app.py

Files changed (1): app.py (+9, -9)
app.py CHANGED
@@ -8,11 +8,11 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
 
 
-model_id = "facebook/opt-6.7b"
-assistant_id = "facebook/opt-125m"
+model_id = "Qwen/Qwen2.5-32B-Instruct"
+assistant_id = "Qwen/Qwen2.5-0.5B-Instruct"
 
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
-assistant_model = AutoModelForCausalLM.from_pretrained(assistant_id).to(device=model.device, dtype=torch.bfloat16)
+model = AutoModelForCausalLM.from_pretrained(model_id, load_in_4bit=True, device_map="auto")
+assistant_model = AutoModelForCausalLM.from_pretrained(assistant_id).to(device=model.device, dtype=torch.float16)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 
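Note: the new line passes load_in_4bit=True straight to from_pretrained, which works but is the legacy spelling; the same effect can be expressed with an explicit BitsAndBytesConfig, which newer transformers releases prefer. A minimal sketch, not part of this commit; the compute dtype is an assumption:

    import torch
    from transformers import AutoModelForCausalLM, BitsAndBytesConfig

    # 4-bit bitsandbytes quantization; bnb_4bit_compute_dtype is assumed, not taken from app.py
    quant_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2.5-32B-Instruct",
        quantization_config=quant_config,
        device_map="auto",
    )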
@@ -49,9 +49,9 @@ def run_generation(user_text, use_assistant, temperature, max_new_tokens):
     model_output = ""
     for new_text in streamer:
         model_output += new_text
-        time_so_far = round(time.time() - start, 3)
+        time_so_far = time.time() - start
         tokens_so_far = tokenizer(model_output, return_tensors="pt").input_ids.shape[1]
-        yield [model_output, tokens_so_far/time_so_far]
+        yield [model_output, round(tokens_so_far/time_so_far, 2)]
 
 
 def reset_textbox():
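The changed lines keep the elapsed time raw and round only the displayed tokens-per-second value. For context, a sketch of the streaming pattern this loop presumably sits in; the background thread, streamer flags, and generation kwargs below are assumptions, not shown in this diff:

    import time
    from threading import Thread
    from transformers import TextIteratorStreamer

    def run_generation(user_text, use_assistant, temperature, max_new_tokens):
        inputs = tokenizer(user_text, return_tensors="pt").to(model.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        generation_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_new_tokens,
            do_sample=temperature > 0.0,
            temperature=temperature,
            assistant_model=assistant_model if use_assistant else None,
        )
        # generate() runs in a background thread; the streamer yields text as it is produced
        Thread(target=model.generate, kwargs=generation_kwargs).start()

        start = time.time()
        model_output = ""
        for new_text in streamer:
            model_output += new_text
            time_so_far = time.time() - start
            tokens_so_far = tokenizer(model_output, return_tensors="pt").input_ids.shape[1]
            # raw ratio, rounded to 2 decimals only for display (the change in this hunk)
            yield [model_output, round(tokens_so_far / time_so_far, 2)]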
@@ -61,8 +61,8 @@ def reset_textbox():
 with gr.Blocks() as demo:
     gr.Markdown(
         "# 🤗 Assisted Generation Demo\n"
-        f"- Model: {model_id} (INT8, ~7GB)\n"
-        f"- Assistant Model: {assistant_id} (FP16, ~0.3GB)\n"
+        f"- Model: {model_id} (4-bit quant, ~16GB)\n"
+        f"- Assistant Model: {assistant_id} (FP16, ~1GB)\n"
         "- Recipe for speedup: a) >10x model size difference in parameters; b) assistant trained similarly; c) CPU is not a bottleneck"
     )
 
@@ -84,7 +84,7 @@ with gr.Blocks() as demo:
         temperature = gr.Slider(
             minimum=0.0, maximum=2.0, value=0.6, step=0.05, interactive=True, label="Temperature (0.0 = Greedy)",
         )
-        gr.Markdown("### Tokens per secon")
+        gr.Markdown("### Tokens per second")
         tokens_per_second = gr.Textbox(lines=1, interactive=False, show_label=False)
 
     generate_inputs = [user_text, use_assistant, temperature, max_new_tokens]
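The corrected "Tokens per second" readout is a plain gr.Textbox that receives the second value yielded by run_generation. The event wiring itself is outside this diff; a sketch under assumed component names (button_submit and model_output_box are hypothetical):

    # Hypothetical wiring: generate_inputs come from the diff; the two output components
    # receive the streamed text and the tokens-per-second value, respectively.
    button_submit.click(run_generation, inputs=generate_inputs, outputs=[model_output_box, tokens_per_second])
    user_text.submit(run_generation, inputs=generate_inputs, outputs=[model_output_box, tokens_per_second])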
 