MergeLlama-7b

Paused

codys12 committed on Oct 17, 2023

Commit

99ab088

1 Parent(s): 4e4fd76

fixed?

Files changed (1) hide show

app.py CHANGED Viewed

@@ -7,7 +7,7 @@ from typing import Iterator
 import gradio as gr
 import spaces
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 DESCRIPTION = "# Mistral-7B"
@@ -21,8 +21,9 @@ MAX_INPUT_TOKEN_LENGTH = 4096
 if torch.cuda.is_available():
     model_id = "codys12/MergeLlama-7b"
     model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map=0, cache_dir="/data")
-    model.cuda()
     tokenizer = AutoTokenizer.from_pretrained(model_id)
 @spaces.GPU
@@ -50,20 +51,25 @@ def generate(
         input_ids = input_ids[-MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning("Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
-    input_ids = tokenizer(current_input, return_tensors="pt").to("cuda")
-    # Generate
-    output_ids = model.generate(input_ids=input_ids,
-                              max_new_tokens=max_new_tokens,
-                              do_sample=True,
-                              top_p=top_p,
-                              top_k=top_k,
-                              temperature=temperature,
-                              repetition_penalty=repetition_penalty)
-    # Stream output
-    for id in output_ids.tolist()[0]:
-     yield tokenizer.decode(id)
 chat_interface = gr.ChatInterface(

 import gradio as gr
 import spaces
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
 DESCRIPTION = "# Mistral-7B"
 if torch.cuda.is_available():
     model_id = "codys12/MergeLlama-7b"
     model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map=0, cache_dir="/data")
     tokenizer = AutoTokenizer.from_pretrained(model_id)
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "right"
 @spaces.GPU
         input_ids = input_ids[-MAX_INPUT_TOKEN_LENGTH:]
         gr.Warning("Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_special_tokens=True)
+    generate_kwargs = dict(
+        {"input_ids": input_ids},
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
 chat_interface = gr.ChatInterface(