import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig
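
# Log the runtime environment: the demo uses a GPU when available and
# falls back to CPU otherwise.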
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
print("Running on device:", torch_device)
print("CPU threads:", torch.get_num_threads())
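
# LoRA adapter on the Hugging Face Hub; its PEFT config records which
# base model the adapter was trained on.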
peft_model_id = "ldhldh/1.3_40kstep"

config = PeftConfig.from_pretrained(peft_model_id)
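
# Load the base model and tokenizer named in the adapter's config.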
base_model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
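
# Inference only: switch to eval mode and enable the KV cache.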
base_model.eval()
base_model.config.use_cache = True
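
# Attach the LoRA weights to the base model. device_map="auto" lets
# accelerate place the modules on the available device(s).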
model = PeftModel.from_pretrained(base_model, peft_model_id, device_map="auto")
model.eval()
model.config.use_cache = True
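
# Generate a completion for prompt `x` using the sampling settings
# chosen in the UI.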
def gen(x, top_p, top_k, temperature, max_new_tokens, repetition_penalty):
    # Tokenize the prompt and move it to the model's device so generation
    # works whether the adapter ended up on CPU or GPU.
    inputs = tokenizer(
        f"{x}",
        return_tensors='pt',
        return_token_type_ids=False
    ).to(model.device)

    with torch.no_grad():
        gened = model.generate(
            **inputs,
            max_new_tokens=int(max_new_tokens),
            min_new_tokens=5,
            # Penalize length from the midpoint of the token budget onward;
            # the start index must be an integer.
            exponential_decay_length_penalty=(int(max_new_tokens) // 2, 1.1),
            top_p=top_p,
            top_k=int(top_k),
            temperature=temperature,
            do_sample=True,
            eos_token_id=2,  # id 2 doubles as EOS and pad token here
            pad_token_id=2,
            repetition_penalty=repetition_penalty,
            no_repeat_ngram_size=2,
        )

    model_output = tokenizer.decode(gened[0])
    return model_output
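
# Clear the input textbox (defined for UI resets; not wired up below).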
def reset_textbox():
    return gr.update(value='')
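
# Build the Gradio UI: prompt input and output on the left, sampling
# controls on the right.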
with gr.Blocks() as demo:
    duplicate_link = "https://huggingface.co/spaces/beomi/KoRWKV-1.5B?duplicate=true"
    gr.Markdown(
        f"Duplicated from [beomi/KoRWKV-1.5B]({duplicate_link}); "
        "base model: EleutherAI/polyglot-ko-1.3b"
    )
    with gr.Row():
        with gr.Column(scale=4):
            user_text = gr.Textbox(
                # The placeholder shows the expected chat format; the Korean
                # line roughly means "friend: Shall we go on a trip? you:".
                placeholder='\\nfriend: 우리 여행 갈래? \\nyou:',
                label="User input"
            )
            model_output = gr.Textbox(label="Model output", lines=10, interactive=False)
            button_submit = gr.Button(value="Submit")
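
        # Right column: sliders controlling the sampling strategy.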
        with gr.Column(scale=1):
            max_new_tokens = gr.Slider(
                minimum=1, maximum=200, value=20, step=1, interactive=True, label="Max New Tokens",
            )
            top_p = gr.Slider(
                minimum=0.05, maximum=1.0, value=0.8, step=0.05, interactive=True, label="Top-p (nucleus sampling)",
            )
            top_k = gr.Slider(
                minimum=5, maximum=100, value=30, step=5, interactive=True, label="Top-k sampling",
            )
            temperature = gr.Slider(
                minimum=0.1, maximum=2.0, value=0.5, step=0.1, interactive=True, label="Temperature",
            )
            repetition_penalty = gr.Slider(
                minimum=1.0, maximum=3.0, value=1.2, step=0.1, interactive=True, label="Repetition penalty",
            )
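
    # Wire the submit button to gen(); the slider values are passed as
    # positional arguments in the order gen() expects.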
    button_submit.click(
        gen,
        [user_text, top_p, top_k, temperature, max_new_tokens, repetition_penalty],
        model_output,
    )

demo.queue(max_size=32).launch()