import re
import threading

import gradio as gr
import spaces  # Hugging Face Spaces SDK (imported early for ZeroGPU compatibility)
import transformers
from transformers import pipeline
# Load the model and tokenizer once; gr.NO_RELOAD skips this block on Gradio hot reloads.
model_name = "Qwen/Qwen2-1.5B-Instruct"

if gr.NO_RELOAD:
    pipe = pipeline(
        "text-generation",
        model=model_name,
        device_map="auto",
        torch_dtype="auto",
    )
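
# A rough sketch of how `pipe` is invoked below (not part of the app logic):
# when the last chat message has the "assistant" role, recent `transformers`
# versions continue that message instead of opening a new turn, which is what
# lets the reasoning prefixes steer the generation. Hypothetical call:
#   pipe(
#       [
#           {"role": "user", "content": "Why is the sky blue?"},
#           {"role": "assistant", "content": "OK, I need to figure out "},
#       ],
#       max_new_tokens=50,
#   )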
# Sentence prefixes that walk the model through its reasoning, step by step.
rethink_prepends = [
    "OK, I need to figure out ",
    "I think ",
    "Wait, I think ",
    "Let me check if ",
    "I should also remember that ",
    "Another thing to note is that ",
    "I also recall that ",
    "I think I have a good grasp ",
    "Now, using all the above information, I can answer the question using the original language used for the question:"
    "\n{question}\n"
    "\n**ANSWER**\n",
]
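
# Only the last prefix contains a placeholder; at generation time it is filled
# with the user's question, e.g. (hypothetical):
#   rethink_prepends[-1].format(question="Why is the sky blue?")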
# Delimiters Gradio should treat as math (works around some math-display problems).
latex_delimiters = [
    {"left": "$$", "right": "$$", "display": True},
    {"left": "$", "right": "$", "display": False},
]
def reformat_math(text):
    """Fix MathJax delimiters to use the Gradio (KaTeX) syntax.

    This is a workaround to display math formulas in Gradio. For now, I haven't
    found a way to make it work as expected using other latex_delimiters...
    """
    text = re.sub(r"\\\[\s*(.*?)\s*\\\]", r"$$\1$$", text, flags=re.DOTALL)
    text = re.sub(r"\\\(\s*(.*?)\s*\\\)", r"$\1$", text, flags=re.DOTALL)
    return text
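
# Hypothetical example of the rewrite performed above:
#   reformat_math(r"Euler: \[ e^{i\pi} + 1 = 0 \] and inline \( x^2 \)")
#   returns "Euler: $$e^{i\pi} + 1 = 0$$ and inline $x^2$"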
def user_input(message, history: list):
    """Append the user input to the history and clear the input textbox."""
    return "", history + [gr.ChatMessage(role="user", content=message)]
def rebuild_messages(history: list):
    """Rebuild the messages from the history for the model, without the intermediate thoughts."""
    messages = []
    for h in history:
        if isinstance(h, dict) and not h.get("metadata", {}).get("title", False):
            # Round-tripped messages without a "Thinking" title (user turns, final answers).
            messages.append(h)
        elif (
            isinstance(h, gr.ChatMessage)
            and h.metadata.get("title")
            and isinstance(h.content, str)
        ):
            # The freshly appended "Thinking" placeholder, kept as the turn to continue.
            messages.append({"role": h.role, "content": h.content})
    return messages
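
# Hypothetical shape of the result: plain dicts the pipeline can consume.
# Earlier "Thinking" turns come back from Gradio as dicts *with* a metadata
# title and are dropped; the freshly appended gr.ChatMessage placeholder is
# kept as the assistant turn to continue, e.g.:
#   [{"role": "user", "content": "Why is the sky blue?"},
#    {"role": "assistant", "content": ""}]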
def bot(history: list, max_num_tokens: int, final_num_tokens: int):
    """Make the model answer the question."""
    # Stream tokens as they are generated; the pipeline itself runs in a thread below.
    streamer = transformers.TextIteratorStreamer(
        pipe.tokenizer,  # pyright: ignore
        skip_special_tokens=True,
        skip_prompt=True,
    )
    # Keep the question around, to reinsert it in the reasoning if needed.
    question = history[-1]["content"]

    # Prepare the assistant message.
    history.append(
        gr.ChatMessage(
            role="assistant",
            content="",
            metadata={
                "title": "Thinking",
            },
        )
    )
    # For the moment, the reasoning is displayed in the chat.
    messages = rebuild_messages(history)
    for i, prepend in enumerate(rethink_prepends):
        if i > 0:
            messages[-1]["content"] += "\n\n"
        messages[-1]["content"] += prepend.format(question=question)
        num_tokens = int(
            max_num_tokens if "**ANSWER**" not in prepend else final_num_tokens
        )
        t = threading.Thread(
            target=pipe,
            args=(messages,),
            kwargs=dict(
                max_new_tokens=num_tokens,
                streamer=streamer,
            ),
        )
        t.start()
        # Rebuild the history with the new content (formatted, so the displayed
        # text does not show a literal "{question}" placeholder).
        history[-1].content += prepend.format(question=question)
        if "**ANSWER**" in prepend:
            # Stop thinking: this is the answer now (no metadata for intermediate steps).
            history.append(gr.ChatMessage(role="assistant", content=""))
        for token in streamer:
            history[-1].content += token
            history[-1].content = reformat_math(history[-1].content)
            # Feed the token back into the prompt as well, so the next
            # reasoning step can build on this one.
            messages[-1]["content"] += token
            yield history
        t.join()
    yield history
with gr.Blocks(fill_height=True, title="Making any model reason") as demo:
    with gr.Row(scale=1):
        with gr.Column(scale=5):
            gr.Markdown(f"""
            # Force reasoning for any model

            This is a simple proof of concept that makes any LLM "reason" ahead of its response.
            This interface uses the *{model_name}* model, which is **not** a reasoning model. The
            method simply forces some "reasoning" steps, using prefixes, to help the model enhance
            its answer.
            """)
            chatbot = gr.Chatbot(
                scale=1,
                type="messages",
                latex_delimiters=latex_delimiters,
            )
            msg = gr.Textbox(
                submit_btn=True,
                label="",
                show_label=False,
                placeholder="Type your question here.",
            )
        with gr.Column(scale=1):
            gr.Markdown("""## Tweaks""")
            num_tokens = gr.Slider(
                50,
                255,
                100,
                step=1,
                label="Max tokens per reasoning step",
                interactive=True,
            )
            final_num_tokens = gr.Slider(
                50,
                255,
                200,
                step=1,
                label="Max tokens for the final answer",
                interactive=True,
            )
gr.Markdown(""" | |
Using smaller number of tokens in the reasoning steps will make the model | |
faster to answer, but it may not be able to go deep enough in its reasoning. | |
A good value is 100. | |
Using smaller number of tokens for the final answer will make the model | |
to be less verbose, but it may not be able to give a complete answer. | |
A good value is 200 to 255. | |
""") | |
gr.Markdown(""" | |
This interface can work on personal computer with 6Go VRAM (e.g. NVidia 30NV). Feel free to fork | |
the application and try others instruct models. | |
""") | |
    # When the user submits a message, the bot answers.
    msg.submit(
        user_input,
        [msg, chatbot],  # inputs
        [msg, chatbot],  # outputs
    ).then(
        bot,
        [chatbot, num_tokens, final_num_tokens],  # actually, the "history" input
        chatbot,  # to store the new history from the output
    )
if __name__ == "__main__":
    demo.queue().launch()
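
# To run locally (assuming `gradio`, `transformers`, `torch`, and `spaces` are
# installed): save this file as e.g. `app.py`, run `python app.py`, and open
# the printed local URL in a browser.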