Add sampling and temperature parameters and increase num tokens
I think it's a good idea to allow tweaking these values.
Also, more tokens produce better results.
app.py CHANGED
@@ -76,7 +76,13 @@ def rebuild_messages(history: list):
 
 
 @spaces.GPU
-def bot(history: list, max_num_tokens: int, final_num_tokens: int):
+def bot(
+    history: list,
+    max_num_tokens: int,
+    final_num_tokens: int,
+    do_sample: bool,
+    temperature: float,
+):
     """Make the model answering the question"""
 
     # to get token as a stream, later in a thread
@@ -114,6 +120,8 @@ def bot(history: list, max_num_tokens: int, final_num_tokens: int):
         kwargs=dict(
             max_new_tokens=num_tokens,
             streamer=streamer,
+            do_sample=do_sample,
+            temperature=temperature,
         ),
     )
     t.start()
@@ -133,14 +141,14 @@ def bot(history: list, max_num_tokens: int, final_num_tokens: int):
         yield history
 
 
-with gr.Blocks(fill_height=True, title="Making any model reasoning") as demo:
+with gr.Blocks(fill_height=True, title="Making any LLM model reasoning") as demo:
     with gr.Row(scale=1):
         with gr.Column(scale=5):
             gr.Markdown(f"""
-# Force reasoning for any
+# Force reasoning for any LLM
 
-This is a simple proof-of-concept to get any LLM
-This interface uses *{model_name}* model which is
+This is a simple proof-of-concept to get any LLM (Large language Model) to reason ahead of its response.
+This interface uses *{model_name}* model **which is not a reasoning model**. The used method
 is only to force some "reasoning" steps with prefixes to help the model to enhance the answer.
 
 See my related article here: [Make any model reasoning](https://huggingface.co/blog/Metal3d/making-any-model-reasoning)
@@ -158,10 +166,10 @@ with gr.Blocks(fill_height=True, title="Making any model reasoning") as demo:
                 autofocus=True,
             )
         with gr.Column(scale=1):
-            gr.Markdown("""##
+            gr.Markdown("""## Tweaking""")
             num_tokens = gr.Slider(
                 50,
-
+                1024,
                 100,
                 step=1,
                 label="Max tokens per reasoning step",
@@ -169,20 +177,29 @@ with gr.Blocks(fill_height=True, title="Making any model reasoning") as demo:
             )
             final_num_tokens = gr.Slider(
                 50,
-
-
+                1024,
+                512,
                 step=1,
                 label="Max token for the final answer",
                 interactive=True,
             )
+            do_sample = gr.Checkbox(True, label="Do sample")
+            temperature = gr.Slider(0.1, 1.0, 0.7, step=0.1, label="Temperature")
             gr.Markdown("""
 Using smaller number of tokens in the reasoning steps will make the model
 faster to answer, but it may not be able to go deep enough in its reasoning.
-A good value is 100.
+A good value is 100 to 512.
 
 Using smaller number of tokens for the final answer will make the model
 to be less verbose, but it may not be able to give a complete answer.
-A good value is
+A good value is 512 to 1024.
+
+**Do sample** uses another strategie to select the next token to complete the
+answer. It's commonly better to leave it checked.
+
+**Temperature** indicates how much the model could be "creative". 0.7 is a common value.
+If you set a too high value (like 1.0) the model could be incoherent. With a low value
+(like 0.3), the model will produce very predictives answers.
             """)
             gr.Markdown("""
 This interface can work on personal computer with 6Go VRAM (e.g. NVidia 3050/3060 on laptop).
@@ -196,7 +213,13 @@ with gr.Blocks(fill_height=True, title="Making any model reasoning") as demo:
         [msg, chatbot], # outputs
     ).then(
        bot,
-        [
+        [
+            chatbot,
+            num_tokens,
+            final_num_tokens,
+            do_sample,
+            temperature,
+        ], # actually, the "history" input
         chatbot, # to store the new history from the output
     )
 
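For context, the sketch below shows how the new `do_sample` and `temperature` values typically behave once they reach `model.generate()` in a threaded, streaming setup like the one this diff touches. It is a hedged illustration, not the Space's actual code: the model name, the prompt handling, and the `stream_answer` helper are assumptions; only the `kwargs=dict(..., do_sample=..., temperature=...)` part mirrors the change above.

```python
# Hedged sketch: how do_sample/temperature reach model.generate() in a
# threaded, streaming setup. model_name, stream_answer and the prompt
# handling are illustrative assumptions, not the Space's exact code.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_name = "Qwen/Qwen2.5-0.5B-Instruct"  # assumption: any causal LM works here
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


def stream_answer(prompt: str, num_tokens: int, do_sample: bool, temperature: float):
    """Yield the answer piece by piece, like the bot() generator in the diff."""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    t = Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            max_new_tokens=num_tokens,
            streamer=streamer,
            do_sample=do_sample,      # False -> greedy decoding, temperature is ignored
            temperature=temperature,  # only used when do_sample=True
        ),
    )
    t.start()
    for chunk in streamer:  # decoded text chunks arrive as they are generated
        yield chunk
    t.join()


# Example: sampled answer at temperature 0.7, capped at 100 new tokens
for piece in stream_answer("Why is the sky blue?", 100, True, 0.7):
    print(piece, end="", flush=True)
```

With `do_sample=False`, decoding is greedy and deterministic, so `temperature` has no effect; with sampling enabled, a higher temperature flattens the next-token distribution and makes answers more varied, which matches the guidance added to the UI text.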