Spaces:

Nymbo
/

Serverless-TextGen-Hub

Running

App Files Files Community

Nymbo committed on Jan 4

Commit

69b4a5f

verified ·

1 Parent(s): 880ced6

Update app.py

Browse files

Files changed (1) hide show

app.py +132 -202

app.py CHANGED Viewed

@@ -16,9 +16,9 @@ print("OpenAI client initialized.")
 def respond(
     message,
     history: list[tuple[str, str]],
-    system_message,
-    custom_model,
     model,
     max_tokens,
     temperature,
     top_p,
@@ -26,66 +26,43 @@ def respond(
     seed
 ):
     """
-    This function handles the chatbot response. It takes in:
-    - message: the user's new message
-    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
-    - system_message: the system prompt
-    - custom_model: custom model path (if any)
-    - model: selected model from featured models
-    - max_tokens: the maximum number of tokens to generate in the response
-    - temperature: sampling temperature
-    - top_p: top-p (nucleus) sampling
-    - frequency_penalty: penalize repeated tokens in the output
-    - seed: a fixed seed for reproducibility; -1 will mean 'random'
     """
     print(f"Received message: {message}")
     print(f"History: {history}")
-    print(f"System message: {system_message}")
     print(f"Custom model: {custom_model}")
-    print(f"Selected model: {model}")
-    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
-    # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None
-    # Construct the messages array required by the API
     messages = [{"role": "system", "content": system_message}]
-    # Add conversation history to the context
     for val in history:
         user_part = val[0]
         assistant_part = val[1]
         if user_part:
             messages.append({"role": "user", "content": user_part})
-            print(f"Added user message to context: {user_part}")
         if assistant_part:
             messages.append({"role": "assistant", "content": assistant_part})
-            print(f"Added assistant message to context: {assistant_part}")
-    # Append the latest user message
     messages.append({"role": "user", "content": message})
-    # Start with an empty string to build the response as tokens stream in
     response = ""
-    print("Sending request to OpenAI API.")
-    # Determine which model to use
-    if custom_model.strip():
-        selected_model = custom_model.strip()
-    else:
-        # Map the display names to actual model paths
-        model_mapping = {
-            "Llama 2 70B": "meta-llama/Llama-2-70b-chat-hf",
-            "Mixtral 8x7B": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "Zephyr 7B": "HuggingFaceH4/zephyr-7b-beta",
-            "OpenChat 3.5": "openchat/openchat-3.5-0106",
-        }
-        selected_model = model_mapping.get(model, "meta-llama/Llama-2-70b-chat-hf")
-    # Make the streaming request to the HF Inference API via openai-like client
     for message_chunk in client.chat.completions.create(
         model=selected_model,
         max_tokens=max_tokens,
@@ -96,7 +73,6 @@ def respond(
         seed=seed,
         messages=messages,
     ):
-        # Extract the token text from the response chunk
         token_text = message_chunk.choices[0].delta.content
         print(f"Received token: {token_text}")
         response += token_text
@@ -104,181 +80,135 @@ def respond(
     print("Completed response generation.")
-# Create a Chatbot component with a specified height
 chatbot = gr.Chatbot(height=600)
 print("Chatbot interface created.")
 # Create the Gradio interface with tabs
 with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
-    with gr.Row():
-        with gr.Column():
-            # Basic Settings Tab
-            with gr.Tab("Settings"):
-                # System Message
-                system_message = gr.Textbox(
-                    value="",
-                    label="System message",
-                    placeholder="Enter a system message to guide the model's behavior"
-                )
-                # Model Selection Section
                 with gr.Accordion("Featured Models", open=True):
-                    # Model Search
                     model_search = gr.Textbox(
                         label="Filter Models",
-                        placeholder="Search for a featured model...",
                         lines=1
                     )
-                    # Featured Models List
-                    models_list = [
-                        "Llama 2 70B",
-                        "Mixtral 8x7B",
-                        "Zephyr 7B",
-                        "OpenChat 3.5"
-                    ]
                     model = gr.Radio(
                         label="Select a model",
                         choices=models_list,
-                        value="Llama 2 70B"
-                    )
-                    # Custom Model Input
-                    custom_model = gr.Textbox(
-                        label="Custom Model",
-                        info="Hugging Face model path (optional)",
-                        placeholder="meta-llama/Llama-2-70b-chat-hf"
-                    )
-                    # Function to filter models
-                    def filter_models(search_term):
-                        filtered_models = [m for m in models_list if search_term.lower() in m.lower()]
-                        return gr.update(choices=filtered_models)
-                    # Update model list when search box is used
-                    model_search.change(filter_models, inputs=model_search, outputs=model)
-                # Generation Parameters
-                with gr.Row():
-                    max_tokens = gr.Slider(
-                        minimum=1,
-                        maximum=4096,
-                        value=512,
-                        step=1,
-                        label="Max new tokens"
-                    )
-                    temperature = gr.Slider(
-                        minimum=0.1,
-                        maximum=4.0,
-                        value=0.7,
-                        step=0.1,
-                        label="Temperature"
-                    )
-                with gr.Row():
-                    top_p = gr.Slider(
-                        minimum=0.1,
-                        maximum=1.0,
-                        value=0.95,
-                        step=0.05,
-                        label="Top-P"
-                    )
-                    frequency_penalty = gr.Slider(
-                        minimum=-2.0,
-                        maximum=2.0,
-                        value=0.0,
-                        step=0.1,
-                        label="Frequency Penalty"
-                    )
-                with gr.Row():
-                    seed = gr.Slider(
-                        minimum=-1,
-                        maximum=65535,
-                        value=-1,
-                        step=1,
-                        label="Seed (-1 for random)"
-                    )
-            # Information Tab
-            with gr.Tab("Information"):
-                # Featured Models Table
-                with gr.Accordion("Featured Models", open=True):
-                    gr.HTML(
-                        """
-                        <p><a href="https://huggingface.co/models?inference=warm&pipeline_tag=text-to-text">See all available models</a></p>
-                        <table style="width:100%; text-align:center; margin:auto;">
-                            <tr>
-                                <th>Model Name</th>
-                                <th>Size</th>
-                                <th>Notes</th>
-                            </tr>
-                            <tr>
-                                <td>Llama 2 70B</td>
-                                <td>70B</td>
-                                <td>Meta's flagship model</td>
-                            </tr>
-                            <tr>
-                                <td>Mixtral 8x7B</td>
-                                <td>47B</td>
-                                <td>Mistral AI's MoE model</td>
-                            </tr>
-                            <tr>
-                                <td>Zephyr 7B</td>
-                                <td>7B</td>
-                                <td>Efficient fine-tuned model</td>
-                            </tr>
-                            <tr>
-                                <td>OpenChat 3.5</td>
-                                <td>7B</td>
-                                <td>High performance chat model</td>
-                            </tr>
-                        </table>
-                        """
                     )
-                # Parameters Overview
-                with gr.Accordion("Parameters Overview", open=False):
-                    gr.Markdown(
-                        """
-                        ## System Message
-                        A message that sets the context and behavior for the model. This helps guide the model's responses.
-                        ## Max New Tokens
-                        Controls the maximum length of the generated response. Higher values allow for longer outputs but may take more time.
-                        ## Temperature
-                        Controls randomness in the output:
-                        - Lower values (0.1-0.5): More focused and deterministic
-                        - Higher values (0.7-1.0): More creative and diverse
-                        - Very high values (>1.0): More random and potentially chaotic
-                        ## Top-P (Nucleus Sampling)
-                        Controls the cumulative probability threshold for token selection:
-                        - Lower values: More focused on highly likely tokens
-                        - Higher values: Considers a wider range of possibilities
-                        ## Frequency Penalty
-                        Adjusts the likelihood of token repetition:
-                        - Negative values: May encourage repetition
-                        - Zero: Neutral
-                        - Positive values: Discourages repetition
-                        ## Seed
-                        A number that controls the randomness in generation:
-                        - -1: Random seed each time
-                        - Fixed value: Reproducible outputs with same parameters
-                        """
-                    )
-    # Set up the chat interface
-    chatbot = gr.Chatbot(height=600)
-    msg = gr.Textbox(label="Message")
-    clear = gr.ClearButton([msg, chatbot])
-    msg.submit(respond, [msg, chatbot, system_message, custom_model, model, max_tokens, temperature, top_p, frequency_penalty, seed], [chatbot, msg])
-print("Launching the demo application.")
-demo.launch(show_api=False, share=False)

 def respond(
     message,
     history: list[tuple[str, str]],
     model,
+    custom_model,
+    system_message,
     max_tokens,
     temperature,
     top_p,
     seed
 ):
     """
+    This function handles the chatbot response.
     """
     print(f"Received message: {message}")
     print(f"History: {history}")
+    print(f"Model: {model}")
     print(f"Custom model: {custom_model}")
+    print(f"System message: {system_message}")
+    print(f"Parameters - Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
+    # Convert seed to None if -1
     if seed == -1:
         seed = None
+    # Set the model based on selection or custom input
+    selected_model = custom_model.strip() if custom_model.strip() != "" else model
+    # Construct messages array
     messages = [{"role": "system", "content": system_message}]
+    # Add conversation history
     for val in history:
         user_part = val[0]
         assistant_part = val[1]
         if user_part:
             messages.append({"role": "user", "content": user_part})
         if assistant_part:
             messages.append({"role": "assistant", "content": assistant_part})
+    # Append latest message
     messages.append({"role": "user", "content": message})
+    # Start with empty response
     response = ""
+    print("Sending request to API.")
+    # Make the streaming request
     for message_chunk in client.chat.completions.create(
         model=selected_model,
         max_tokens=max_tokens,
         seed=seed,
         messages=messages,
     ):
         token_text = message_chunk.choices[0].delta.content
         print(f"Received token: {token_text}")
         response += token_text
     print("Completed response generation.")
+# Create Chatbot component
 chatbot = gr.Chatbot(height=600)
 print("Chatbot interface created.")
+# Define available models
+models_list = [
+    "meta-llama/Llama-2-70b-chat-hf",
+    "meta-llama/Llama-2-13b-chat-hf",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "mistralai/Mistral-7B-Instruct-v0.2",
+    "HuggingFaceH4/zephyr-7b-beta",
+]
 # Create the Gradio interface with tabs
 with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
+    with gr.Tab("Chat"):
+        with gr.Row():
+            with gr.Column():
+                # Model selection accordion
                 with gr.Accordion("Featured Models", open=True):
                     model_search = gr.Textbox(
                         label="Filter Models",
+                        placeholder="Search for a model...",
                         lines=1
                     )
                     model = gr.Radio(
                         label="Select a model",
                         choices=models_list,
+                        value="meta-llama/Llama-2-70b-chat-hf"
                     )
+                # Custom model input
+                custom_model = gr.Textbox(
+                    label="Custom Model",
+                    info="Enter Hugging Face model path (optional)",
+                    placeholder="organization/model-name"
+                )
+                # System message and parameters
+                system_message = gr.Textbox(label="System message")
+                max_tokens = gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens")
+                temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
+                top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
+                frequency_penalty = gr.Slider(minimum=-2.0, maximum=2.0, value=0.0, step=0.1, label="Frequency Penalty")
+                seed = gr.Slider(minimum=-1, maximum=65535, value=-1, step=1, label="Seed (-1 for random)")
+    with gr.Tab("Information"):
+        with gr.Accordion("Featured Models", open=False):
+            gr.HTML("""
+            <p><a href="https://huggingface.co/models?pipeline_tag=text-generation&sort=trending">See all available models</a></p>
+            <table style="width:100%; text-align:center; margin:auto;">
+                <tr>
+                    <th>Model Name</th>
+                    <th>Parameters</th>
+                    <th>Notes</th>
+                </tr>
+                <tr>
+                    <td>Llama-2-70b-chat</td>
+                    <td>70B</td>
+                    <td>Meta's largest chat model</td>
+                </tr>
+                <tr>
+                    <td>Mixtral-8x7B</td>
+                    <td>47B</td>
+                    <td>Mixture of Experts architecture</td>
+                </tr>
+                <tr>
+                    <td>Mistral-7B</td>
+                    <td>7B</td>
+                    <td>Efficient base model</td>
+                </tr>
+            </table>
+            """)
+        with gr.Accordion("Parameters Overview", open=False):
+            gr.Markdown("""
+            ## System Message
+            The system message sets the context and behavior for the AI assistant. It's like giving it a role or specific instructions.
+            ## Max New Tokens
+            Controls the maximum length of the generated response. Higher values allow for longer responses but take more time.
+            ## Temperature
+            Controls randomness in the response:
+            - Lower (0.1-0.5): More focused and deterministic
+            - Higher (0.7-1.0): More creative and varied
+            ## Top-P
+            Nucleus sampling parameter:
+            - Lower values: More focused on likely tokens
+            - Higher values: More diverse vocabulary usage
+            ## Frequency Penalty
+            Discourages repetition:
+            - Negative: May allow more repetition
+            - Positive: Encourages more diverse word choice
+            ## Seed
+            Controls randomness initialization:
+            - -1: Random seed each time
+            - Fixed value: Reproducible outputs
+            """)
+    # Function to filter models based on search
+    def filter_models(search_term):
+        filtered_models = [m for m in models_list if search_term.lower() in m.lower()]
+        return gr.update(choices=filtered_models)
+    # Connect the search box to the model filter function
+    model_search.change(filter_models, inputs=model_search, outputs=model)
+    # Create the chat interface
+    chat_interface = gr.ChatInterface(
+        respond,
+        additional_inputs=[
+            model,
+            custom_model,
+            system_message,
+            max_tokens,
+            temperature,
+            top_p,
+            frequency_penalty,
+            seed,
+        ],
+        chatbot=chatbot,
+    )
+print("Gradio interface initialized.")
+if __name__ == "__main__":
+    print("Launching the demo application.")
+    demo.launch(show_api=False, share=False)