Spaces:

MawaredHR
/

mawared-Cohere

Sleeping

App Files Files Community

Daemontatox committed on Jan 5

Commit

b70c257

verified ·

1 Parent(s): 99f29ff

Update app.py

Browse files

Files changed (1) hide show

app.py +285 -129

app.py CHANGED Viewed

@@ -1,23 +1,30 @@
 import os
 import time
-import spaces
 import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
 import gradio as gr
 from threading import Thread
-MODEL_LIST = ["CohereForAI/aya-expanse-32b"]
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
-MODEL = "CohereForAI/aya-expanse-32b"
-TITLE = "<h1><center>Mawred T2 Wip </center></h1>"
-PLACEHOLDER = """
-<center>
-<p>Hi! How can I help you today?</p>
-</center>
-"""
 CSS = """
 .duplicate-button {
@@ -29,150 +36,299 @@ CSS = """
 h3 {
     text-align: center;
 }
 """
-device = "cuda" # for GPU usage or "cpu" for CPU usage
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_use_double_quant=True,
-    bnb_4bit_quant_type= "nf4")
-tokenizer = AutoTokenizer.from_pretrained(MODEL)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config
-)
-@spaces.GPU(660)
-def stream_chat(
-    message: str,
     history: list,
     system_prompt: str,
-    temperature: float = 0.8,
-    max_new_tokens: int = 1024,
-    top_p: float = 1.0,
-    top_k: int = 20,
     penalty: float = 1.2,
 ):
-    print(f'message: {message}')
-    print(f'history: {history}')
     conversation = [
         {"role": "system", "content": system_prompt}
     ]
     for prompt, answer in history:
         conversation.extend([
-            {"role": "user", "content": prompt},
-            {"role": "assistant", "content": answer},
         ])
     conversation.append({"role": "user", "content": message})
-    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
-    streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
     generate_kwargs = dict(
-        input_ids=input_ids,
-        max_new_tokens = max_new_tokens,
-        do_sample = False if temperature == 0 else True,
-        top_p = top_p,
-        top_k = top_k,
-        temperature = temperature,
         repetition_penalty=penalty,
-        eos_token_id=255001,
         streamer=streamer,
     )
     with torch.no_grad():
         thread = Thread(target=model.generate, kwargs=generate_kwargs)
         thread.start()
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        yield buffer
-chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
-with gr.Blocks(css=CSS, theme="soft") as demo:
-    gr.HTML(TITLE)
-    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
-    gr.ChatInterface(
-        fn=stream_chat,
-        chatbot=chatbot,
-        fill_height=True,
-        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
-        additional_inputs=[
-            gr.Textbox(
-                value="""
-                You are a helpful assistant.
-                """,
-                label="System Prompt",
-                lines=5,
-                render=False,
-            ),
-            gr.Slider(
-                minimum=0,
-                maximum=1,
-                step=0.1,
-                value=0.8,
-                label="Temperature",
-                render=False,
-            ),
-            gr.Slider(
-                minimum=128,
-                maximum=8192,
-                step=1,
-                value=1024,
-                label="Max new tokens",
-                render=False,
-            ),
-            gr.Slider(
-                minimum=0.0,
-                maximum=1.0,
-                step=0.1,
-                value=1.0,
-                label="top_p",
-                render=False,
-            ),
-            gr.Slider(
-                minimum=1,
-                maximum=20,
-                step=1,
-                value=20,
-                label="top_k",
-                render=False,
-            ),
-            gr.Slider(
-                minimum=0.0,
-                maximum=2.0,
-                step=0.1,
-                value=1.2,
-                label="Repetition penalty",
-                render=False,
-            ),
-        ],
-        examples=[
-        ["Translate 'artificial intelligence' to Arabic."],
-        ["How do you say 'photosynthesis' in Arabic?"],
-        ["Translate 'main causes of climate change' into Arabic."],
-        ["What is the Arabic translation for 'protein synthesis'?"],
-        ["Translate 'key features of a democratic government' to Arabic."],
-        ["How do you translate 'theory of relativity' into Arabic?"],
-        ["What is the Arabic equivalent of 'vaccines prevent diseases'?"],
-        ["Translate 'major events of World War II' to Arabic."],
-        ["How do you say 'structure of a human cell' in Arabic?"],
-        ["Translate 'role of DNA in genetics' into Arabic."]
-    ],
-        cache_examples=False,
-    )
 if __name__ == "__main__":
     demo.launch()

+import os
+import subprocess
+# Best-effort install of flash-attn at start-up; initialize_model() requests
+# attn_implementation="flash_attention_2". The exit status is intentionally
+# not checked, so a failed install does not stop the app from starting.
+# NOTE(review): FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells the flash-attn
+# build to skip compiling its CUDA kernels - confirm against flash-attn's setup.
+subprocess.run(
+    'pip install flash-attn --no-build-isolation',
+    # `env` replaces the child's environment instead of extending it, so the
+    # current environment is merged in; otherwise pip would run without PATH,
+    # HOME and any proxy/index settings.
+    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
+    shell=True
+)
 import os
+import re
 import time
 import torch
+import spaces
 import gradio as gr
 from threading import Thread
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    TextIteratorStreamer
+)
+# Configuration Constants
+# Model identifier passed to both the tokenizer and the model loader in
+# initialize_model().
+MODEL_ID = "CohereForAI/aya-expanse-32b"
+# Initial value of the "System Prompt" text area created in main().
+DEFAULT_SYSTEM_PROMPT = """You are a highly intelligent  assistant."""
+# UI Configuration
+# HTML heading rendered at the top of the page through gr.HTML in main().
+TITLE = "<h1><center>AI Reasoning Assistant</center></h1>"
+# Placeholder text of the "Your message" input created in main().
+PLACEHOLDER = "Ask me anything! I'll think through it step by step."
 CSS = """
 .duplicate-button {
 h3 {
     text-align: center;
 }
+.message-wrap {
+    overflow-x: auto;
+}
+.message-wrap p {
+    margin-bottom: 1em;
+}
+.message-wrap pre {
+    background-color: #f6f8fa;
+    border-radius: 3px;
+    padding: 16px;
+    overflow-x: auto;
+}
+.message-wrap code {
+    background-color: rgba(175,184,193,0.2);
+    border-radius: 3px;
+    padding: 0.2em 0.4em;
+    font-family: monospace;
+}
+.custom-tag {
+    color: #0066cc;
+    font-weight: bold;
+}
+.chat-area {
+    height: 500px !important;
+    overflow-y: auto !important;
+}
 """
+def initialize_model():
+    """Load the 4-bit quantized model and its tokenizer for MODEL_ID.
+
+    Returns:
+        A ``(model, tokenizer)`` tuple.
+    """
+    # 4-bit loading with double quantization; matmuls are computed in bfloat16.
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_compute_dtype=torch.bfloat16,
+        bnb_4bit_use_double_quant=True
+    )
+    tok = AutoTokenizer.from_pretrained(MODEL_ID)
+    # Fall back to the EOS id for padding when the tokenizer defines no pad id.
+    if tok.pad_token_id is None:
+        tok.pad_token_id = tok.eos_token_id
+    # NOTE(review): weights are requested as float16 while the quantization
+    # compute dtype above is bfloat16 - confirm the mismatch is intended.
+    load_options = dict(
+        torch_dtype=torch.float16,
+        device_map="cuda",
+        attn_implementation="flash_attention_2",
+        quantization_config=bnb_config,
+    )
+    llm = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_options)
+    return llm, tok
+# Any opening or closing reasoning tag recognised by format_text(). Compiled
+# once because format_text() runs on the whole buffer for every streamed chunk.
+_REASONING_TAG_RE = re.compile(r'</?(?:Thinking|Critique|Revising|Final)>')
+def format_text(text):
+    """Put every reasoning tag on a line of its own and drop blank lines.
+
+    The tags (<Thinking>, <Critique>, <Revising>, <Final> and their closing
+    forms) stay visible in the result; they are only surrounded by newlines.
+    Lines that are empty or contain only whitespace are then removed, while
+    the content of the remaining lines is left untouched.
+
+    Args:
+        text: Raw model output.
+
+    Returns:
+        The reformatted text.
+    """
+    # Single pass over the text instead of one regex pass per tag; the tags
+    # cannot overlap, so the result matches substituting them one at a time.
+    spaced = _REASONING_TAG_RE.sub(lambda m: f'\n{m.group(0)}\n', text)
+    return '\n'.join(line for line in spaced.split('\n') if line.strip())
+def format_chat_history(history):
+    """Render the conversation as a plain-text transcript, tags left visible.
+
+    Each turn contributes a "User: ..." entry and, when the assistant part is
+    non-empty, an "Assistant: ..." entry; entries are separated by a blank line.
+    """
+    def _entries(turn):
+        question, reply = turn
+        yield f"User: {question}"
+        # A turn still being generated has an empty reply and shows no
+        # assistant entry.
+        if assistant_text := reply:
+            yield f"Assistant: {assistant_text}"
+    return "\n\n".join(entry for turn in history for entry in _entries(turn))
+def create_examples():
+    """Return a fresh list of the sample questions offered in the UI."""
+    sample_questions = (
+        "Explain the concept of artificial intelligence.",
+        "How does photosynthesis work?",
+        "What are the main causes of climate change?",
+        "Describe the process of protein synthesis.",
+        "What are the key features of a democratic government?",
+        "Explain the theory of relativity.",
+        "How do vaccines work to prevent diseases?",
+        "What are the major events of World War II?",
+        "Describe the structure of a human cell.",
+        "What is the role of DNA in genetics?",
+    )
+    return list(sample_questions)
+@spaces.GPU(duration=660)
+def chat_response(
+    message: str,
     history: list,
+    chat_display: str,
     system_prompt: str,
+    temperature: float = 1.0,
+    max_new_tokens: int = 4000,
+    top_p: float = 0.8,
+    top_k: int = 40,
     penalty: float = 1.2,
 ):
+    """Stream the assistant's reply to ``message``, keeping tags visible.
+
+    Builds a system/user/assistant conversation from ``history``, runs
+    ``model.generate`` on a background thread and yields once per chunk the
+    streamer produces. Relies on the module-level ``model`` and ``tokenizer``
+    that main() sets.
+
+    Args:
+        message: The new user message.
+        history: Earlier turns as ``[user, assistant]`` pairs.
+        chat_display: Current transcript text. Yielded back unchanged when
+            ``message`` is blank; otherwise rebuilt from the history.
+        system_prompt: Content of the leading system message.
+        temperature: Sampling temperature; 0 (or less) selects greedy decoding.
+        max_new_tokens: Upper bound on generated tokens.
+        top_p: Nucleus-sampling value, forwarded only when sampling.
+        top_k: Top-k value, forwarded only when sampling.
+        penalty: Repetition penalty.
+
+    Yields:
+        ``(history, chat_display)`` tuples: the history with the new turn
+        appended (its reply-so-far passed through format_text) and the
+        transcript produced by format_chat_history.
+    """
+    # Nothing to answer: leave state and transcript as they are instead of
+    # sending an empty user turn to the model.
+    if not message or not message.strip():
+        yield history, chat_display
+        return
     conversation = [
         {"role": "system", "content": system_prompt}
     ]
     for prompt, answer in history:
         conversation.extend([
+            {"role": "user", "content": prompt},
+            {"role": "assistant", "content": answer}
         ])
     conversation.append({"role": "user", "content": message})
+    input_ids = tokenizer.apply_chat_template(
+        conversation,
+        add_generation_prompt=True,
+        return_tensors="pt"
+    ).to(model.device)
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        timeout=60.0,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
+    # The temperature slider goes down to 0, which means greedy decoding.
+    do_sample = temperature > 0
     generate_kwargs = dict(
+        input_ids=input_ids,
+        max_new_tokens=max_new_tokens,
+        do_sample=do_sample,
         repetition_penalty=penalty,
         streamer=streamer,
     )
+    if do_sample:
+        # Sampling-only settings are passed only when they apply, so a greedy
+        # run is not handed temperature=0 together with top_p/top_k.
+        generate_kwargs.update(
+            top_p=top_p,
+            top_k=top_k,
+            temperature=temperature,
+        )
+    buffer = ""
     with torch.no_grad():
         thread = Thread(target=model.generate, kwargs=generate_kwargs)
         thread.start()
+        # New list so the caller's history object is not extended in place.
+        history = history + [[message, ""]]
+        for new_text in streamer:
+            buffer += new_text
+            formatted_buffer = format_text(buffer)
+            history[-1][1] = formatted_buffer
+            chat_display = format_chat_history(history)
+            yield history, chat_display
+def process_example(example: str) -> tuple:
+    """Return an empty history plus a transcript showing only the example."""
+    fresh_history = []
+    transcript = f"User: {example}\n\n"
+    return fresh_history, transcript
+def main():
+    """Load the model, build the Gradio Blocks UI and wire its events.
+
+    Stores the loaded model and tokenizer in the module-level ``model`` and
+    ``tokenizer`` names, which chat_response() reads. The interface is
+    returned without being launched.
+
+    Returns:
+        The constructed ``gr.Blocks`` instance.
+    """
+    # chat_response() looks these up as module globals.
+    global model, tokenizer
+    model, tokenizer = initialize_model()
+    with gr.Blocks(css=CSS, theme="soft") as demo:
+        gr.HTML(TITLE)
+        gr.DuplicateButton(
+            value="Duplicate Space for private use",
+            elem_classes="duplicate-button"
+        )
+        with gr.Row():
+            with gr.Column():
+                # Conversation state: chat_response() stores [user, assistant]
+                # pairs here.
+                chat_history = gr.State([])
+                # Read-only transcript, filled with format_chat_history() output.
+                chat_display = gr.TextArea(
+                    value="",
+                    label="Chat History",
+                    interactive=False,
+                    elem_classes=["chat-area"],
+                )
+                message = gr.TextArea(
+                    placeholder=PLACEHOLDER,
+                    label="Your message",
+                    lines=3
+                )
+                with gr.Row():
+                    submit = gr.Button("Send")
+                    clear = gr.Button("Clear")
+                # Generation settings; each control feeds the chat_response()
+                # parameter in the matching position of the inputs lists below.
+                with gr.Accordion("⚙️ Advanced Settings", open=False):
+                    system_prompt = gr.TextArea(
+                        value=DEFAULT_SYSTEM_PROMPT,
+                        label="System Prompt",
+                        lines=5,
+                    )
+                    temperature = gr.Slider(
+                        minimum=0,
+                        maximum=1,
+                        step=0.1,
+                        value=0.2,
+                        label="Temperature",
+                    )
+                    max_tokens = gr.Slider(
+                        minimum=128,
+                        maximum=32000,
+                        step=128,
+                        value=4000,
+                        label="Max Tokens",
+                    )
+                    top_p = gr.Slider(
+                        minimum=0.1,
+                        maximum=1.0,
+                        step=0.1,
+                        value=0.8,
+                        label="Top-p",
+                    )
+                    top_k = gr.Slider(
+                        minimum=1,
+                        maximum=100,
+                        step=1,
+                        value=40,
+                        label="Top-k",
+                    )
+                    penalty = gr.Slider(
+                        minimum=1.0,
+                        maximum=2.0,
+                        step=0.1,
+                        value=1.2,
+                        label="Repetition Penalty",
+                    )
+                # NOTE(review): with cache_examples=False and no run_on_click,
+                # Gradio presumably only copies the example into `message` and
+                # never calls process_example - confirm against the Gradio docs.
+                examples = gr.Examples(
+                    examples=create_examples(),
+                    inputs=[message],
+                    outputs=[chat_history, chat_display],
+                    fn=process_example,
+                    cache_examples=False,
+                )
+        # Set up event handlers
+        # The Send button and submitting the message box both run
+        # chat_response() with the same inputs and outputs.
+        submit_click = submit.click(
+            chat_response,
+            inputs=[
+                message,
+                chat_history,
+                chat_display,
+                system_prompt,
+                temperature,
+                max_tokens,
+                top_p,
+                top_k,
+                penalty,
+            ],
+            outputs=[chat_history, chat_display],
+            show_progress=True,
+        )
+        message.submit(
+            chat_response,
+            inputs=[
+                message,
+                chat_history,
+                chat_display,
+                system_prompt,
+                temperature,
+                max_tokens,
+                top_p,
+                top_k,
+                penalty,
+            ],
+            outputs=[chat_history, chat_display],
+            show_progress=True,
+        )
+        # Clear resets both the stored history and the visible transcript.
+        clear.click(
+            lambda: ([], ""),
+            outputs=[chat_history, chat_display],
+            show_progress=True,
+        )
+        # Empty the message box: chained after the click handler, but registered
+        # as a separate listener for the submit event.
+        # NOTE(review): confirm the submit variant is meant to be unchained.
+        submit_click.then(lambda: "", outputs=message)
+        message.submit(lambda: "", outputs=message)
+    return demo
+# Script entry point: main() loads the model and builds the UI, then the
+# returned Blocks app is launched.
 if __name__ == "__main__":
+    demo = main()
     demo.launch()