Spaces · Runtime error
Boning c committed: Update app.py

app.py CHANGED
@@ -4,7 +4,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 import re, json
 from html import escape
 
-# ───
+# ─── Configuration ──────────────────────────────────────────────────────────
 PRIMARY_MODEL = "Smilyai-labs/Sam-reason-A3"
 FALLBACK_MODEL = "Smilyai-labs/Sam-reason-A1"
 USAGE_LIMIT = 5
@@ -14,7 +14,7 @@ device = "cuda" if torch.cuda.is_available() else "cpu"
 primary_model = primary_tokenizer = None
 fallback_model = fallback_tokenizer = None
 
-# ───
+# ─── Model Loading ──────────────────────────────────────────────────────────
 def load_models():
     global primary_model, primary_tokenizer, fallback_model, fallback_tokenizer
     primary_tokenizer = AutoTokenizer.from_pretrained(PRIMARY_MODEL, trust_remote_code=True)
@@ -25,9 +25,9 @@ def load_models():
     fallback_model = AutoModelForCausalLM.from_pretrained(
         FALLBACK_MODEL, torch_dtype=torch.float16
     ).to(device).eval()
-    return f"✅ Loaded {PRIMARY_MODEL}
+    return f"✅ Loaded {PRIMARY_MODEL} (fallback: {FALLBACK_MODEL})"
 
-# ─── Build
+# ─── Build Chat Prompt ──────────────────────────────────────────────────────
 def build_chat_prompt(history, user_input, reasoning_enabled):
     system_flag = "/think" if reasoning_enabled else "/no_think"
     prompt = f"<|system|>\n{system_flag}\n"
@@ -36,20 +36,20 @@ def build_chat_prompt(history, user_input, reasoning_enabled):
     prompt += f"<|user|>\n{user_input}\n<|assistant|>\n"
     return prompt
 
-# ─── Collapse <think> Blocks
+# ─── Collapse <think> Blocks ────────────────────────────────────────────────
 def format_thinking(text):
     match = re.search(r"<think>(.*?)</think>", text, re.DOTALL)
     if not match:
         return escape(text)
     reasoning = escape(match.group(1).strip())
-    visible
+    visible = re.sub(r"<think>.*?</think>", "[thinking...]", text, flags=re.DOTALL).strip()
     return (
         escape(visible)
         + "<br><details><summary>🧠 Show reasoning</summary>"
         + f"<pre>{reasoning}</pre></details>"
     )
 
-# ─── Token
+# ─── Token-by-Token Streaming (Stops on <|user|>) ───────────────────────────
 def generate_stream(prompt, use_fallback=False,
                     max_length=100, temperature=0.2, top_p=0.9):
     model = fallback_model if use_fallback else primary_model
@@ -59,7 +59,7 @@ def generate_stream(prompt, use_fallback=False,
     assistant_text = ""
 
     for _ in range(max_length):
-        # 1)
+        # 1) Get next-token logits and apply top-p
         logits = model(generated).logits[:, -1, :] / temperature
         sorted_logits, idxs = torch.sort(logits, descending=True)
         probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
@@ -68,40 +68,64 @@ def generate_stream(prompt, use_fallback=False,
         mask[..., 0] = 0
         filtered = logits.clone()
         filtered[:, idxs[mask]] = -float("Inf")
+
+        # 2) Sample and append
         next_token = torch.multinomial(torch.softmax(filtered, dim=-1), 1)
         generated = torch.cat([generated, next_token], dim=-1)
-
-        # 2) Decode and append
         new_text = tokenizer.decode(next_token[0], skip_special_tokens=False)
         assistant_text += new_text
 
-        # 3)
+        # 3) Remove any leading assistant tag
        if assistant_text.startswith("<|assistant|>"):
             assistant_text = assistant_text[len("<|assistant|>"):]
 
-        # 4) If
+        # 4) If we see a user-turn tag, truncate and bail
         if "<|user|>" in assistant_text:
-            # drop the user tag and anything after it
             assistant_text = assistant_text.split("<|user|>")[0]
             yield assistant_text
             break
 
-        # 5) Otherwise stream
+        # 5) Otherwise stream clean assistant text
         yield assistant_text
 
-        # 6)
+        # 6) End if EOS
         if next_token.item() == tokenizer.eos_token_id:
             break
 
+# ─── Main Chat Handler ──────────────────────────────────────────────────────
+def respond(message, history, reasoning_enabled, limit_json):
+    # parse client-side usage info
+    info = json.loads(limit_json) if limit_json else {"count": 0}
+    count = info.get("count", 0)
+    use_fallback = count > USAGE_LIMIT
+    remaining = max(0, USAGE_LIMIT - count)
+    model_label = "A3" if not use_fallback else "Fallback A1"
+
+    # initial yield to set "Generating…"
+    prompt = build_chat_prompt(history, message.strip(), reasoning_enabled)
+    history = history + [[message, ""]]
+    yield history, history, f"🧠 A3 left: {remaining}", "Generating…"
+
+    # stream assistant reply
+    for chunk in generate_stream(prompt, use_fallback):
+        formatted = format_thinking(chunk)
+        history[-1][1] = (
+            f"{formatted}<br><sub style='color:gray'>({model_label})</sub>"
+        )
+        yield history, history, f"🧠 A3 left: {remaining}", "Generating…"
+
+    # final yield resets button text
+    yield history, history, f"🧠 A3 left: {remaining}", "Send"
+
+# ─── Clear Chat ─────────────────────────────────────────────────────────────
 def clear_chat():
     return [], [], "🧠 A3 left: 5", "Send"
 
-# ─── Gradio UI
+# ─── Gradio UI ──────────────────────────────────────────────────────────────
 with gr.Blocks() as demo:
     # Inject client-side JS + CSS
     gr.HTML(f"""
     <script>
-      // bump/reset usage in localStorage and write to hidden textbox
       function updateUsageLimit() {{
         const key = "samai_limit";
         const now = Date.now();
@@ -114,7 +138,6 @@ with gr.Blocks() as demo:
         localStorage.setItem(key, JSON.stringify(rec));
         document.getElementById("limit_json").value = JSON.stringify(rec);
       }}
-      // on Send click: update limit & flip button text
      document.addEventListener("DOMContentLoaded", () => {{
        const btn = document.getElementById("send_btn");
        btn.addEventListener("click", () => {{
@@ -133,11 +156,11 @@ with gr.Blocks() as demo:
        text-align: center;
      }}
    </style>
-    """)
+    """)
 
     gr.Markdown("# 🤖 SamAI – Chat Reasoning (Final)")
 
-    #
+    # Hidden textbox ferrying usage JSON from JS → Python
     limit_json = gr.Textbox(visible=False, elem_id="limit_json")
     model_status = gr.Textbox(interactive=False, label="Model Status")
     usage_counter = gr.Textbox("🧠 A3 left: 5", interactive=False, show_label=False)
@@ -154,14 +177,16 @@ with gr.Blocks() as demo:
 
     model_status.value = load_models()
 
+    # Bind Send button -> respond()
     send_btn.click(
        fn=respond,
        inputs=[user_input, chat_state, reason_toggle, limit_json],
        outputs=[chat_box, chat_state, usage_counter, send_btn]
    )
+
    clear_btn.click(
        fn=clear_chat,
-       inputs=[],
+       inputs=[],
        outputs=[chat_box, chat_state, usage_counter, send_btn]
    )
 
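What build_chat_prompt() now emits is easiest to see on an empty history, where the loop over prior turns (outside this diff) contributes nothing. A minimal sketch, not part of the commit:

    # Empty history: only the template lines visible in the diff shape the output.
    prompt = build_chat_prompt([], "What is 2+2?", reasoning_enabled=True)
    assert prompt == "<|system|>\n/think\n<|user|>\nWhat is 2+2?\n<|assistant|>\n"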
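The sampling step in generate_stream() (sort, cumulative softmax, threshold mask, multinomial draw) is a hand-rolled top-p (nucleus) filter. Below is a standalone sketch of the same idea, assuming batch size 1; it is not the committed code, and the threshold-shift line is an extra refinement that keeps the token which crosses top_p:

    import torch

    def top_p_filter(logits: torch.Tensor, top_p: float = 0.9) -> torch.Tensor:
        # logits: [1, vocab_size]
        sorted_logits, sorted_idx = torch.sort(logits, descending=True, dim=-1)
        cum_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
        remove = cum_probs > top_p
        remove[..., 1:] = remove[..., :-1].clone()  # keep the token that crosses the threshold
        remove[..., 0] = False                      # always keep the most likely token
        filtered = logits.clone()
        filtered[0, sorted_idx[remove]] = float("-inf")
        return filtered

    # Usage inside a sampling loop:
    # probs = torch.softmax(top_p_filter(logits / temperature), dim=-1)
    # next_token = torch.multinomial(probs, num_samples=1)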
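An illustrative input/output pair for the rewritten format_thinking(); the reply text here is made up for the example:

    text = "Sure.<think>Add 2 and 2, then check the result.</think> The answer is 4."
    html = format_thinking(text)
    # visible part -> "Sure.[thinking...] The answer is 4."
    # html         -> the visible part followed by
    #                 "<br><details><summary>🧠 Show reasoning</summary>"
    #                 "<pre>Add 2 and 2, then check the result.</pre></details>"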
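The new respond() handler works because Gradio treats a generator bound to Button.click() as a streaming callback: every yield re-renders the listed outputs. A minimal, self-contained sketch of that pattern, with illustrative component names and a dummy echo in place of the model:

    import time
    import gradio as gr

    def stream_reply(message, history):
        history = history + [[message, ""]]
        for ch in "echo: " + message:
            history[-1][1] += ch        # grow the assistant half of the last pair
            time.sleep(0.02)
            yield history, history      # chatbot view, then state
        yield history, history          # final yield once streaming ends

    with gr.Blocks() as sketch:
        chat_box = gr.Chatbot()         # classic list-of-pairs format, as in the app
        chat_state = gr.State([])
        user_input = gr.Textbox()
        send_btn = gr.Button("Send")
        send_btn.click(stream_reply, inputs=[user_input, chat_state],
                       outputs=[chat_box, chat_state])

    sketch.launch()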