Spaces: Running on Zero
AbstractPhil committed · Commit 7779abb · Parent(s): ed0198d
disabled peft in a differnt version
app.py
CHANGED
@@ -73,8 +73,8 @@ def _hf_login() -> None:
     else:
         print("[HF Auth] No token found in environment variables")
 
-# Login
-
+# Login before loading any models
+_hf_login()
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
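For context, _hf_login() is defined just above this hunk and, judging by the surrounding log message, reads a token from the environment before authenticating. A minimal sketch of a helper with that shape, assuming the token is exposed as HF_TOKEN (the real implementation lives earlier in app.py and may differ):

import os
from huggingface_hub import login

def _hf_login_sketch() -> None:
    # Assumption: the Space exposes its access token as HF_TOKEN (or a similar secret).
    token = os.getenv("HF_TOKEN")
    if token:
        login(token=token)  # authenticate so gated/private models can be loaded
    else:
        print("[HF Auth] No token found in environment variables")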
@@ -364,15 +364,16 @@ def zerogpu_generate(full_prompt,
         out_ids = model.generate(
             **inputs,
             do_sample=bool(gen_kwargs.get("do_sample", True)),
-            temperature=float(gen_kwargs.get("temperature", 0.
-            top_p=
-            top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") else None),
+            temperature=float(gen_kwargs.get("temperature", 0.7)),
+            top_p=float(gen_kwargs.get("top_p", 0.9)),
+            top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
             max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
             pad_token_id=model.config.pad_token_id,
             eos_token_id=tokenizer.eos_token_id,
-
-            no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 6)),
+            bad_words_ids=bad_words_ids,
             logits_processor=logits_processor,
+            repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.2)),
+            no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 8)),
             stopping_criteria=sc,
         )
 
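The new bad_words_ids argument refers to a variable built elsewhere in app.py and not shown in this hunk. A rough sketch of how such a list is typically constructed for model.generate; the banned strings below are assumptions, and tokenizer is the module-level tokenizer the file already defines:

# Illustrative only: the actual banned strings are defined elsewhere in app.py.
banned_strings = ["<|start|>", "<|channel|>"]  # assumed Harmony header markers
bad_words_ids = [
    tokenizer(s, add_special_tokens=False).input_ids  # one token-id sequence per banned string
    for s in banned_strings
]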
@@ -419,59 +420,6 @@ def zerogpu_generate(full_prompt,
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
 
-# -----------------------
-# Simple (non-Harmony) GPU path — matches your minimal example
-# -----------------------
-@spaces.GPU(duration=120)
-def zerogpu_generate_simple(prompt_str: str, gen_kwargs: Dict[str, Any], rose_map: Optional[Dict[str, float]], rose_alpha: float, rose_score: Optional[float], seed: Optional[int]) -> Dict[str, str]:
-    """Straight chat_template path. No Harmony tokens. Slices completion from prompt_len.
-    Mirrors the minimal HF example and avoids header loops entirely."""
-    model = None
-    try:
-        if seed is not None:
-            torch.manual_seed(int(seed))
-        model = _load_model_on("auto")
-        device = next(model.parameters()).device
-
-        # Encode prompt string
-        enc = tokenizer(prompt_str, return_tensors="pt")
-        inputs = {k: v.to(device) for k, v in enc.items()}
-        prompt_len = int(inputs["input_ids"].shape[1])
-        if "attention_mask" not in inputs:
-            inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)
-
-        # Optional Rose bias
-        logits_processor = None
-        if rose_map:
-            bias = build_bias_from_tokens(tokenizer, rose_map).to(device)
-            eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
-            logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
-
-        out_ids = model.generate(
-            **inputs,
-            do_sample=bool(gen_kwargs.get("do_sample", True)),
-            temperature=float(gen_kwargs.get("temperature", 0.6)),
-            top_p=(float(gen_kwargs.get("top_p")) if gen_kwargs.get("top_p") is not None else None),
-            top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") else None),
-            max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
-            pad_token_id=model.config.pad_token_id,
-            logits_processor=logits_processor,
-        )
-        # Slice generated continuation only
-        new_ids = out_ids[0, prompt_len:]
-        text = tokenizer.decode(new_ids, skip_special_tokens=True)
-        return {"final": text}
-    except Exception as e:
-        return {"final": f"[Error] {type(e).__name__}: {e}"}
-    finally:
-        try:
-            del model
-        except Exception:
-            pass
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-
 # -----------------------
 # GPU Debug: Harmony Inspector
 # -----------------------
@@ -512,6 +460,7 @@ def zerogpu_generate_debug(full_prompt, gen_kwargs: Dict[str, Any]) -> Dict[str,
             max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
             pad_token_id=model.config.pad_token_id,
             eos_token_id=tokenizer.eos_token_id,
+            bad_words_ids=bad_words_ids,
             stopping_criteria=sc,
             repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.15)),
             no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 6)),
@@ -568,45 +517,29 @@ def generate_response(message: str, history: List[List[str]], system_prompt: str
                       rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
                       rose_tokens: str, rose_json: str,
                       show_thinking: bool = False,
-                      simple_mode: bool = True,  # NEW: default to simple chat_template path
                       reasoning_effort: str = "high") -> str:
     """
     Generate response with proper CoT handling using Harmony format.
     """
     try:
-        # Build
+        # Build message list
         messages = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
-
-        # Add
+
+        # Add history
         if history:
-
-
-
-
-
-            if
-            messages.append({"role":
-
-
-
-
-
-
-            if a:
-                messages.append({"role": "assistant", "content": str(a)})
-
-        # Current user message
-        if isinstance(message, dict):
-            user_text = message.get("content", "")
-        else:
-            user_text = str(message)
-        messages.append({"role": "user", "content": user_text})
-
-        # FAST PATH: simple chat_template prompt (recommended)
-        if simple_mode:
-            prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-        # Harmony path (optional)
-        elif HARMONY_AVAILABLE:
+            for turn in history:
+                if isinstance(turn, (list, tuple)) and len(turn) >= 2:
+                    user_msg, assistant_msg = turn[0], turn[1]
+                    if user_msg:
+                        messages.append({"role": "user", "content": str(user_msg)})
+                    if assistant_msg:
+                        messages.append({"role": "assistant", "content": str(assistant_msg)})
+
+        # Add current message
+        messages.append({"role": "user", "content": str(message)})
+
+        # Create Harmony-formatted prompt
+        if HARMONY_AVAILABLE:
             prompt = create_harmony_prompt(messages, reasoning_effort)  # returns token IDs
         else:
             # Fallback to tokenizer template (string)
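For reference, the new history loop above builds an OpenAI-style message list before the prompt is rendered. A small illustration with made-up values:

# Example input and the resulting structure (values are illustrative only).
history = [["Hello", "Hi! How can I help?"]]
# After the loop and the current-message append, messages is:
# [
#     {"role": "system", "content": system_prompt or SYSTEM_DEF},
#     {"role": "user", "content": "Hello"},
#     {"role": "assistant", "content": "Hi! How can I help?"},
#     {"role": "user", "content": str(message)},
# ]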
@@ -640,23 +573,7 @@ def generate_response(message: str, history: List[List[str]], system_prompt: str
         rose_map = None
 
         # Generate with model
-
-            channels = zerogpu_generate_simple(
-                prompt,
-                {
-                    "do_sample": bool(do_sample),
-                    "temperature": float(temperature),
-                    "top_p": float(top_p) if top_p is not None else None,
-                    "top_k": int(top_k) if top_k > 0 else None,
-                    "max_new_tokens": int(max_new_tokens),
-                },
-                rose_map,
-                float(rose_alpha),
-                float(rose_score) if rose_score is not None else None,
-                int(seed) if seed is not None else None,
-            )
-        else:
-            channels = zerogpu_generate(
+        channels = zerogpu_generate(
             prompt,
             {
                 "do_sample": bool(do_sample),
@@ -717,7 +634,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         """
     )
 
-
     with gr.Row():
         system_prompt = gr.Textbox(
             label="System Prompt",
@@ -725,13 +641,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
            lines=2
        )
 
-    with gr.Accordion("Generation Settings
-        # NEW: toggle to bypass Harmony and use plain chat_template like your minimal script
-        simple_mode = gr.Checkbox(
-            value=True,
-            label="Use simple chat_template (no Harmony)",
-            info="Matches the minimal HF example; safest path for now"
-        )
+    with gr.Accordion("Generation Settings", open=False):
         with gr.Row():
             temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
             top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.01, label="Top-p")
@@ -782,9 +692,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         fn=generate_response,
         type="messages",
         additional_inputs=[
-            system_prompt, temperature, top_p, top_k, max_new,
-            do_sample, seed, rose_enable, rose_alpha, rose_score,
-            rose_tokens, rose_json, show_thinking,
+            system_prompt, temperature, top_p, top_k, max_new,
+            do_sample, seed, rose_enable, rose_alpha, rose_score,
+            rose_tokens, rose_json, show_thinking, reasoning_effort
         ],
         title="Chat with Mirel",
         description="A chain-of-thought model using Harmony format",
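The updated additional_inputs list now forwards reasoning_effort into generate_response; the matching UI component is not visible in this diff. A hypothetical sketch of such a control (the name, choices, and placement are assumptions; only the "high" default comes from the function signature):

# Hypothetical component; the actual control is defined elsewhere in app.py.
reasoning_effort = gr.Radio(
    choices=["low", "medium", "high"],  # assumed effort levels
    value="high",                       # matches the generate_response default
    label="Reasoning Effort",
)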