AbstractPhil committed on
Commit 7779abb · 1 Parent(s): ed0198d

Disabled PEFT in a different version

Files changed (1):
  1. app.py (+30 -120)
app.py CHANGED
@@ -73,8 +73,8 @@ def _hf_login() -> None:
     else:
         print("[HF Auth] No token found in environment variables")
 
-# Login is handled by Space OAuth/session; avoid explicit CLI login here to prevent OAuth var errors
-# _hf_login()
+# Login before loading any models
+_hf_login()
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
@@ -364,15 +364,16 @@ def zerogpu_generate(full_prompt,
         out_ids = model.generate(
             **inputs,
             do_sample=bool(gen_kwargs.get("do_sample", True)),
-            temperature=float(gen_kwargs.get("temperature", 0.6)),
-            top_p=(float(gen_kwargs.get("top_p")) if gen_kwargs.get("top_p") is not None else None),
-            top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") else None),
+            temperature=float(gen_kwargs.get("temperature", 0.7)),
+            top_p=float(gen_kwargs.get("top_p", 0.9)),
+            top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") and int(gen_kwargs.get("top_k")) > 0 else None),
             max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
             pad_token_id=model.config.pad_token_id,
             eos_token_id=tokenizer.eos_token_id,
-            repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.1)),
-            no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 6)),
+            bad_words_ids=bad_words_ids,
             logits_processor=logits_processor,
+            repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.2)),
+            no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 8)),
             stopping_criteria=sc,
         )
 
@@ -419,59 +420,6 @@ def zerogpu_generate(full_prompt,
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
 
-# -----------------------
-# Simple (non-Harmony) GPU path — matches your minimal example
-# -----------------------
-@spaces.GPU(duration=120)
-def zerogpu_generate_simple(prompt_str: str, gen_kwargs: Dict[str, Any], rose_map: Optional[Dict[str, float]], rose_alpha: float, rose_score: Optional[float], seed: Optional[int]) -> Dict[str, str]:
-    """Straight chat_template path. No Harmony tokens. Slices completion from prompt_len.
-    Mirrors the minimal HF example and avoids header loops entirely."""
-    model = None
-    try:
-        if seed is not None:
-            torch.manual_seed(int(seed))
-        model = _load_model_on("auto")
-        device = next(model.parameters()).device
-
-        # Encode prompt string
-        enc = tokenizer(prompt_str, return_tensors="pt")
-        inputs = {k: v.to(device) for k, v in enc.items()}
-        prompt_len = int(inputs["input_ids"].shape[1])
-        if "attention_mask" not in inputs:
-            inputs["attention_mask"] = torch.ones_like(inputs["input_ids"], dtype=torch.long, device=device)
-
-        # Optional Rose bias
-        logits_processor = None
-        if rose_map:
-            bias = build_bias_from_tokens(tokenizer, rose_map).to(device)
-            eff_alpha = float(rose_alpha) * (float(rose_score) if rose_score is not None else 1.0)
-            logits_processor = [RoseGuidedLogits(bias, eff_alpha)]
-
-        out_ids = model.generate(
-            **inputs,
-            do_sample=bool(gen_kwargs.get("do_sample", True)),
-            temperature=float(gen_kwargs.get("temperature", 0.6)),
-            top_p=(float(gen_kwargs.get("top_p")) if gen_kwargs.get("top_p") is not None else None),
-            top_k=(int(gen_kwargs.get("top_k")) if gen_kwargs.get("top_k") else None),
-            max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
-            pad_token_id=model.config.pad_token_id,
-            logits_processor=logits_processor,
-        )
-        # Slice generated continuation only
-        new_ids = out_ids[0, prompt_len:]
-        text = tokenizer.decode(new_ids, skip_special_tokens=True)
-        return {"final": text}
-    except Exception as e:
-        return {"final": f"[Error] {type(e).__name__}: {e}"}
-    finally:
-        try:
-            del model
-        except Exception:
-            pass
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-
 # -----------------------
 # GPU Debug: Harmony Inspector
 # -----------------------
@@ -512,6 +460,7 @@ def zerogpu_generate_debug(full_prompt, gen_kwargs: Dict[str, Any]) -> Dict[str,
             max_new_tokens=int(gen_kwargs.get("max_new_tokens", MAX_DEF)),
             pad_token_id=model.config.pad_token_id,
             eos_token_id=tokenizer.eos_token_id,
+            bad_words_ids=bad_words_ids,
             stopping_criteria=sc,
             repetition_penalty=float(gen_kwargs.get("repetition_penalty", 1.15)),
             no_repeat_ngram_size=int(gen_kwargs.get("no_repeat_ngram_size", 6)),
@@ -568,45 +517,29 @@ def generate_response(message: str, history: List[List[str]], system_prompt: str
                       rose_enable: bool, rose_alpha: float, rose_score: Optional[float],
                       rose_tokens: str, rose_json: str,
                       show_thinking: bool = False,
-                      simple_mode: bool = True, # NEW: default to simple chat_template path
                       reasoning_effort: str = "high") -> str:
     """
     Generate response with proper CoT handling using Harmony format.
     """
     try:
-        # Build messages robustly for Gradio type='messages' or legacy tuple format
+        # Build message list
         messages = [{"role": "system", "content": system_prompt or SYSTEM_DEF}]
-
-        # Add prior turns
+
+        # Add history
         if history:
-            if isinstance(history, list) and history and isinstance(history[0], dict):
-                # history is already a flat list of {'role','content'} dicts
-                for m in history:
-                    role = m.get("role")
-                    content = m.get("content", "")
-                    if role in ("user", "assistant"):
-                        messages.append({"role": role, "content": str(content)})
-            else:
-                for turn in history:
-                    if isinstance(turn, (list, tuple)) and len(turn) >= 2:
-                        u, a = turn[0], turn[1]
-                        if u:
-                            messages.append({"role": "user", "content": str(u)})
-                        if a:
-                            messages.append({"role": "assistant", "content": str(a)})
-
-        # Current user message
-        if isinstance(message, dict):
-            user_text = message.get("content", "")
-        else:
-            user_text = str(message)
-        messages.append({"role": "user", "content": user_text})
-
-        # FAST PATH: simple chat_template prompt (recommended)
-        if simple_mode:
-            prompt = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-        # Harmony path (optional)
-        elif HARMONY_AVAILABLE:
+            for turn in history:
+                if isinstance(turn, (list, tuple)) and len(turn) >= 2:
+                    user_msg, assistant_msg = turn[0], turn[1]
+                    if user_msg:
+                        messages.append({"role": "user", "content": str(user_msg)})
+                    if assistant_msg:
+                        messages.append({"role": "assistant", "content": str(assistant_msg)})
+
+        # Add current message
+        messages.append({"role": "user", "content": str(message)})
+
+        # Create Harmony-formatted prompt
+        if HARMONY_AVAILABLE:
             prompt = create_harmony_prompt(messages, reasoning_effort) # returns token IDs
         else:
             # Fallback to tokenizer template (string)
@@ -640,23 +573,7 @@ def generate_response(message: str, history: List[List[str]], system_prompt: str
         rose_map = None
 
         # Generate with model
-        if simple_mode:
-            channels = zerogpu_generate_simple(
-                prompt,
-                {
-                    "do_sample": bool(do_sample),
-                    "temperature": float(temperature),
-                    "top_p": float(top_p) if top_p is not None else None,
-                    "top_k": int(top_k) if top_k > 0 else None,
-                    "max_new_tokens": int(max_new_tokens),
-                },
-                rose_map,
-                float(rose_alpha),
-                float(rose_score) if rose_score is not None else None,
-                int(seed) if seed is not None else None,
-            )
-        else:
-            channels = zerogpu_generate(
+        channels = zerogpu_generate(
             prompt,
             {
                 "do_sample": bool(do_sample),
@@ -717,7 +634,6 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         """
     )
 
-
    with gr.Row():
        system_prompt = gr.Textbox(
            label="System Prompt",
@@ -725,13 +641,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
            lines=2
        )
 
-    with gr.Accordion("Generation Settings ", open=False):
-        # NEW: toggle to bypass Harmony and use plain chat_template like your minimal script
-        simple_mode = gr.Checkbox(
-            value=True,
-            label="Use simple chat_template (no Harmony)",
-            info="Matches the minimal HF example; safest path for now"
-        )
+    with gr.Accordion("Generation Settings", open=False):
        with gr.Row():
            temperature = gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature")
            top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.01, label="Top-p")
@@ -782,9 +692,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
        fn=generate_response,
        type="messages",
        additional_inputs=[
-            system_prompt, temperature, top_p, top_k, max_new,
-            do_sample, seed, rose_enable, rose_alpha, rose_score,
-            rose_tokens, rose_json, show_thinking, simple_mode, reasoning_effort
+            system_prompt, temperature, top_p, top_k, max_new,
+            do_sample, seed, rose_enable, rose_alpha, rose_score,
+            rose_tokens, rose_json, show_thinking, reasoning_effort
        ],
        title="Chat with Mirel",
        description="A chain-of-thought model using Harmony format",
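
Note: both updated model.generate(...) calls above pass bad_words_ids, which app.py defines outside the hunks shown in this diff. In transformers, bad_words_ids must be a list of token-id lists, one entry per banned sequence. A minimal, hypothetical sketch of that shape (placeholder model id and strings, not taken from the commit):

    from transformers import AutoTokenizer

    # Hypothetical sketch only: app.py builds bad_words_ids elsewhere (not in this diff).
    # generate(bad_words_ids=...) expects List[List[int]]: one token-id list per banned
    # sequence, tokenized without special tokens.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model id
    banned_strings = ["placeholder banned phrase", "another placeholder"]
    bad_words_ids = [tokenizer(s, add_special_tokens=False).input_ids for s in banned_strings]

    # Passed straight through, matching the updated calls in the diff:
    # out_ids = model.generate(**inputs, bad_words_ids=bad_words_ids, ...)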
 
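The Rose-guided path kept by this commit builds logits_processor=[RoseGuidedLogits(bias, eff_alpha)] from a per-token bias, but the processor class itself sits outside the hunks above. A minimal sketch of such a processor, assuming bias is a vocab-sized tensor and alpha a scalar weight (the class name follows the diff; the body is an assumption, not the commit's code):

    import torch
    from transformers import LogitsProcessor

    class RoseGuidedLogits(LogitsProcessor):
        # Assumed implementation: add a fixed, scaled bias to the next-token logits at every step.
        def __init__(self, bias: torch.Tensor, alpha: float):
            self.bias = bias           # shape (vocab_size,); positive entries favor those tokens
            self.alpha = float(alpha)  # effective weight, e.g. rose_alpha * rose_score

        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
            # scores: (batch, vocab_size) logits produced by the model for the next token
            return scores + self.alpha * self.bias.to(device=scores.device, dtype=scores.dtype)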