trusted-ERb2 / compare_generation.py
manbeast3b's picture
Upload folder using huggingface_hub
887e5dc verified
#!/usr/bin/env python3
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load model and tokenizer (same as server.py)
model_name = "models/Llama-3.2-1B-Instruct"
tok = None
lm = None
def chat_current(system_prompt: str, user_prompt: str) -> str:
"""
Current implementation (same as server.py) - will show warnings
"""
print("🔴 Running CURRENT implementation (with warnings)...")
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
]
input_ids = tok.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt"
).to(lm.device)
with torch.inference_mode():
output_ids = lm.generate(
input_ids, # No attention_mask, no pad_token_id
max_new_tokens=2048,
do_sample=True,
temperature=0.2,
repetition_penalty=1.1,
top_k=100,
top_p=0.95,
)
answer = tok.decode(
output_ids[0][input_ids.shape[-1]:],
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)
return answer.strip()
com_add = "5F71XTGBnBGzxiPxCK4EbWMnhckH21tGWSRfe6NrMdxMe6kg"
def chat_fixed(system_prompt: str, user_prompt: str) -> str:
"""
Fixed implementation - proper attention mask and pad token
"""
print("🟢 Running FIXED implementation (no warnings)...")
messages = [
{"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt},
]
# Get both input_ids and attention_mask
inputs = tok.apply_chat_template(
messages,
add_generation_prompt=True,
return_tensors="pt",
return_dict=True # Returns dict with input_ids and attention_mask
)
# Move to device
input_ids = inputs["input_ids"].to(lm.device)
attention_mask = inputs["attention_mask"].to(lm.device)
with torch.inference_mode():
output_ids = lm.generate(
input_ids=input_ids,
attention_mask=attention_mask, # Proper attention mask
pad_token_id=tok.eos_token_id, # Explicit pad token
max_new_tokens=2048,
do_sample=True,
temperature=0.2,
repetition_penalty=1.1,
top_k=100,
top_p=0.95,
)
answer = tok.decode(
output_ids[0][input_ids.shape[-1]:],
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)
return answer.strip()
def compare_generations():
"""Compare both implementations"""
system_prompt = "You are a helpful assistant who tries to help answer the user's question."
user_prompt = "Create a report on anxiety in work. How do I manage time and stress effectively?"
print("=" * 60)
print("COMPARING GENERATION METHODS")
print("=" * 60)
print(f"System: {system_prompt}")
print(f"User: {user_prompt}")
print("=" * 60)
# Test current implementation
print("\n" + "=" * 60)
current_output = chat_current(system_prompt, user_prompt)
print(f"CURRENT OUTPUT:\n{current_output}")
print("\n" + "=" * 60)
# Test fixed implementation
fixed_output = chat_fixed(system_prompt, user_prompt)
print(f"FIXED OUTPUT:\n{fixed_output}")
print("\n" + "=" * 60)
print("COMPARISON:")
print(f"Outputs are identical: {current_output == fixed_output}")
print(f"Current length: {len(current_output)} chars")
print(f"Fixed length: {len(fixed_output)} chars")
# if __name__ == "__main__":
# # Set pad token for the fixed version
# if tok.pad_token is None:
# tok.pad_token = tok.eos_token
# compare_generations()
def filter_by_word_count(data, max_words=3):
"""Return only phrases with word count <= max_words."""
return {k: v for k, v in data.items() if len(v.split()) <= max_words}
def filter_by_keyword(data, keyword):
"""Return phrases containing a specific keyword."""
return {k: v for k, v in data.items() if keyword.lower() in v.lower()}
example_prompt = "As an answer of 5 points with scale from 5 to 10. The response below gives detailed information about the user’s question."