import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import traceback

# Fixed system prompt (your "persona")
SYSTEM_PROMPT = (
    "You are Alexander Molchevskyi — a senior software engineer with over 20 years "
    "of professional experience across embedded, desktop, and server systems. "
    "Skilled in C++, Rust, Python, AI infrastructure, compilers, WebAssembly, and "
    "developer tooling. You answer interview questions clearly, professionally, and naturally."
)
# Global variables for model and tokenizer
model = None
tokenizer = None

def load_model():
    global model, tokenizer
    try:
        print("Loading model for CPU inference...")
        model_name = "Molchevsky/ai_resume_llama-3.2-3b"

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print(f"Tokenizer loaded. Vocab size: {len(tokenizer)}")

        # CPU-optimized model loading
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map=None,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            use_cache=True,
        )

        # Explicitly move to CPU
        model = model.to("cpu")
        model.eval()  # Set to evaluation mode
        print("Model loaded successfully on CPU!")
        return True
    except Exception as e:
        print(f"Error loading model: {e}")
        print(traceback.format_exc())
        return False

# Load model at startup
model_loaded = load_model()
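
# --- Optional sketch (not used by the debug path below) ----------------------
# SYSTEM_PROMPT is defined above but never reaches the simple "User:/Assistant:"
# prompt used for debugging. One way to include it, assuming the tokenizer ships
# a Llama-3-style chat template (typical for Llama-3.2 fine-tunes, but verify):
def build_chat_prompt(message, history=None):
    """Build a prompt via the tokenizer's chat template, prepending SYSTEM_PROMPT."""
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_turn, assistant_turn in (history or []):
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})
    # add_generation_prompt=True appends the assistant header so the model
    # continues with an answer instead of a new user turn.
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )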
def simple_respond(message, history, max_tokens, temperature, top_p):
    """
    Simple non-streaming generation for debugging.
    """
    if not model_loaded or model is None or tokenizer is None:
        return "Error: Model not loaded properly."
    try:
        print(f"Processing: {message}")

        # Very simple prompt
        prompt = f"User: {message}\nAssistant:"
        print(f"Prompt: {repr(prompt)}")

        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        print(f"Input shape: {inputs.input_ids.shape}")
        print(f"Input tokens: {inputs.input_ids[0][:10]}")  # First 10 tokens

        # Simple generation - no streaming; sampling controls come from the UI sliders
        print("Starting generation...")
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=int(max_tokens),  # Slider default (20) keeps this small for testing
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        print("Generation completed!")
        print(f"Output shape: {outputs.shape}")

        # Decode only the new tokens
        new_tokens = outputs[0][inputs.input_ids.shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
        print(f"Response: {repr(response)}")

        if not response.strip():
            return "Model generated empty response. This might be a model configuration issue."
        return response.strip()
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        print(error_msg)
        print(traceback.format_exc())
        return error_msg
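
# --- Optional sketch: streaming variant (not wired into the debug UI) --------
# The debug handler above generates in one shot. A streaming version could use
# transformers' TextIteratorStreamer and run generate() in a background thread,
# yielding partial text to Gradio. Parameter names mirror simple_respond; this
# is an illustrative sketch, not the Space's production handler.
from threading import Thread
from transformers import TextIteratorStreamer

def streaming_respond(message, history, max_tokens, temperature, top_p):
    prompt = f"User: {message}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=int(max_tokens),
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )
    # generate() blocks, so run it in a thread and consume the streamer here.
    Thread(target=model.generate, kwargs=generation_kwargs).start()
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial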
# Create simple interface for testing
with gr.Blocks() as demo:
    gr.Markdown("# Debug Version - Simple Generation Test")

    with gr.Row():
        msg_input = gr.Textbox(label="Message", placeholder="Type your message...")
        send_btn = gr.Button("Send")

    output = gr.Textbox(label="Response", lines=5)

    # Simple controls
    max_tokens = gr.Slider(1, 100, value=20, label="Max Tokens")
    temperature = gr.Slider(0.1, 2.0, value=0.7, label="Temperature")
    top_p = gr.Slider(0.1, 1.0, value=0.9, label="Top-p")

    send_btn.click(
        simple_respond,
        inputs=[msg_input, gr.State([]), max_tokens, temperature, top_p],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch(debug=True)