# ai_resume_chat / app.py
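"""
Gradio debug app: loads the Molchevsky/ai_resume_llama-3.2-3b model for CPU
inference and exposes a minimal, non-streaming "send a message, get a reply"
interface for testing generation.
"""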
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import traceback

# Fixed system prompt (your "persona")
SYSTEM_PROMPT = (
    "You are Alexander Molchevskyi — a senior software engineer with over 20 years "
    "of professional experience across embedded, desktop, and server systems. "
    "Skilled in C++, Rust, Python, AI infrastructure, compilers, WebAssembly, and "
    "developer tooling. You answer interview questions clearly, professionally, and naturally."
)
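
# NOTE: SYSTEM_PROMPT is defined for the full persona prompt, but the simplified
# debug handler below builds a bare "User:/Assistant:" prompt and does not use it yet.
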
# Global variables for model and tokenizer
model = None
tokenizer = None


def load_model():
    global model, tokenizer
    try:
        print("Loading model for CPU inference...")
        model_name = "Molchevsky/ai_resume_llama-3.2-3b"

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print(f"Tokenizer loaded. Vocab size: {len(tokenizer)}")

        # CPU-optimized model loading
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map=None,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            use_cache=True,
        )

        # Explicitly move to CPU
        model = model.to('cpu')
        model.eval()  # Set to evaluation mode

        print("Model loaded successfully on CPU!")
        return True
    except Exception as e:
        print(f"Error loading model: {e}")
        print(traceback.format_exc())
        return False


# Load model at startup
model_loaded = load_model()


def simple_respond(message, history, max_tokens, temperature, top_p):
    """
    Simple non-streaming generation for debugging.
    """
    if not model_loaded or model is None or tokenizer is None:
        return "Error: Model not loaded properly."

    try:
        print(f"Processing: {message}")

        # Very simple prompt
        prompt = f"User: {message}\nAssistant:"
        print(f"Prompt: {repr(prompt)}")

        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        print(f"Input shape: {inputs.input_ids.shape}")
        print(f"Input tokens: {inputs.input_ids[0][:10]}")  # First 10 tokens

        # Simple generation - no streaming; sampling values come from the UI sliders
        print("Starting generation...")
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=int(max_tokens),  # Slider default is 20 - very small for testing
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )
        print("Generation completed!")
        print(f"Output shape: {outputs.shape}")

        # Decode only the new tokens
        new_tokens = outputs[0][inputs.input_ids.shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
        print(f"Response: {repr(response)}")

        if not response.strip():
            return "Model generated empty response. This might be a model configuration issue."
        return response.strip()
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        print(error_msg)
        print(traceback.format_exc())
        return error_msg
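

# The handler above is intentionally non-streaming. As a rough sketch (not wired
# into the UI below, and assuming the same global `model`/`tokenizer`), streaming
# could be added with transformers' TextIteratorStreamer, which yields text while
# generate() runs in a background thread:
def stream_respond_sketch(message, max_tokens=20, temperature=0.7, top_p=0.9):
    from threading import Thread

    from transformers import TextIteratorStreamer

    prompt = f"User: {message}\nAssistant:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        streamer=streamer,
        max_new_tokens=int(max_tokens),
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
    )
    Thread(target=model.generate, kwargs=generation_kwargs).start()

    # Accumulate partial text so a Gradio callback could yield a growing response.
    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial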


# Create simple interface for testing
with gr.Blocks() as demo:
    gr.Markdown("# Debug Version - Simple Generation Test")

    with gr.Row():
        msg_input = gr.Textbox(label="Message", placeholder="Type your message...")
        send_btn = gr.Button("Send")

    output = gr.Textbox(label="Response", lines=5)

    # Simple controls
    max_tokens = gr.Slider(1, 100, value=20, label="Max Tokens")
    temperature = gr.Slider(0.1, 2.0, value=0.7, label="Temperature")
    top_p = gr.Slider(0.1, 1.0, value=0.9, label="Top-p")

    # gr.State([]) supplies an empty chat-history placeholder for the handler's
    # `history` argument, which this debug handler ignores.
    send_btn.click(
        simple_respond,
        inputs=[msg_input, gr.State([]), max_tokens, temperature, top_p],
        outputs=output,
    )


if __name__ == "__main__":
    demo.launch(debug=True)