import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import traceback

# Fixed system prompt (your "persona").
# Note: the debug path below intentionally uses a minimal prompt, so this
# constant is kept for the full app but is not applied in simple_respond().
SYSTEM_PROMPT = (
    "You are Alexander Molchevskyi — a senior software engineer with over 20 years "
    "of professional experience across embedded, desktop, and server systems. "
    "Skilled in C++, Rust, Python, AI infrastructure, compilers, WebAssembly, and "
    "developer tooling. You answer interview questions clearly, professionally, and naturally."
)

# Global variables for model and tokenizer
model = None
tokenizer = None


def load_model():
    global model, tokenizer
    try:
        print("Loading model for CPU inference...")
        model_name = "Molchevsky/ai_resume_llama-3.2-3b"

        # Load tokenizer and make sure a pad token is set
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print(f"Tokenizer loaded. Vocab size: {len(tokenizer)}")

        # CPU-optimized model loading
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float32,
            device_map=None,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
            use_cache=True,
        )

        # Explicitly move to CPU and switch to evaluation mode
        model = model.to("cpu")
        model.eval()

        print("Model loaded successfully on CPU!")
        return True
    except Exception as e:
        print(f"Error loading model: {e}")
        print(traceback.format_exc())
        return False


# Load model at startup
model_loaded = load_model()


def simple_respond(message, history, max_tokens, temperature, top_p):
    """Simple non-streaming generation for debugging."""
    if not model_loaded or model is None or tokenizer is None:
        return "Error: Model not loaded properly."

    try:
        print(f"Processing: {message}")

        # Deliberately minimal prompt for the debug path
        prompt = f"User: {message}\nAssistant:"
        print(f"Prompt: {repr(prompt)}")

        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
        print(f"Input shape: {inputs.input_ids.shape}")
        print(f"Input tokens: {inputs.input_ids[0][:10]}")  # First 10 tokens

        # Simple generation - no streaming; sampling settings come from the UI sliders
        print("Starting generation...")
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=int(max_tokens),
                temperature=float(temperature),
                top_p=float(top_p),
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        print("Generation completed!")
        print(f"Output shape: {outputs.shape}")

        # Decode only the newly generated tokens
        new_tokens = outputs[0][inputs.input_ids.shape[1]:]
        response = tokenizer.decode(new_tokens, skip_special_tokens=True)
        print(f"Response: {repr(response)}")

        if not response.strip():
            return "Model generated an empty response. This might be a model configuration issue."

        return response.strip()

    except Exception as e:
        error_msg = f"Error: {str(e)}"
        print(error_msg)
        print(traceback.format_exc())
        return error_msg


# Create a simple interface for testing
with gr.Blocks() as demo:
    gr.Markdown("# Debug Version - Simple Generation Test")

    with gr.Row():
        msg_input = gr.Textbox(label="Message", placeholder="Type your message...")
        send_btn = gr.Button("Send")

    output = gr.Textbox(label="Response", lines=5)

    # Simple controls
    max_tokens = gr.Slider(1, 100, value=20, label="Max Tokens")
    temperature = gr.Slider(0.1, 2.0, value=0.7, label="Temperature")
    top_p = gr.Slider(0.1, 1.0, value=0.9, label="Top-p")

    send_btn.click(
        simple_respond,
        inputs=[msg_input, gr.State([]), max_tokens, temperature, top_p],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch(debug=True)