from transformers import AutoTokenizer, AutoModelForCausalLM

# Load a small, fast model
model_name = "distilgpt2"  # smaller than full GPT-2, uses less memory
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

conversation = "You are a kind AI assistant. Stay on topic.\n"

print("\nType your messages below. Type 'quit' to exit.\n")

while True:
    try:
        user_input = input("You: ")
    except KeyboardInterrupt:
        print("\nEnding chat. Bye!")
        break

    if user_input.lower() in ["quit", "exit"]:
        print("Ending chat. Bye!")
        break

    conversation += f"User: {user_input}\nAI:"

    # Tokenize efficiently and limit input size to avoid memory issues
    inputs = tokenizer(conversation, return_tensors="pt", truncation=True, max_length=512)

    # Generate a short response to save memory and speed
    output = model.generate(
        **inputs,
        max_new_tokens=40,  # shorter responses = faster
        temperature=0.7,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode and extract AI response
    ai_response = tokenizer.decode(output[0], skip_special_tokens=True).split("AI:")[-1].strip()
    print(f"Baby AI: {ai_response}")

    # Add AI response to conversation for context
    conversation += f"{ai_response}\n"