import streamlit as st
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
import torch

# Note: the bitsandbytes package must be installed for 4-bit quantization;
# transformers uses it under the hood, so it does not need to be imported here.

# Check if CUDA is available and decide on the device.
# Caveat: bitsandbytes 4-bit quantization requires a CUDA GPU, so the CPU
# fallback below will not work with this pre-quantized checkpoint.
device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"


@st.cache_resource  # Cache so Streamlit doesn't reload the model on every rerun
def load_model():
    """Load the tokenizer and the 4-bit quantized LLaMA model."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        # quantization_config replaces the deprecated load_in_4bit=True kwarg
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
        device_map="auto" if device == "cuda" else {"": "cpu"},
    )
    return tokenizer, model


tokenizer, model = load_model()

# Streamlit interface
st.title("Keyword Extractor using LLaMA 4-bit Model")

# Text input area for user input
user_input = st.text_area("Enter text for keyword extraction")

if user_input:
    # Prepare the prompt for keyword extraction
    prompt_template = (
        "Extract keywords and variables from the prompt:\n"
        "{}\n"
    )
    prompt = prompt_template.format(user_input)

    # Tokenize the input text and move it to the model's device
    inputs = tokenizer([prompt], return_tensors="pt").to(device)

    # TextStreamer prints tokens to the server console as they are generated;
    # it does not stream into the Streamlit page itself.
    text_streamer = TextStreamer(tokenizer, skip_prompt=True)

    # Generate the keywords and variables
    with torch.no_grad():
        output = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)

    # Decode only the newly generated tokens, skipping the echoed prompt
    new_tokens = output[0][inputs["input_ids"].shape[1]:]
    generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True)

    # Display the result in the Streamlit app
    st.write("Extracted Keywords and Variables:")
    st.write(generated_text)
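
# Usage sketch (assuming this script is saved as app.py):
# Streamlit apps are launched via the streamlit CLI rather than plain python,
# so the script reruns top-to-bottom on each UI interaction, which is why the
# model loader above is wrapped in @st.cache_resource.
#
#   streamlit run app.py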