Spaces:
Running
Running
| import gradio as gr | |
| from transformers import AutoProcessor, LlavaForConditionalGeneration | |
| from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext, set_global_service_context | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| from llama_index.vector_stores.faiss import FaissVectorStore | |
| from llama_index.storage.storage_context import StorageContext | |
| import torch | |
| from PIL import Image | |
| import os | |
# Load the LLaVA-1.5 processor (tokenizer + image preprocessing) and model.
model_id = "llava-hf/llava-1.5-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)

# Pick the precision from the device actually available. Half precision is
# meant for GPU inference; on the CPU fallback path float16 kernels are
# missing or very slow in many PyTorch builds, so full precision is used there.
# NOTE: float32 needs roughly twice the memory of float16 for the same weights.
_use_cuda = torch.cuda.is_available()
model = LlavaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16 if _use_cuda else torch.float32,
    low_cpu_mem_usage=True,
)
model.to("cuda" if _use_cuda else "cpu")
# Retrieval side: read the local corpus, embed it, and expose a query engine.

# Every file found under the "docs" directory is loaded as a document.
documents = SimpleDirectoryReader("docs").load_data()

# Embeddings come from a Hugging Face model; the resulting service context is
# registered globally and also handed to the index explicitly.
# NOTE(review): no `llm` is passed to ServiceContext.from_defaults, so the
# library's default LLM setting applies to the query engine -- TODO confirm
# which LLM (if any) is intended to synthesize the retrieved context.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")
service_context = ServiceContext.from_defaults(embed_model=embed_model)
set_global_service_context(service_context)

# NOTE(review): no storage context / vector store is supplied here, so the
# imported FaissVectorStore and StorageContext are not used by this index --
# confirm whether a FAISS-backed store was intended.
index = VectorStoreIndex.from_documents(
    documents,
    service_context=service_context,
)
query_engine = index.as_query_engine()
def multimodal_rag(image, question):
    """Answer a question about an image, grounded in retrieved document context.

    The question is sent to the module-level ``query_engine``; the text of its
    response is placed in a LLaVA-1.5 chat prompt together with the image, and
    the module-level ``model`` generates the answer.

    Args:
        image: Image to reason about (a PIL image when called from the Gradio
            interface). ``None`` means nothing was uploaded.
        question: The user's question as plain text.

    Returns:
        The generated answer text only (the echoed prompt is stripped), or a
        short instruction string when the image or the question is missing.
    """
    # Guard clauses: fail soft with a readable message instead of crashing
    # inside the processor / query engine on empty UI inputs.
    if image is None:
        return "Please upload an image."
    if not question or not question.strip():
        return "Please enter a question."
    question = question.strip()

    # Step 1: RAG to retrieve context. The response object is turned into
    # plain text explicitly before it is embedded in the prompt.
    context = str(query_engine.query(question)).strip()

    # Step 2: Build the prompt in the LLaVA-1.5 chat format. The "<image>"
    # placeholder marks where the image features are inserted; without it the
    # model has no slot for the picture.
    prompt = (
        "USER: <image>\n"
        f"Context: {context}\n"
        f"Question: {question} ASSISTANT:"
    )

    # Keyword arguments avoid depending on the processor's positional order,
    # which is not the same across transformers releases. Floating-point
    # inputs (pixel values) are cast to the model's dtype as well as moved
    # to its device.
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    inputs = inputs.to(model.device, model.dtype)

    output = model.generate(**inputs, max_new_tokens=100)

    # generate() returns prompt tokens followed by the new tokens; decode only
    # the newly generated part so the user does not see the prompt echoed.
    prompt_length = inputs["input_ids"].shape[1]
    answer = processor.decode(output[0][prompt_length:], skip_special_tokens=True)
    return answer.strip()
# Gradio front end: one image input and one text input feed multimodal_rag,
# whose string result is shown as plain text.
image_input = gr.Image(type="pil", label="Upload Image")
question_input = gr.Textbox(label="Enter your question")

demo = gr.Interface(
    title="Multimodal RAG with LLaVA and FAISS",
    description=(
        "Upload an image and ask a question. "
        "The system retrieves relevant text using FAISS and answers using LLaVA."
    ),
    fn=multimodal_rag,
    inputs=[image_input, question_input],
    outputs="text",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()