"""loading the models to be used by the Mulltimodal RAG system.""" import torch import gc from sentence_transformers import SentenceTransformer from transformers import AutoProcessor, Gemma3ForConditionalGeneration # from accelerate import disk_offload from utils import clear_gpu_cache device = "cuda" if torch.cuda.is_available() else "cpu" # Embedding model embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2") # Gemma3 quantization config model_name = "google/gemma-3-4b-it" # Load Gemma3 model = Gemma3ForConditionalGeneration.from_pretrained( model_name, torch_dtype=torch.bfloat16, device_map="cpu", # Avoid meta errors ) # disk_offload(model=model, offload_dir="offload") model.to("cpu") model.eval() # Processor processor = AutoProcessor.from_pretrained(model_name, use_fast=True) clear_gpu_cache() gc.collect()