import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from retriever.vectordb import search_documents  # RAG retriever

model_name = "dasomaru/gemma-3-4bit-it-demo"

# The tokenizer can be loaded ahead of time on CPU.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the model on CPU first (no GPU has been allocated yet).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # the checkpoint is a 4-bit demo model
    trust_remote_code=True,
)


@spaces.GPU(duration=300)
def generate_response(query):
    # Move the already-loaded model onto the GPU once Spaces allocates one.
    # (Alternatively, the model could be reloaded here with device_map="auto"
    # so Accelerate places the weights automatically.)
    model.to("cuda")

    # 1. Retrieve supporting documents
    top_k = 5
    retrieved_docs = search_documents(query, top_k=top_k)

    # 2. Assemble the prompt (Korean: "You are an expert at setting questions
    # for the Licensed Real Estate Agent exam. The following are past exam
    # questions and related statutes. Answer the user's request based on them.")
    prompt = (
        "당신은 공인중개사 시험 문제 출제 전문가입니다.\n\n"
        "다음은 기출 문제 및 관련 법령 정보입니다:\n"
    )
    for doc in retrieved_docs:
        prompt += f"- {doc}\n"
    prompt += "\n이 정보를 참고하여 사용자의 요청에 답변해 주세요.\n\n"
    prompt += f"[질문]\n{query}\n\n[답변]\n"

    # 3. Generate the answer
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        top_k=50,
        do_sample=True,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


demo = gr.Interface(fn=generate_response, inputs="text", outputs="text")
demo.launch()
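

# ---------------------------------------------------------------------------
# Illustrative sketch only -- not part of this Space. The app above assumes
# that retriever/vectordb.py exposes search_documents(query, top_k) returning
# a list of text snippets. Below is one hypothetical way such an interface
# could be backed by sentence-transformers + FAISS; the encoder choice, the
# corpus loading, and build_vector_index are all assumptions made for
# illustration, not the Space's actual retriever.
#
# import faiss
# from sentence_transformers import SentenceTransformer
#
# _encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# _docs = [...]  # past-exam questions / statute snippets, loaded elsewhere
#
# def build_vector_index(docs):
#     # Embed the corpus once and store it in a flat inner-product index
#     # (cosine similarity, since the embeddings are L2-normalised).
#     embeddings = _encoder.encode(docs, normalize_embeddings=True)
#     index = faiss.IndexFlatIP(embeddings.shape[1])
#     index.add(embeddings)
#     return index
#
# _index = build_vector_index(_docs)
#
# def search_documents(query, top_k=5):
#     # Embed the query and return the top_k most similar corpus entries.
#     query_vec = _encoder.encode([query], normalize_embeddings=True)
#     _, ids = _index.search(query_vec, top_k)
#     return [_docs[i] for i in ids[0]]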