anabury committed
Commit 9a972c0 · verified · 1 Parent(s): 7939451

Update app.py

Files changed (1)
  1. app.py +27 -14
app.py CHANGED
```diff
@@ -3,46 +3,59 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
 from peft import PeftModel
 
-BASE_MODEL = "unsloth/phi-4-unsloth-bnb-4bit"  # base that you finetuned from
-ADAPTER_ID = "Anabury/My_Finetuned_Phi-4"  # your adapter repo
+# Your adapter (LoRA fine-tuned model on Hugging Face)
+ADAPTER_ID = "Anabury/My_Finetuned_Phi-4"
 
-# tokenizer (either base or adapter works; use base)
+# Detect device
+USE_GPU = torch.cuda.is_available()
+
+# Pick base model depending on device
+if USE_GPU:
+    BASE_MODEL = "unsloth/phi-4-unsloth-bnb-4bit"  # fast + quantized
+else:
+    BASE_MODEL = "unsloth/phi-4"  # full precision for CPU
+
+print(f"Loading base model: {BASE_MODEL} on {'GPU' if USE_GPU else 'CPU'}")
+
+# Load tokenizer
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
 
-# load base model (4-bit quant is fine on Spaces GPU/CPU)
+# Load base model
 base = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
-    device_map="auto",
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+    device_map="auto" if USE_GPU else None,
+    torch_dtype=torch.float16 if USE_GPU else torch.float32,
     trust_remote_code=True
 )
 
-# attach your LoRA adapter
+# Attach your LoRA adapter
 model = PeftModel.from_pretrained(base, ADAPTER_ID)
 model.eval()
 
+# Chat function
 def chat(message, history):
-    # build a simple prompt; adapt if you have a chat template in your repo
-    prompt = message
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    # simple prompt, you can swap in chat template later
+    inputs = tokenizer(message, return_tensors="pt").to(model.device)
     with torch.no_grad():
-        output = model.generate(
+        outputs = model.generate(
             **inputs,
             max_new_tokens=256,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
-            pad_token_id=tokenizer.eos_token_id
+            pad_token_id=tokenizer.eos_token_id,
         )
-    reply = tokenizer.decode(output[0], skip_special_tokens=True)
+    reply = tokenizer.decode(outputs[0], skip_special_tokens=True)
     history.append((message, reply))
     return history, history
 
+# Gradio UI
 with gr.Blocks() as demo:
-    gr.Markdown("# Phi-4 Chat (LoRA)")
+    gr.Markdown("# 🧠 Phi-4 Chatbot (Fine-tuned)")
     chatbot = gr.Chatbot(height=420)
     msg = gr.Textbox(placeholder="Ask me anything…")
     clear = gr.Button("Clear")
+
     msg.submit(chat, [msg, chatbot], [chatbot, chatbot])
     clear.click(lambda: [], None, chatbot, queue=False)
 
```
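The comment inside the new `chat` function says a chat template could be swapped in later. For reference, a minimal sketch of what that swap might look like, assuming the base model's tokenizer ships a chat template; `chat_with_template` and the `messages` list are illustrative names, not part of this commit:

```python
# Hypothetical variant of chat() that uses the tokenizer's chat template.
# Assumes tokenizer and model are already loaded as in app.py above.
def chat_with_template(message, history):
    # Rebuild the conversation in the role/content format chat templates
    # expect; history holds (user, assistant) tuples from gr.Chatbot.
    messages = []
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # apply_chat_template renders the conversation into the prompt format
    # the model was trained on and tokenizes it in one step.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
        )
    reply = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
    history.append((message, reply))
    return history, history
```

Rendering the running `history` into the prompt this way also gives the model conversational context, which the committed version discards between turns.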
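Separately, `model.generate` returns the prompt tokens followed by the continuation, so the committed `tokenizer.decode(outputs[0], ...)` echoes the user's message at the start of every reply. The usual fix is to decode only the tokens past the prompt; a sketch against the variables in `chat`:

```python
# Decode only the newly generated tokens, not the echoed prompt.
prompt_len = inputs["input_ids"].shape[-1]
reply = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
```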
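Finally, since the adapter is loaded for inference only, the LoRA weights can optionally be folded into the base model so generation runs without the PEFT wrapper. PEFT exposes this as `merge_and_unload()`; a sketch, with the caveat that merging into the bnb-4bit base may be unsupported or lossy depending on the PEFT version, so it mainly suits the full-precision CPU path:

```python
# Optional: merge the LoRA deltas into the base weights for inference.
# Assumes the full-precision base; merging into a 4-bit quantized model
# may be unsupported or lossy depending on the PEFT version.
model = PeftModel.from_pretrained(base, ADAPTER_ID)
model = model.merge_and_unload()
model.eval()
```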