Molchevsky committed
Commit e15ea41 · 1 Parent(s): 1c5fd3a

many updates

Files changed (2)
  1. app.py +74 -28
  2. requirements.txt +3 -2
app.py CHANGED
@@ -1,5 +1,7 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
+from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
+import torch
+from threading import Thread
 
 # Fixed system prompt (your "persona")
 SYSTEM_PROMPT = (
@@ -9,43 +11,90 @@ SYSTEM_PROMPT = (
     "developer tooling. You answer interview questions clearly, professionally, and naturally."
 )
 
+# Load model and tokenizer
+print("Loading model...")
+model_name = "Molchevsky/ai_resume_llama-3.2-3b"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    torch_dtype=torch.float16,
+    device_map="auto",
+    low_cpu_mem_usage=True,
+    trust_remote_code=True
+)
+print("Model loaded successfully!")
+
 def respond(
     message,
     history: list[dict[str, str]],
     max_tokens,
     temperature,
     top_p,
-    hf_token: gr.OAuthToken,
 ):
     """
-    Sends a chat request to your model hosted on Hugging Face.
+    Generate response using the local model.
     """
-    client = InferenceClient(
-        token=hf_token.token,
-        model="Molchevsky/ai_resume_llama-3.2-3b"  # <--- your model here
-    )
-
-    # Compose chat history with system prompt
+    # Build conversation with system prompt
     messages = [{"role": "system", "content": SYSTEM_PROMPT}]
     messages.extend(history)
     messages.append({"role": "user", "content": message})
-
+
+    # Apply chat template if available
+    try:
+        formatted_chat = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+    except Exception:
+        # Fallback formatting if chat template fails
+        formatted_chat = ""
+        for msg in messages:
+            if msg["role"] == "system":
+                formatted_chat += f"System: {msg['content']}\n\n"
+            elif msg["role"] == "user":
+                formatted_chat += f"User: {msg['content']}\n"
+            elif msg["role"] == "assistant":
+                formatted_chat += f"Assistant: {msg['content']}\n"
+        formatted_chat += "Assistant: "
+
+    # Tokenize
+    inputs = tokenizer(formatted_chat, return_tensors="pt").to(model.device)
+
+    # Set up streamer for real-time output
+    streamer = TextIteratorStreamer(
+        tokenizer,
+        timeout=60,
+        skip_prompt=True,
+        skip_special_tokens=True
+    )
+
+    # Generation parameters
+    generation_kwargs = {
+        **inputs,
+        "streamer": streamer,
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "do_sample": True,
+        "pad_token_id": tokenizer.eos_token_id,
+        "eos_token_id": tokenizer.eos_token_id,
+    }
+
+    # Start generation in separate thread
+    generation_thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    generation_thread.start()
+
+    # Stream the response
     response = ""
-    for msg in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        choices = msg.choices
-        token = ""
-        if len(choices) and choices[0].delta.content:
-            token = choices[0].delta.content
-        response += token
-        yield response
-
+    try:
+        for token in streamer:
+            response += token
+            yield response
+    except Exception as e:
+        yield f"Error generating response: {str(e)}"
 
+# Create the chat interface
 chatbot = gr.ChatInterface(
     respond,
     type="messages",
@@ -63,10 +112,7 @@ chatbot = gr.ChatInterface(
 )
 
 with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.LoginButton()
     chatbot.render()
 
-
 if __name__ == "__main__":
     demo.launch(debug=True)
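The core of the change is the threaded-streaming pattern: model.generate() blocks until completion, so it runs on a background thread while TextIteratorStreamer hands decoded text to the main thread as it is produced. A minimal standalone sketch of that pattern (sshleifer/tiny-gpt2 is an assumed stand-in checkpoint for quick testing, not the model this Space serves):

# Standalone sketch of the streaming pattern app.py now uses.
# "sshleifer/tiny-gpt2" is an arbitrary small stand-in model for testing.
from threading import Thread

from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

inputs = tokenizer("Tell me about your Python experience.", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until done, so it runs on its own thread; the streamer
# is the queue it feeds as tokens come off the model.
Thread(target=model.generate, kwargs={**inputs, "streamer": streamer, "max_new_tokens": 32}).start()

for chunk in streamer:
    print(chunk, end="", flush=True)  # arrives incrementally, like the yield loop in respond()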
requirements.txt CHANGED
@@ -1,4 +1,5 @@
 gradio
-torch
 transformers
-accelerate
+torch
+accelerate
+safetensors
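None of the requirements are version-pinned, so the latest releases are assumed at build time: accelerate backs the device_map="auto" placement in app.py, and safetensors lets from_pretrained load .safetensors weight files. A quick import smoke test, as a sketch:

# Sanity check that the updated requirements resolve: app.py needs all of
# these importable before it can load the model with device_map="auto".
import accelerate
import gradio
import safetensors
import torch
import transformers

print("torch", torch.__version__, "| transformers", transformers.__version__)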