rphrp1985 committed
Commit 9d23d35 · verified · 1 Parent(s): 3446fdb

Update app.py

Files changed (1)
  1. app.py +132 -66
app.py CHANGED
@@ -7,6 +7,16 @@ from huggingface_hub import InferenceClient
  import os
  import psutil
 
+ import json
+ import subprocess
+ from threading import Thread
+
+ import torch
+ import spaces
+ import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TextIteratorStreamer
+
+ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 
  """
@@ -49,48 +59,48 @@ import transformers
 
  # model_id = "mistralai/Mistral-7B-v0.3"
 
- model_id = "microsoft/Phi-3-medium-4k-instruct"
- # model_id = "microsoft/phi-4"
+ # model_id = "microsoft/Phi-3-medium-4k-instruct"
+ # # model_id = "microsoft/phi-4"
 
- # model_id = "Qwen/Qwen2-7B-Instruct"
+ # # model_id = "Qwen/Qwen2-7B-Instruct"
 
 
- tokenizer = AutoTokenizer.from_pretrained(
- # model_id
- model_id,
- # use_fast=False
- token= token,
- trust_remote_code=True)
+ # tokenizer = AutoTokenizer.from_pretrained(
+ # # model_id
+ # model_id,
+ # # use_fast=False
+ # token= token,
+ # trust_remote_code=True)
 
 
- accelerator = Accelerator()
+ # accelerator = Accelerator()
 
- model = AutoModelForCausalLM.from_pretrained(model_id, token= token,
- # torch_dtype= torch.uint8,
- torch_dtype=torch.bfloat16,
- # load_in_8bit=True,
- # # # torch_dtype=torch.fl,
- attn_implementation="flash_attention_2",
- low_cpu_mem_usage=True,
- trust_remote_code=True,
- device_map='cuda',
- # device_map=accelerator.device_map,
+ # model = AutoModelForCausalLM.from_pretrained(model_id, token= token,
+ # # torch_dtype= torch.uint8,
+ # torch_dtype=torch.bfloat16,
+ # # load_in_8bit=True,
+ # # # # torch_dtype=torch.fl,
+ # attn_implementation="flash_attention_2",
+ # low_cpu_mem_usage=True,
+ # trust_remote_code=True,
+ # device_map='cuda',
+ # # device_map=accelerator.device_map,
 
- )
+ # )
 
 
 
 
 
- #
- model = accelerator.prepare(model)
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+ # #
+ # model = accelerator.prepare(model)
+ # from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
- pipe = pipeline(
- "text-generation",
- model=model,
- tokenizer=tokenizer,
- )
+ # pipe = pipeline(
+ # "text-generation",
+ # model=model,
+ # tokenizer=tokenizer,
+ # )
 
 
 
@@ -109,6 +119,27 @@ pipe = pipeline(
  # model = load_checkpoint_and_dispatch(model, model_id, device_map=device_map, no_split_module_classes=["GPTJBlock"])
  # model.half()
 
+ MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B"
+ CHAT_TEMPLATE = "Auto"
+ MODEL_NAME = MODEL_ID.split("/")[-1]
+ CONTEXT_LENGTH = 16000
+
+
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.bfloat16
+ )
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+ model = AutoModelForCausalLM.from_pretrained(
+     MODEL_ID,
+     device_map="auto",
+     quantization_config=quantization_config,
+     attn_implementation="flash_attention_2",
+ )
+
+
+
  import json
 
  def str_to_json(str_obj):
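
The hunk above replaces the previous model setup with deepseek-ai/DeepSeek-R1-Distill-Qwen-14B loaded through bitsandbytes 4-bit quantization, setting only load_in_4bit and the bfloat16 compute dtype. For comparison, a sketch of the fuller 4-bit configuration commonly used with BitsAndBytesConfig; the nf4 and double-quantization settings are assumptions, not part of this commit:

    import torch
    from transformers import BitsAndBytesConfig

    # NF4 4-bit weights with nested quantization of the quantization constants;
    # compute still runs in bfloat16, as in the commit.
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
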
@@ -123,48 +154,83 @@ def respond(
  system_message,
  max_tokens,
  temperature,
- top_p,
- ):
+ top_p):
+
+     stop_tokens = ["<|endoftext|>", "<|im_end|>"]
+     instruction = '<|im_start|>system\n' + system_message + '\n<|im_end|>\n'
+     for user, assistant in history:
+         instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
+     instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
+
+     print(instruction)
+
+     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+     enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
+     input_ids, attention_mask = enc.input_ids, enc.attention_mask
+
+     if input_ids.shape[1] > CONTEXT_LENGTH:
+         input_ids = input_ids[:, -CONTEXT_LENGTH:]
+         attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
+
+     generate_kwargs = dict(
+         input_ids=input_ids.to(device),
+         attention_mask=attention_mask.to(device),
+         streamer=streamer,
+         do_sample=True,
+         temperature=temperature,
+         max_new_tokens=max_new_tokens,
+         top_k=top_k,
+         repetition_penalty=repetition_penalty,
+         top_p=top_p
+     )
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+     outputs = []
+     for new_token in streamer:
+         outputs.append(new_token)
+         if new_token in stop_tokens:
+             break
+         yield "".join(outputs)
  # yield 'retuend'
  # model.to(accelerator.device)
 
- messages = []
- json_obj = str_to_json(message)
- print(json_obj)
+ # messages = []
+ # json_obj = str_to_json(message)
+ # print(json_obj)
 
- messages= json_obj
-
- # input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(accelerator.device)
- # input_ids2 = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, return_tensors="pt") #.to('cuda')
- # print(f"Converted input_ids dtype: {input_ids.dtype}")
- # input_str= str(input_ids2)
- # print('input str = ', input_str)
-
- generation_args = {
- "max_new_tokens": max_tokens,
- "return_full_text": False,
- "temperature": temperature,
- "do_sample": False,
- }
-
- output = pipe(messages, **generation_args)
- print(output[0]['generated_text'])
- gen_text=output[0]['generated_text']
-
- # with torch.no_grad():
- # gen_tokens = model.generate(
- # input_ids,
- # max_new_tokens=max_tokens,
- # # do_sample=True,
- # temperature=temperature,
- # )
-
- # gen_text = tokenizer.decode(gen_tokens[0])
- # print(gen_text)
- # gen_text= gen_text.replace(input_str,'')
- # gen_text= gen_text.replace('<|im_end|>','')
+ # messages= json_obj
+ #
+ # # input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(accelerator.device)
+ # # input_ids2 = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, return_tensors="pt") #.to('cuda')
+ # # print(f"Converted input_ids dtype: {input_ids.dtype}")
+ # # input_str= str(input_ids2)
+ # # print('input str = ', input_str)
+
+ # generation_args = {
+ # "max_new_tokens": max_tokens,
+ # "return_full_text": False,
+ # "temperature": temperature,
+ # "do_sample": False,
+ # }
+
+ # output = pipe(messages, **generation_args)
+ # print(output[0]['generated_text'])
+ # gen_text=output[0]['generated_text']
+
+ # # with torch.no_grad():
+ # # gen_tokens = model.generate(
+ # # input_ids,
+ # # max_new_tokens=max_tokens,
+ # # # do_sample=True,
+ # # temperature=temperature,
+ # # )
+
+ # # gen_text = tokenizer.decode(gen_tokens[0])
+ # # print(gen_text)
+ # # gen_text= gen_text.replace(input_str,'')
+ # # gen_text= gen_text.replace('<|im_end|>','')
 
- yield gen_text
+ # yield gen_text
 
 
  # messages = [
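
As committed, the new respond body passes max_new_tokens, top_k and repetition_penalty to model.generate, but none of these names are bound in the visible signature (it exposes only max_tokens, temperature and top_p), so the call would raise a NameError. A minimal sketch of the same threaded-streaming pattern using only names that are in scope; it assumes the module-level model, tokenizer and device defined above, and that message and history lead the signature as in a typical Gradio chat callback:

    from threading import Thread
    from transformers import TextIteratorStreamer

    def respond(message, history, system_message, max_tokens, temperature, top_p):
        # Build the ChatML-style prompt the same way the commit does.
        instruction = '<|im_start|>system\n' + system_message + '\n<|im_end|>\n'
        for user, assistant in history:
            instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
        instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'

        enc = tokenizer(instruction, return_tensors="pt")
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        generate_kwargs = dict(
            input_ids=enc.input_ids.to(device),
            attention_mask=enc.attention_mask.to(device),
            streamer=streamer,
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_tokens,  # the signature exposes max_tokens, not max_new_tokens
        )
        Thread(target=model.generate, kwargs=generate_kwargs).start()

        # Stream partial text back to the caller as tokens arrive.
        outputs = []
        for new_token in streamer:
            outputs.append(new_token)
            yield "".join(outputs)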
 
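The diff does not show how respond is attached to the Gradio UI. A minimal sketch of how a streaming generator like this is typically wired up in a Space; the component choices, labels and default values here are assumptions, not taken from app.py:

    import gradio as gr

    # gr.ChatInterface streams whatever the generator yields; the additional
    # inputs map onto the trailing parameters of respond().
    demo = gr.ChatInterface(
        respond,
        additional_inputs=[
            gr.Textbox(value="You are a helpful assistant.", label="System message"),
            gr.Slider(1, 4096, value=1024, step=1, label="Max new tokens"),
            gr.Slider(0.1, 2.0, value=0.7, step=0.1, label="Temperature"),
            gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
        ],
    )

    if __name__ == "__main__":
        demo.launch()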