geonmin-kim committed
Commit f0b3470 · verified · 1 Parent(s): e1ed14e

Upload folder using huggingface_hub
.DS_Store ADDED
Binary file (8.2 kB).
 
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/test_image.png filter=lfs diff=lfs merge=lfs -text
+ llm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+ vlm/decoder.onnx_data filter=lfs diff=lfs merge=lfs -text
+ vlm/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,312 @@
### LLM text generation Python example

```python
from transformers import AutoConfig, AutoTokenizer
import onnxruntime
import numpy as np

# 1. Load config, tokenizer, and model
path_to_model = "./llm"
config = AutoConfig.from_pretrained(path_to_model)
tokenizer = AutoTokenizer.from_pretrained(path_to_model)
decoder_session = onnxruntime.InferenceSession(f"{path_to_model}/model_q4f16.onnx")

## Set config values
num_key_value_heads = config.num_key_value_heads
head_dim = config.head_dim
num_hidden_layers = config.num_hidden_layers
eos_token_id = 106  # 106 is for <end_of_turn>

# 2. Prepare inputs
## Create input messages
messages = [
  { "role": "system", "content": "You are a helpful assistant." },
  { "role": "user", "content": "Write me a short poem about Machine Learning." },
]

## Apply tokenizer
inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="np")

## Prepare decoder inputs
batch_size = inputs['input_ids'].shape[0]
past_key_values = {
  f'past_key_values.{layer}.{kv}': np.zeros([batch_size, num_key_value_heads, 0, head_dim], dtype=np.float32)
  for layer in range(num_hidden_layers)
  for kv in ('key', 'value')
}
input_ids = inputs['input_ids']
position_ids = np.tile(np.arange(1, input_ids.shape[-1] + 1), (batch_size, 1))

# 3. Generation loop
max_new_tokens = 128
generated_tokens = np.array([[]], dtype=np.int64)
for i in range(max_new_tokens):
  logits, *present_key_values = decoder_session.run(None, dict(
      input_ids=input_ids,
      position_ids=position_ids,
      **past_key_values,
  ))

  ## Update values for next generation loop
  input_ids = logits[:, -1].argmax(-1, keepdims=True)
  position_ids = position_ids[:, -1:] + 1
  for j, key in enumerate(past_key_values):
    past_key_values[key] = present_key_values[j]

  generated_tokens = np.concatenate([generated_tokens, input_ids], axis=-1)
  if (input_ids == eos_token_id).all():
    break

  ## (Optional) Streaming
  print(tokenizer.decode(input_ids[0]), end='', flush=True)
print()

# 4. Output result
print(tokenizer.batch_decode(generated_tokens))
```
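The loop above picks each next token greedily with `argmax`. If you want more varied output, that single line can be swapped for a sampling step; the helper below is a minimal sketch (the `sample_next_token` name and the temperature/top-k values are illustrative choices, not part of this repository):

```python
import numpy as np

def sample_next_token(last_logits, temperature=0.7, top_k=50):
    """Illustrative helper: sample a token id from a [vocab_size] logit vector."""
    logits = last_logits / temperature
    top_indices = np.argpartition(-logits, top_k)[:top_k]  # unordered top-k candidates
    top_logits = logits[top_indices]
    probs = np.exp(top_logits - top_logits.max())
    probs /= probs.sum()
    return int(np.random.choice(top_indices, p=probs))

# In the generation loop, instead of
#     input_ids = logits[:, -1].argmax(-1, keepdims=True)
# you could write
#     next_id = sample_next_token(logits[0, -1])
#     input_ids = np.array([[next_id]], dtype=np.int64)
```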

### VLM text generation Python example
```python
import argparse
import requests
import onnxruntime
from transformers import AutoTokenizer
import numpy as np
import time
from PIL import Image

IMAGE_TOKEN_INDEX = 151646
MAX_GEN_LEN = 128
USE_SAMPLING = True

print("Loading inference sessions...")
load_start = time.time()

image_emb_session = onnxruntime.InferenceSession("vlm/vision_encoder.onnx")
text_emb_session = onnxruntime.InferenceSession("vlm/token_embedding_model.onnx")
decoding_session = onnxruntime.InferenceSession("vlm/decoder.onnx")

load_end = time.time()
print(f"Inference sessions are loaded. Loading takes {load_end - load_start:0.2f} sec")


def main(args):
    tokenizer = AutoTokenizer.from_pretrained("./vlm")
    tokenizer.add_tokens(["<image>"], special_tokens=True)

    query = args.input_text
    prompt = f"<|im_start|>user\n<image>\n{query}<|im_end|>\n<|im_start|>assistant\n"
    past_kv_values, first_token, input_token_len = prefill(args, tokenizer, prompt)

    decode(args, tokenizer, past_kv_values, first_token, input_token_len)


def process_image(image_path):
    # Load image from a URL or a local path
    if "https" in image_path:
        image = Image.open(requests.get(image_path, stream=True).raw)
    else:
        image = Image.open(image_path)

    # Preprocessing settings
    crop_size = (224, 224)
    do_center_crop = True
    do_convert_rgb = True
    do_normalize = True
    do_rescale = True
    do_resize = True
    image_mean = [0.48145466, 0.4578275, 0.40821073]
    image_std = [0.26862954, 0.26130258, 0.27577711]
    rescale_factor = 0.00392156862745098  # 1/255
    size = {"shortest_edge": 224}
    resample = Image.BICUBIC  # resample = 3

    # Convert to RGB
    if do_convert_rgb:
        image = image.convert("RGB")

    # Resize so the shortest edge becomes 224
    if do_resize:
        shortest_edge = min(image.size)
        scale_factor = size["shortest_edge"] / shortest_edge
        new_size = (int(image.width * scale_factor), int(image.height * scale_factor))
        image = image.resize(new_size, resample=resample)

    # Center crop to 224x224
    if do_center_crop:
        left = (image.width - crop_size[0]) / 2
        top = (image.height - crop_size[1]) / 2
        right = (image.width + crop_size[0]) / 2
        bottom = (image.height + crop_size[1]) / 2
        image = image.crop((left, top, right, bottom))

    # Convert to image array
    image_array = np.array(image).astype(np.float32)

    # Rescale (0-255 to 0-1)
    if do_rescale:
        image_array = image_array * rescale_factor

    # Normalize
    if do_normalize:
        image_array = (image_array - image_mean) / image_std

    # (H, W, C) -> (C, H, W)
    image_array = np.transpose(image_array, (2, 0, 1))

    # Add batch dim: (1, C, H, W)
    image_array = np.expand_dims(image_array, axis=0)

    return image_array.astype(np.float32)


def top_p_sampling(last_logits, top_p=0.99):
    # Keep the smallest set of highest-probability tokens whose cumulative
    # probability exceeds top_p, then sample the next token from that set.
    sorted_indices = np.argsort(-last_logits)
    sorted_logits = last_logits[sorted_indices]

    cumulative_probs = np.cumsum(np.exp(sorted_logits - np.max(sorted_logits)))
    cumulative_probs /= cumulative_probs[-1]

    cutoff_index = np.searchsorted(cumulative_probs, top_p, side="right")

    probs = np.exp(sorted_logits[: cutoff_index + 1] - np.max(sorted_logits[: cutoff_index + 1]))
    probs /= np.sum(probs)

    next_token = np.random.choice(sorted_indices[: cutoff_index + 1], p=probs)

    return next_token


# Prefill step
# Inputs
## input_ids: [1, seq_len]
## past_key_values: each layer needs key[1, 2, 0, kv_dim] and value[1, 2, 0, kv_dim] => 48 KV tensors in total
# Outputs
## logits: [1, seq_len, 151936]
## present: each layer returns key[1, 2, seq_len, kv_dim] and value[1, 2, seq_len, kv_dim] => 48 KV tensors in total
def prefill(args, tokenizer, input_prompt):
    print("Running prefill step...")
    prefill_start = time.time()

    input_ids = tokenizer(input_prompt)["input_ids"]
    image_token_pos = input_ids.index(IMAGE_TOKEN_INDEX)

    pixel_value = process_image(args.image_path)

    # Get image embedding & project it into the text embedding space
    image_emb_output = image_emb_session.run(None, {"pixel_values": pixel_value})
    image_features_proj = image_emb_output[0]

    # Get text embedding
    text_emb_output = text_emb_session.run(None, {"input_ids": np.array([input_ids], dtype=np.int64)})
    input_features = text_emb_output[0]

    # Split text embedding around the <image> placeholder
    pre_image_text_emb = input_features[:, :image_token_pos, :]
    post_image_text_emb = input_features[:, image_token_pos + 1 :, :]

    # Merge text embedding and image embedding
    hidden_states = np.concatenate((pre_image_text_emb, image_features_proj, post_image_text_emb), axis=1)
    input_token_len = hidden_states.shape[1]

    # Prepare inputs used in the prefill step, with empty dummy tensors for the initial past KV values
    prefill_input = {
        "/model/embed_tokens/Gather_output_0": hidden_states,
        "attention_mask": np.expand_dims(np.ones(input_token_len).astype(np.int64), axis=0),
        "position_ids": np.expand_dims(np.arange(input_token_len), axis=0),
    }
    for i in range(24):
        for entity in ("key", "value"):
            input_name = f"past_key_values.{i}.{entity}"
            prefill_input[input_name] = np.zeros((1, 2, 0, 64), dtype=np.float32)

    # Run prefill
    prefill_outputs = decoding_session.run(None, prefill_input)

    # Keep past KV values for the decode step
    past_kv_values = prefill_outputs[1:]

    # Get the first token (top-p sampling or greedy)
    if USE_SAMPLING:
        last_logits = prefill_outputs[0][0][-1]
        next_token = top_p_sampling(last_logits)
    else:
        next_token = prefill_outputs[0].argmax(-1)[0][-1]

    prefill_done = time.time()
    print(f"Prefill step done. Throughput: {input_token_len / (prefill_done - prefill_start):0.2f} token/sec")

    return past_kv_values, next_token, input_token_len


# Generation step
# Inputs
## input_ids: [1, 1]
## past_key_values: each layer needs key[1, 2, past_seq_len, kv_dim] and value[1, 2, past_seq_len, kv_dim] => 48 KV tensors in total
# Outputs
## logits: [1, 1, 151936]
## present: each layer returns key[1, 2, seq_len, kv_dim] and value[1, 2, seq_len, kv_dim] => 48 KV tensors in total
def decode(args, tokenizer, past_kv_values, first_token, input_token_len):
    print("Running decode step...", end="\n\n")
    decode_start = time.time()

    generated_ids = [first_token]
    next_token = first_token

    for step in range(MAX_GEN_LEN):
        embedding_output = text_emb_session.run(None, {"input_ids": np.array([[next_token]], dtype=np.int64)})

        # Get the new token's embedding
        hidden_states = embedding_output[0]

        # Prepare inputs for the decoding step
        decoding_input = {
            "/model/embed_tokens/Gather_output_0": hidden_states.astype(np.float32),
            "attention_mask": np.ones((1, 1), dtype=np.int64),
            "position_ids": np.array([[input_token_len]], dtype=np.int64),
        }
        input_token_len += 1
        for j in range(24):
            decoding_input[f"past_key_values.{j}.key"] = past_kv_values[2 * j].astype(np.float32)
            decoding_input[f"past_key_values.{j}.value"] = past_kv_values[2 * j + 1].astype(np.float32)

        # Run decoding
        decoding_outputs = decoding_session.run(None, decoding_input)

        # Save KV values for the next step
        past_kv_values = decoding_outputs[1:]

        # Get the next token (top-p sampling or greedy)
        last_logits = decoding_outputs[0][0][-1]

        if USE_SAMPLING:
            next_token = top_p_sampling(last_logits)
        else:
            next_token = decoding_outputs[0].argmax(-1)[0][-1]

        if next_token == tokenizer.eos_token_id:
            break

        # Save generated token
        generated_ids.append(next_token)

    decode_done = time.time()
    response = tokenizer.decode(generated_ids)
    with open(args.output_path, "w") as f:
        f.write(response)
    print(f"\nDecode step done. Throughput: {step / (decode_done - decode_start):0.2f} token/sec")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_text", type=str, help="Input query for inference", default="Where do you think this image is from?")
    parser.add_argument("--image_path", type=str, help="Local image path or image URL", default="assets/test_image.png")
    parser.add_argument("--output_path", type=str, help="Output path to save the response", required=True)
    args = parser.parse_args()

    main(args)
```
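The input names used above (`/model/embed_tokens/Gather_output_0`, `past_key_values.{i}.key` / `.value`) and the KV shapes in the comments are specific to this ONNX export. If a binding error comes up, the quickest check is to list what the decoder actually expects; a minimal sketch, assuming the session is created the same way as in the script:

```python
import onnxruntime

# Print the decoder's expected inputs and outputs (names, shapes, element types).
session = onnxruntime.InferenceSession("vlm/decoder.onnx")

for inp in session.get_inputs():
    print("input :", inp.name, inp.shape, inp.type)
for out in session.get_outputs():
    print("output:", out.name, out.shape, out.type)
```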
assets/.DS_Store ADDED
Binary file (6.15 kB).
 
assets/test_image.png ADDED

Git LFS Details

  • SHA256: 3f833095a0b77a705e0dc65ad155554d2b734e64a2c5be4b2017c3e28e1b21aa
  • Pointer size: 131 Bytes
  • Size of remote file: 894 kB
llm/.DS_Store ADDED
Binary file (6.15 kB).
 
llm/model_q4f16.onnx ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0a8cb5ab287f04050d29de31e47354f8868069c0dec8cab326376274a6a12508
size 997769309
llm/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
{
  "boi_token": "<start_of_image>",
  "bos_token": {
    "content": "<bos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eoi_token": "<end_of_image>",
  "eos_token": {
    "content": "<eos>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "image_token": "<image_soft_token>",
  "pad_token": {
    "content": "<pad>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
llm/tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
size 33384568
llm/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
vlm/.DS_Store ADDED
Binary file (6.15 kB).
 
vlm/added_tokens.json ADDED
@@ -0,0 +1,5 @@
{
  "<|endoftext|>": 151643,
  "<|im_end|>": 151645,
  "<|im_start|>": 151644
}
vlm/decoder.onnx ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:74fe27d6c2e5c3c0f6a94cb8e8e62dfae8ab59db5e4b468bad57686dec87fee3
size 1344441
vlm/decoder.onnx_data ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6ccdf2f606eb40209e3e3385eab6e60933356ede96ac03db234d98cb27bb7978
size 1991847936
vlm/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
vlm/requirements.txt ADDED
@@ -0,0 +1,4 @@
torch
transformers
pillow
requests
vlm/special_tokens_map.json ADDED
@@ -0,0 +1,20 @@
{
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>"
  ],
  "eos_token": {
    "content": "<|im_end|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
vlm/token_embedding_model.onnx ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9cdb44e5aacbd9e54986b200eab130a1345d3ddc919032476ff54e3de8e130f2
size 271751663
vlm/tokenizer.json ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bcfe42da0a4497e8b2b172c1f9f4ec423a46dc12907f4349c55025f670422ba9
size 11418266
vlm/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "151643": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151644": {
      "content": "<|im_start|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "151645": {
      "content": "<|im_end|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "additional_special_tokens": [
    "<|im_start|>",
    "<|im_end|>"
  ],
  "bos_token": null,
  "chat_template": "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|im_end|>",
  "errors": "replace",
  "model_max_length": 32768,
  "pad_token": "<|endoftext|>",
  "split_special_tokens": false,
  "tokenizer_class": "Qwen2Tokenizer",
  "unk_token": null
}
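The `chat_template` in this config produces the same `<|im_start|>…<|im_end|>` turn format that the VLM example above builds by hand. A minimal sketch of rendering it with the tokenizer shipped in `vlm/` (the message content here is only an example):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./vlm")
messages = [{"role": "user", "content": "<image>\nWhere do you think this image is from?"}]

# Render the template to text only (no tokenization), including the assistant prefix.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# Per the template above, this prepends a default system turn and ends with
# "<|im_start|>assistant\n", matching the prompt string used in the VLM example.
```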
vlm/vision_encoder.onnx ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5de39329bac62e7c7000f39c602369c1bec8bc1c496bbe50eec76bbefba6b5e4
size 321017807
vlm/vocab.json ADDED
The diff for this file is too large to render. See raw diff