yongqiang committed
Commit: f4acc5b
1 Parent(s): faed000

Initial this repo
- README.md +109 -1
- assets/gen_out_img.jpg +0 -0
- embeds/codebook_entry_embedding.npy +3 -0
- embeds/codebook_entry_embedding.pt +3 -0
- embeds/gen_embed.npy +3 -0
- img_gen_onnx/gen_aligner.onnx +3 -0
- img_gen_onnx/gen_vision_model_decode_sim.onnx +3 -0
- img_gen_onnx/post_head.onnx +3 -0
- img_gen_onnx/post_norm.onnx +3 -0
- imgs/image.jpg +0 -0
- imgs/image.png +3 -0
- infer_axmodel_gen.py +276 -0
- infer_axmodel_und.py +228 -0
- janus_pro_1b_axmodel/llama_p640_l0_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l10_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l11_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l12_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l13_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l14_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l15_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l16_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l17_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l18_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l19_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l1_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l20_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l21_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l22_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l23_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l2_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l3_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l4_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l5_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l6_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l7_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l8_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_p640_l9_together.axmodel +3 -0
- janus_pro_1b_axmodel/llama_post.axmodel +3 -0
- janus_pro_1b_axmodel/model.embed_tokens.weight.npy +3 -0
- janus_pro_1b_tokenizer/config.json +66 -0
- janus_pro_1b_tokenizer/preprocessor_config.json +23 -0
- janus_pro_1b_tokenizer/processor_config.json +9 -0
- janus_pro_1b_tokenizer/special_tokens_map.json +16 -0
- janus_pro_1b_tokenizer/tokenizer.json +0 -0
- janus_pro_1b_tokenizer/tokenizer_config.json +10 -0
- vit_axmodel/janus_warp_vit.axmodel +3 -0
README.md CHANGED
@@ -9,4 +9,112 @@ pipeline_tag: visual-question-answering
tags:
- DeepSeek
- Janus-Pro-1B
---

# Janus-Pro-1B-Int8

This version of Janus-Pro-1B has been converted to run on the Axera NPU using **w8a16** quantization.
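For context, w8a16 means the linear-layer weights are stored as 8-bit integers (with per-channel scales) while activations stay in 16-bit floating point. Below is a minimal NumPy sketch of the idea; the function names and scale layout are illustrative assumptions, not the Pulsar2 on-disk format.

```python
import numpy as np

def quantize_w8(w_fp32: np.ndarray):
    """Symmetric per-output-channel int8 quantization of a weight matrix (illustrative)."""
    scale = np.abs(w_fp32).max(axis=1, keepdims=True) / 127.0      # one scale per output channel
    w_int8 = np.clip(np.round(w_fp32 / scale), -127, 127).astype(np.int8)
    return w_int8, scale.astype(np.float16)

def linear_w8a16(x_fp16: np.ndarray, w_int8: np.ndarray, scale_fp16: np.ndarray):
    """y = x @ W^T with int8 weights dequantized on the fly; activations stay fp16."""
    w_fp16 = w_int8.astype(np.float16) * scale_fp16
    return x_fp16 @ w_fp16.T

w = np.random.randn(2048, 2048).astype(np.float32)   # toy weight matrix
x = np.random.randn(1, 2048).astype(np.float16)      # fp16 activation
w_q, s = quantize_w8(w)
print(linear_w8a16(x, w_q, s).shape)                 # (1, 2048)
```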
Compatible with Pulsar2 version: 3.3

## Convert tools links

If you are interested in model conversion, you can try to export the axmodel from the original repo:
https://huggingface.co/deepseek-ai/Janus-Pro-1B

[Pulsar2 Link, How to Convert LLM from Huggingface to axmodel](https://pulsar2-docs.readthedocs.io/en/latest/appendix/build_llm.html)

## Support Platform

- AX650
- AX650N DEMO Board
- [M4N-Dock(爱芯派Pro)](https://wiki.sipeed.com/hardware/zh/maixIV/m4ndock/m4ndock.html)

| Chip | Image encoder (384×384) | TTFT | w8a16 decode |
|--|--|--|--|
| AX650 | 142.682 ms | 4560.214 ms | 11.43 tokens/sec |

## How to use

Download all files from this repository to the device, for example with `huggingface_hub` as sketched below.
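A minimal sketch using the `huggingface_hub` Python client; the `repo_id` below is a placeholder for this repository's actual id on the Hub.

```python
# Hypothetical snippet: replace repo_id with this repository's actual id.
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="<owner>/Janus-Pro-1B",   # placeholder id
    local_dir="Janus-Pro-1B",         # directory to copy onto the AX650 host
)
```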
**If you are using an AX650 board**

```
root@ax650:/mnt/qtang/llm-test/temp/Janus-Pro-1B# tree -L 1
.
|-- README.md
|-- assets
|-- embeds
|-- img_gen_onnx
|-- imgs
|-- infer_axmodel_gen.py
|-- infer_axmodel_und.py
|-- janus_pro_1b_axmodel
|-- janus_pro_1b_tokenizer
`-- vit_axmodel
```

#### Install janus

```bash
$ git clone https://github.com/deepseek-ai/Janus
$ cd Janus
$ pip3 install -e .
```
#### Inference on an AX650 host, such as the M4N-Dock(爱芯派Pro) or the AX650N DEMO Board

**Multimodal Understanding**

input text:

```
Describe the picture
```

input image:

![](./imgs/image.png)

log information:

```bash
root@ax650 ~/yongqiang/push_hugging_face/Janus-Pro-1B # python3 infer_axmodel_und.py --tokenizer_dir janus_pro_1b_tokenizer --axmodel_path janus_pro_1b_axmodel --vit_axmodel_path vit_axmodel/janus_warp_vit.axmodel -i ./imgs/image.png
[INFO] Available providers:  ['AxEngineExecutionProvider']
[INFO] Chip type: ChipType.MC50
[INFO] VNPU type: VNPUType.DISABLED
[INFO] Engine version: 2.11.0a
vit_output.shape is (1, 576, 2048), vit feature extract done!
Init InferenceSession: 100%|██████████| 24/24 [00:04<00:00,  4.94it/s]
model load done!
prefill done!
Decoder:  62%|██████▏   | 634/1024 [00:00<00:00, 2505.28it/s]
Decoder:  72%|███████▏  | 741/1024 [00:19<00:10, 27.69it/s]
hit eos!
Decoder:  74%|███████▍  | 762/1024 [00:23<00:08, 31.84it/s]
Janus Answers: The image depicts three astronauts standing in a lush, green forest. They are wearing traditional white space suits with various patches and equipment attached. The suits have a reflective visor on their helmets, and they appear to be in a relaxed pose, with one astronaut raising his arms and the others standing or crouching. The forest is dense with tall trees and dense foliage, creating a serene and somewhat mysterious atmosphere.
```
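Under the hood, `infer_axmodel_und.py` runs the SigLIP ViT axmodel once to get 576 image embeddings and splices them into the token-embedding sequence wherever the processor marked image positions. A self-contained toy sketch of that splice (shapes shrunk for illustration; the real run uses 576 tokens of dimension 2048):

```python
import numpy as np

# Toy setup: a 10-token prompt where 6 positions are image placeholders (id -1),
# and a "ViT" that produced 6 image embeddings of dimension 4.
hidden = 4
embeds = np.random.randn(100, hidden).astype(np.float32)        # token-embedding table
input_ids = np.array([[5, 8, -1, -1, -1, -1, -1, -1, 9, 3]])     # -1 marks image slots
images_seq_mask = input_ids < 0                                   # where to splice
images_embeds = np.random.randn(1, 6, hidden).astype(np.float32)  # "ViT" output

input_ids = input_ids.copy()
input_ids[input_ids < 0] = 0                                       # any valid row; it gets overwritten
inputs_embeds = np.take(embeds, input_ids[0], axis=0)[None, ...]   # look up text embeddings
inputs_embeds[images_seq_mask] = images_embeds.reshape(-1, hidden) # splice image features in
print(inputs_embeds.shape)                                         # (1, 10, 4)
```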

**Text-to-Image Generation**

input text:

```
"A close-up high-contrast photo of Sydney Opera House sitting next to Eiffel tower, under a blue night sky of roiling energy, exploding yellow stars, and radiating swirls of blue."
```

log information:

```bash
root@ax650 ~/yongqiang/push_hugging_face/Janus-Pro-1B # python3 infer_axmodel_gen.py --tokenizer_dir janus_pro_1b_tokenizer/ --axmodel_path janus_pro_1b_axmodel/
[INFO] Available providers:  ['AxEngineExecutionProvider']
Init InferenceSession:   0%|          | 0/24 [00:00<?, ?it/s]
[INFO] Chip type: ChipType.MC50
[INFO] VNPU type: VNPUType.DISABLED
[INFO] Engine version: 2.11.0a
Init InferenceSession: 100%|██████████| 24/24 [00:14<00:00,  1.68it/s]
2025-04-14 15:55:23.408 | INFO     | __main__:<module>:269 - model load done!
2025-04-14 15:55:33.104 | DEBUG    | __main__:generate:158 - prefill completed!
ImageToken:  18%|█▊        | 104/575 [00:39<02:58,  2.64it/s]
ImageToken:  45%|████▌     | 261/575 [01:39<01:58,  2.65it/s]
ImageToken:  73%|███████▎  | 419/575 [02:39<00:58,  2.66it/s]
ImageToken: 100%|██████████| 575/575 [03:38<00:00,  2.63it/s]
```
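Each of the 575 remaining image tokens is sampled with classifier-free guidance: `infer_axmodel_gen.py` runs a conditional and an unconditional copy of the prompt through the decoder and blends the two logit sets with `cfg_weight = 5` before sampling from the 16384-entry image-token codebook. A self-contained sketch of that blend-and-sample step (random logits stand in for the decoder output):

```python
import torch

cfg_weight, temperature = 5.0, 1.0
vocab = 16384                                   # image-token codebook size (gen_head_config)
logits = torch.randn(2, vocab)                  # row 0: conditional, row 1: unconditional

logit_cond, logit_uncond = logits[0::2, :], logits[1::2, :]
guided = logit_uncond + cfg_weight * (logit_cond - logit_uncond)   # CFG blend
probs = torch.softmax(guided / temperature, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)               # one image token id
print(next_token.item())
```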

output image:

![generated image](assets/gen_out_img.jpg)
assets/gen_out_img.jpg
ADDED
embeds/codebook_entry_embedding.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:97fc92031b689c685f3b36d7542eba5002cd937c63dcf33731601ef999c68613
size 524416
embeds/codebook_entry_embedding.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a67cea6583ef3da486fdfcd6cff62a2771795c7ab46b8f1000852be4f1a137c5
size 263473
embeds/gen_embed.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c70d799c8ab4c507b2916f304ba0f792e2dbf0a26100cb1242babe1f2e57d455
size 524416
img_gen_onnx/gen_aligner.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0642c360b65e5f41b1caf7637650e057b38a9aad40552a7669a76b3395653c5d
size 16860554
img_gen_onnx/gen_vision_model_decode_sim.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e27a17bc19df77059481b30582ca58e3a28bd66783fb9ca8c3022bf33e77f8bf
size 169913021
img_gen_onnx/post_head.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:92be45cb8d1c3ae5c19c906a71195f13a0755f16881a6769e8ae9b5ca85eaa8f
size 151070226
img_gen_onnx/post_norm.onnx
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:10899a40c25d7d1a879c0b9a7fe06255b5148d56d5965b6c5a8b8bb7d72feecf
size 9423
imgs/image.jpg
ADDED
imgs/image.png
ADDED
infer_axmodel_gen.py
ADDED
@@ -0,0 +1,276 @@
# REF: https://github.com/deepseek-ai/Janus
import numpy as np
import torch
from axengine import InferenceSession
from ml_dtypes import bfloat16
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForCausalLM
from tqdm import tqdm
from einops import rearrange
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.models.modeling_vlm import MultiModalityConfig
from janus.utils.io import load_pil_images
import os
import PIL.Image
from loguru import logger
import onnxruntime
import argparse


parser = argparse.ArgumentParser(description="Model configuration parameters")
parser.add_argument("--tokenizer_dir", type=str, default="Janus-Pro-1B",
                    help="Path to HuggingFace model")
parser.add_argument("--axmodel_path", type=str, default="janus_pro_1B_axmodel",
                    help="Path to save compiled axmodel of llama model")
args = parser.parse_args()


# base info
tokenizer_dir = args.tokenizer_dir
axmodel_path = args.axmodel_path

"""ONNX MODEL"""
gen_vision_model_decode = onnxruntime.InferenceSession("./img_gen_onnx/gen_vision_model_decode_sim.onnx", providers=["CPUExecutionProvider"])
gen_aligner = onnxruntime.InferenceSession("./img_gen_onnx/gen_aligner.onnx", providers=["CPUExecutionProvider"])
gen_head = onnxruntime.InferenceSession("./img_gen_onnx/post_head.onnx", providers=["CPUExecutionProvider"])
post_norm = onnxruntime.InferenceSession("./img_gen_onnx/post_norm.onnx", providers=["CPUExecutionProvider"])
"""ONNX MODEL"""

"""EMBEDINGs"""
embeds = np.load(f"{axmodel_path}/model.embed_tokens.weight.npy")
gen_embed = np.load("./embeds/gen_embed.npy")
codebook_entry_embedding = torch.load('./embeds/codebook_entry_embedding.pt', map_location=torch.device('cpu'))
"""EMBEDINGs"""


def prefill(
    cfg,
    prefill_decoder_sessins,
    vl_chat_processor: VLChatProcessor,
    prompt: str,
    temperature: float = 1,
    parallel_size: int = 1,
    cfg_weight: float = 5,
    image_token_num_per_image: int = 576,
):
    input_ids = vl_chat_processor.tokenizer.encode(prompt)
    input_ids = torch.LongTensor(input_ids)

    # even rows hold the conditional prompt, odd rows the CFG-unconditional (padded) prompt
    tokens = torch.zeros((parallel_size*2, len(input_ids)), dtype=torch.int)
    for i in range(parallel_size*2):
        tokens[i, :] = input_ids
        if i % 2 != 0:
            tokens[i, 1: -1] = vl_chat_processor.pad_id

    inputs_embeds = embeds[tokens.numpy()]
    batch, token_len, seq_dim = inputs_embeds.shape
    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int)
    prefill_len = 640
    token_ids = tokens

    ###################################################################
    lastN = 1023
    kv_dim = cfg.hidden_size // cfg.num_attention_heads * cfg.num_key_value_heads
    batch_k_caches = {}
    batch_v_caches = {}

    for bid in range(batch):
        batch_k_caches[bid] = [
            np.zeros((1, lastN, kv_dim), dtype=bfloat16)
            for _ in range(cfg.num_hidden_layers)
        ]
        batch_v_caches[bid] = [
            np.zeros((1, lastN, kv_dim), dtype=bfloat16)
            for _ in range(cfg.num_hidden_layers)
        ]
    ###################################################################
    mask = np.zeros((1, prefill_len, prefill_len)) - 65536
    for j in range(token_len):
        mask[:, j, :j + 1] = 0
    mask = mask.astype(bfloat16)

    indices = np.array(list(range(prefill_len)), np.uint32).reshape(
        (1, prefill_len)
    )
    indices[:, token_len:] = 0
    hidden_states = np.zeros((batch, token_len, cfg.hidden_size)).astype(bfloat16)

    for bid in range(batch):
        data = np.zeros((1, prefill_len, cfg.hidden_size)).astype(bfloat16)
        data[:, 0:token_len] = inputs_embeds[bid].astype(bfloat16)
        k_caches = batch_k_caches[bid]
        v_caches = batch_v_caches[bid]

        for i in range(cfg.num_hidden_layers):
            input_feed = {
                "K_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
                "V_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
                "indices": indices,
                "input": data,
                "mask": mask,
            }
            outputs = prefill_decoder_sessins[i].run(None, input_feed, shape_group=1)
            k_caches[i][:, :token_len, :] = outputs[0][:, :token_len, :]
            v_caches[i][:, :token_len, :] = outputs[1][:, :token_len, :]
            data[:, :token_len] = outputs[2][:, :token_len, :]

        ######## BATCH ###########
        hidden_states[bid] = data[:, :token_len]
        batch_k_caches[bid] = k_caches
        batch_v_caches[bid] = v_caches

    ################# NORM & GEN-HEAD ########################
    hidden_states = post_norm.run(["output"], {"input": hidden_states[:, -1:, :].astype(np.float32)})[0]
    logits = gen_head.run(["output"], {"input": hidden_states[:, -1, :]})[0]  # unlike the llama text head, this head is dedicated to image generation
    ############# POST & GET NEXT TOKEN #############
    logits = torch.from_numpy(logits)
    logit_cond = logits[0::2, :]
    logit_uncond = logits[1::2, :]
    logits = logit_uncond + cfg_weight * (logit_cond-logit_uncond)
    probs = torch.softmax(logits / temperature, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)
    generated_tokens[:, 0] = next_token.squeeze(dim=-1)
    next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
    ################## PREPARE_GEN_IMG_EMBEDS ##################
    gen_embed_res = np.take(gen_embed, next_token.numpy().tolist(), axis=0)
    img_embeds = gen_aligner.run(["output"], {"input": gen_embed_res})[0]
    inputs_embeds = np.expand_dims(img_embeds, axis=1)
    return inputs_embeds, token_ids, generated_tokens, batch_k_caches, batch_v_caches


@torch.inference_mode()
def generate(
    cfg,
    prefill_decoder_sessins,
    vl_chat_processor: VLChatProcessor,
    prompt: str,
    temperature: float = 1,
    parallel_size: int = 1,  # currently only a fixed value of 1 is supported
    cfg_weight: float = 5,
    image_token_num_per_image: int = 576,
    img_size: int = 384,
    patch_size: int = 16,
):
    inputs_embeds, token_ids, generated_tokens, batch_k_caches, batch_v_caches = prefill(
        cfg, prefill_decoder_sessins, vl_chat_processor,
        prompt, temperature, parallel_size, cfg_weight, image_token_num_per_image
    )

    logger.debug("prefill completed!")
    token_len = token_ids.shape[1]

    lastN = 1023

    batch = parallel_size * 2

    mask = np.zeros((1, 1, lastN + 1), dtype=np.float32).astype(bfloat16)
    mask[:, :, :lastN] -= 65536
    mask[:, :, :token_len] = 0

    for image_token_i in tqdm(range(1, image_token_num_per_image), desc="ImageToken"):

        # decode step for the next image token
        start_indice = image_token_i + token_len - 1
        indices = np.array([start_indice], np.uint32).reshape((1, 1))
        hidden_states = np.zeros((batch, 1, cfg.hidden_size)).astype(bfloat16)  # batch, 1, seq_dim
        assert (inputs_embeds[0] == inputs_embeds[1]).all()

        for bid in range(batch):
            k_caches = batch_k_caches[bid]
            v_caches = batch_v_caches[bid]
            data = inputs_embeds[:1, ...].astype(bfloat16)

            for i in range(cfg.num_hidden_layers):
                input_feed = {
                    "K_cache": k_caches[i],
                    "V_cache": v_caches[i],
                    "indices": indices,
                    "input": data,
                    "mask": mask,
                }

                outputs = prefill_decoder_sessins[i].run(None, input_feed, shape_group=0)
                k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
                v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
                data = outputs[2]

            hidden_states[bid] = data
            batch_k_caches[bid] = k_caches
            batch_v_caches[bid] = v_caches

        mask[..., start_indice] = 0

        ############### NORM & GEN_HEAD #######################
        hidden_states = post_norm.run(["output"], {"input": hidden_states.astype(np.float32)})[0]
        logits = gen_head.run(["output"], {"input": hidden_states[:, -1, :]})[0]
        ############# POST & GET NEXT TOKEN #############
        logits = torch.from_numpy(logits)
        logit_cond = logits[0::2, :]
        logit_uncond = logits[1::2, :]
        logits = logit_uncond + cfg_weight * (logit_cond-logit_uncond)
        probs = torch.softmax(logits / temperature, dim=-1)
        next_token = torch.multinomial(probs, num_samples=1)
        generated_tokens[:, image_token_i] = next_token.squeeze(dim=-1)
        next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
        ################## PREPARE_GEN_IMG_EMBEDS ##################
        gen_embed_res = np.take(gen_embed, next_token.numpy().tolist(), axis=0)
        img_embeds = gen_aligner.run(["output"], {"input": gen_embed_res})[0]
        inputs_embeds = np.expand_dims(img_embeds, axis=1)

    # look up codebook entries for the generated tokens and decode them to an image
    indices = generated_tokens.to(dtype=torch.int)
    shape = [parallel_size, 8, img_size//patch_size, img_size//patch_size]
    z_q = codebook_entry_embedding[indices]  # (b*h*w, c)
    z_q = z_q.reshape(shape[0], shape[2], shape[3], shape[1])
    # reshape back to match original input shape
    z_q = z_q.permute(0, 3, 1, 2)
    dec = gen_vision_model_decode.run(['image'], {'quant': z_q.to(dtype=torch.float32).numpy()})[0]
    dec = dec.transpose(0, 2, 3, 1)
    dec = np.clip((dec + 1) / 2 * 255, 0, 255)
    visual_img = np.zeros((parallel_size, img_size, img_size, 3), dtype=np.uint8)
    visual_img[:, :, :] = dec

    os.makedirs('generated_samples', exist_ok=True)
    for i in range(parallel_size):
        save_path = os.path.join('generated_samples', "img_{}.jpg".format(i))
        PIL.Image.fromarray(visual_img[i]).save(save_path)

###################################################################
config: MultiModalityConfig = AutoConfig.from_pretrained(tokenizer_dir, trust_remote_code=True)
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(tokenizer_dir)
tokenizer = vl_chat_processor.tokenizer

description = "A close-up high-contrast photo of Sydney Opera House sitting next to Eiffel tower, under a blue night sky of roiling energy, exploding yellow stars, and radiating swirls of blue."

conversation = [
    {
        "role": "User",
        "content": description,
    },
    {"role": "Assistant", "content": ""},
]

sft_format = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(
    conversations=conversation,
    sft_format=vl_chat_processor.sft_format,
    system_prompt="",
)
prompt = sft_format + vl_chat_processor.image_start_tag
###################################################################

cfg = config.language_config

prefill_decoder_sessins = []
for i in tqdm(range(cfg.num_hidden_layers), desc="Init InferenceSession"):
    session = InferenceSession(
        f"{axmodel_path}/llama_p640_l{i}_together.axmodel"
    )
    prefill_decoder_sessins.append(session)

logger.info("model load done!")

generate(
    cfg,
    prefill_decoder_sessins,
    vl_chat_processor,
    prompt
)
infer_axmodel_und.py
ADDED
@@ -0,0 +1,228 @@
# REF: https://github.com/deepseek-ai/Janus
import numpy as np
import torch
from axengine import InferenceSession
from ml_dtypes import bfloat16
from transformers import AutoModel, AutoTokenizer, AutoConfig, AutoModelForCausalLM
from tqdm import tqdm
from einops import rearrange
from janus.models import MultiModalityCausalLM, VLChatProcessor
from janus.models.modeling_vlm import MultiModalityConfig
from janus.utils.io import load_pil_images
import argparse
import os


parser = argparse.ArgumentParser(description="Model configuration parameters")
parser.add_argument("--tokenizer_dir", type=str, default="Janus-Pro-1B",
                    help="Path to HuggingFace model")
parser.add_argument("--axmodel_path", type=str, default="janus_pro_1B_axmodel",
                    help="Path to save compiled axmodel of llama model")
parser.add_argument("-i", "--test_img_path", type=str, default="./imgs/image.png",
                    help="Test image path (supports png/jpg formats)")
parser.add_argument("--vit_axmodel_path", type=str, default="vit_axmodel/janus_warp_vit.axmodel",
                    help="Path to ViT model's axmodel")

args = parser.parse_args()

# base info
tokenizer_dir = args.tokenizer_dir
axmodel_path = args.axmodel_path
test_img_path = args.test_img_path
vit_axmodel_path = args.vit_axmodel_path
embeds = np.load(os.path.join(args.axmodel_path, "model.embed_tokens.weight.npy"))


def prepare_inputs_embeds(
    input_ids: torch.LongTensor,
    pixel_values: torch.FloatTensor,
    images_seq_mask: torch.LongTensor,
    images_emb_mask: torch.LongTensor,
    **kwargs,
):
    """
    Args:
        input_ids (torch.LongTensor): [b, T]
        pixel_values (torch.FloatTensor): [b, n_images, 3, h, w]
        images_seq_mask (torch.BoolTensor): [b, T]
        images_emb_mask (torch.BoolTensor): [b, n_images, n_image_tokens]

        assert torch.sum(images_seq_mask) == torch.sum(images_emb_mask)

    Returns:
        input_embeds (torch.Tensor): [b, T, D]
    """
    bs, n = pixel_values.shape[0:2]
    images = rearrange(pixel_values, "b n c h w -> (b n) c h w")
    # [b x n, T2, D]
    vit_session = InferenceSession(vit_axmodel_path)
    images_embeds = vit_session.run(None, {"image": pixel_values[0].numpy()})[0]  # pixel_values: [1, 1, 3, 384, 384]
    print(f"vit_output.shape is {images_embeds.shape}, vit feature extract done!")

    # [b x n, T2, D] -> [b, n x T2, D]
    images_embeds = rearrange(images_embeds, "(b n) t d -> b (n t) d", b=bs, n=n)
    # [b, n, T2] -> [b, n x T2]
    images_emb_mask = rearrange(images_emb_mask, "b n t -> b (n t)")

    # [b, T, D]
    input_ids[input_ids < 0] = 0  # ignore the image embeddings
    inputs_embeds = np.take(embeds, input_ids[0].cpu().numpy().tolist(), axis=0)[None, ...]
    inputs_embeds[images_seq_mask] = images_embeds[images_emb_mask]

    return inputs_embeds

def post_process(data, topk=1, topp=0.9, temperature=0.6):
    def top_p(l: np.ndarray, p: float) -> np.ndarray:
        index = np.argsort(l)
        res = l.copy()
        sum_p = 0
        for i in index[::-1]:
            if sum_p >= p:
                res[i] = 0
            sum_p += res[i]
        return res / sum_p

    def softmax(l: np.ndarray) -> np.ndarray:
        l_max = l - l.max()
        l_exp = np.exp(l_max)
        res = l_exp / np.sum(l_exp)
        return res.astype(np.float64)

    r = data.astype(np.float32)
    r = r.flatten()
    candidate_index = np.argpartition(r, -topk)[-topk:]
    candidate_value = r[candidate_index]
    candidate_value /= temperature
    candidate_soft = softmax(candidate_value)
    candidate_soft = top_p(candidate_soft, topp)
    candidate_soft = candidate_soft.astype(np.float64) / candidate_soft.sum()
    pos = np.random.multinomial(1, candidate_soft).argmax()
    next_token = candidate_index[pos]
    return next_token, candidate_index, candidate_soft

config: MultiModalityConfig = AutoConfig.from_pretrained(tokenizer_dir, trust_remote_code=True)
vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(tokenizer_dir)
tokenizer = vl_chat_processor.tokenizer

# question = "请尝试理解这幅图中的内容."  # alternative Chinese prompt: "Please try to understand the content of this picture."
question = "Please describe the picture."
conversation = [
    {
        "role": "User",
        "content": f"<image_placeholder>\n{question}",
        "images": [test_img_path],
    },
    {"role": "Assistant", "content": ""},
]

# load images and prepare for inputs
pil_images = load_pil_images(conversation)
prepare_inputs = vl_chat_processor(
    conversations=conversation, images=pil_images, force_batchify=True
)

input_embedding = prepare_inputs_embeds(**prepare_inputs)
token_ids = prepare_inputs['input_ids'].squeeze().numpy().tolist()
prefill_data = input_embedding
prefill_data = prefill_data.astype(bfloat16)
token_len = len(token_ids)

lastN = 1023
cfg = config.language_config

kv_dim = cfg.hidden_size // cfg.num_attention_heads * cfg.num_key_value_heads
k_caches = [
    np.zeros((1, lastN, kv_dim), dtype=bfloat16)
    for _ in range(cfg.num_hidden_layers)
]
v_caches = [
    np.zeros((1, lastN, kv_dim), dtype=bfloat16)
    for _ in range(cfg.num_hidden_layers)
]

prefill_decoder_sessins = []
for i in tqdm(range(cfg.num_hidden_layers), desc="Init InferenceSession"):
    session = InferenceSession(
        f"{axmodel_path}/llama_p640_l{i}_together.axmodel"
    )
    prefill_decoder_sessins.append(session)
post_process_session = InferenceSession(
    f"{axmodel_path}/llama_post.axmodel"
)
print("model load done!")

"""
prefill
"""
prefill_len = 640

if prefill_len > 0:
    indices = np.array(list(range(prefill_len)), np.uint32).reshape(
        (1, prefill_len)
    )
    indices[:, token_len:] = 0
    mask = np.zeros((1, prefill_len, prefill_len)) - 65536
    data = np.zeros((1, prefill_len, cfg.hidden_size)).astype(bfloat16)
    data[:, 0:token_len] = prefill_data
    for i, t in enumerate(token_ids):
        mask[:, i, : i + 1] = 0
    mask = mask.astype(bfloat16)
    for i in range(cfg.num_hidden_layers):
        input_feed = {
            "K_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
            "V_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16),
            "indices": indices,
            "input": data,
            "mask": mask,
        }
        outputs = prefill_decoder_sessins[i].run(None, input_feed, shape_group=1)
        k_caches[i][:, :token_len, :] = outputs[0][:, :token_len, :]
        v_caches[i][:, :token_len, :] = outputs[1][:, :token_len, :]
        data[:, :token_len] = outputs[2][:, :token_len, :]

    post_out = post_process_session.run(None, {"input": data[:, token_len - 1, :][None, ...]})[0]
    next_token, posssible_tokens, possible_soft = post_process(post_out, topk=1)
    posibles = [tokenizer.decode([t]) for t in posssible_tokens]
    posible_soft = [str((t, s)) for t, s in zip(posibles, possible_soft)]
    token_ids.append(next_token)
    print("prefill done!")

"""
decode
"""
mask = np.zeros((1, 1, lastN + 1), dtype=np.float32).astype(bfloat16)
mask[:, :, :lastN] -= 65536
mask[:, :, :token_len] = 0
for start_indice in tqdm(range(lastN + 1), desc="Decoder"):  # lastN + 1
    if prefill_len > 0 and start_indice < token_len:
        continue
    next_token = token_ids[start_indice]
    indices = np.array([start_indice], np.uint32).reshape((1, 1))
    data = embeds[next_token, :].reshape((1, 1, cfg.hidden_size)).astype(bfloat16)

    for i in range(cfg.num_hidden_layers):
        input_feed = {
            "K_cache": k_caches[i],
            "V_cache": v_caches[i],
            "indices": indices,
            "input": data,
            "mask": mask,
        }
        outputs = prefill_decoder_sessins[i].run(None, input_feed, shape_group=0)
        k_caches[i][:, start_indice, :] = outputs[0][:, :, :]
        v_caches[i][:, start_indice, :] = outputs[1][:, :, :]
        data = outputs[2]

    mask[..., start_indice] = 0
    if start_indice < token_len - 1:
        pass
    else:
        post_out = post_process_session.run(None, {"input": data})[0]
        next_token, posssible_tokens, possible_soft = post_process(post_out)
        token_ids.append(next_token)
        if next_token == tokenizer.eos_token_id:
            print("hit eos!")
            break
print("Janus Answers: ", tokenizer.decode(token_ids[token_len:], skip_special_tokens=True))
janus_pro_1b_axmodel/llama_p640_l0_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:36e476b67cc13f0fe6701b7d666e9e316ee03d38998ba633964f7f96e92b8db5
size 58843532
janus_pro_1b_axmodel/llama_p640_l10_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ddf79ff6a43ead47fda4308fd89e0468a3b19ad8e5f5a912247a9de160c34954
size 58844556
janus_pro_1b_axmodel/llama_p640_l11_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5f108e870863238890fb579c7bb991abd0a8b4f695ff2b5d483c6e16a2b0433c
size 58844684
janus_pro_1b_axmodel/llama_p640_l12_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5da5feb965e8fbb678a144ca26e5ff9d520d80c18823563b1cb382980bcabe1b
size 58844236
janus_pro_1b_axmodel/llama_p640_l13_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7358c891f13c998f87a1f1d85f3357fffebfac4d5bb67e15868a0a93113108a9
size 58844620
janus_pro_1b_axmodel/llama_p640_l14_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:19e2d24aa96773a866043bfefc1b815f04964c9d27b18637401de306d8bb5595
size 58844140
janus_pro_1b_axmodel/llama_p640_l15_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:80bf1befea66e3f42d9cd77a92b35cae27683f50d34becb7095ba0f035c55cb9
size 58844268
janus_pro_1b_axmodel/llama_p640_l16_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2283ea05dabba501779dc79ffbf5ce6e0ab18ad157a3aa2a3e488d888082b342
size 58844396
janus_pro_1b_axmodel/llama_p640_l17_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:723136b342e5197d2e508510c7f247cdae853211e9d8710438cf2fe09712ec1a
size 58844076
janus_pro_1b_axmodel/llama_p640_l18_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:660e661f19ccf22c91034ed4d3a1869c5963f098e4e6509193a1aca6fcb24401
size 58844300
janus_pro_1b_axmodel/llama_p640_l19_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:deb31364fa508c5526c70915c38f8ccb052cd84d6c79893bf46590b37cce25a2
size 58844364
janus_pro_1b_axmodel/llama_p640_l1_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3828219df2633babf673a3fbb20a5d8d8dde602dae5a5ed35a76349c0b7a2dac
size 58844460
janus_pro_1b_axmodel/llama_p640_l20_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:74dddcc432823b8257a52712f4e5cdb53391291b6b19e8f277c96550f8e118a7
size 58844236
janus_pro_1b_axmodel/llama_p640_l21_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:06961849a4c6a31fa8454abd61e80fed76e4c4a050cbd1b7d16c638c6599d529
size 58844620
janus_pro_1b_axmodel/llama_p640_l22_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b80d1205cd37f7ff88cf522385910b1332e4a6a9c4b1419e03099c12884e718c
size 58844108
janus_pro_1b_axmodel/llama_p640_l23_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0b8f8387fcd1a8030275555828e8335fe7de694776f847d95bb048f889b880bb
size 58843980
janus_pro_1b_axmodel/llama_p640_l2_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ff8bd9786537b7cf155ebd64459de2fc643a101a5469a071e0758604bb14f66
size 58844492
janus_pro_1b_axmodel/llama_p640_l3_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:94cf4e816de0f8f78a6ec18917302da650673f1a9e6a907d1cab3875e2eb15ab
size 58844556
janus_pro_1b_axmodel/llama_p640_l4_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:97c497026a610c17ca80da4f828ce71053ab71bdaadd356cc7ddbfb2a4ef5c03
size 58844108
janus_pro_1b_axmodel/llama_p640_l5_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:73e0ea653bf7410aab2a2e7e239cb57f71efdad47a78d1fab57e127f327de6fb
size 58844300
janus_pro_1b_axmodel/llama_p640_l6_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8d6d0cc26433000a91adccd97869916bfcebff975c94a59865b8e0343b0cfee0
size 58844460
janus_pro_1b_axmodel/llama_p640_l7_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:01061af690dcf356ae74c2b2b927c1b06ccfc6e594a9c67d7cb3fdba0aca2508
size 58843980
janus_pro_1b_axmodel/llama_p640_l8_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9ce71289afc108c8e2f304d764f14f9efa13ea6342bc64e0484aba78db25e64f
size 58844364
janus_pro_1b_axmodel/llama_p640_l9_together.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c3f4fa46650e6e2c88bc8a7cb0dd39b7fbd08652e99dac3452e437517788e69b
size 58844364
janus_pro_1b_axmodel/llama_post.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f8950aede1718e00a9f0489c90bf76a8639cd43781ae6c0b49978a3b7202513e
size 229046979
janus_pro_1b_axmodel/model.embed_tokens.weight.npy
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:282e7088dbdb59b03e948edd97d3768f9d5daecbb7e7cb690147ffca25948ce1
size 838860928
janus_pro_1b_tokenizer/config.json
ADDED
@@ -0,0 +1,66 @@
{
  "aligner_config": {
    "cls": "MlpProjector",
    "model_type": "aligner",
    "params": {
      "depth": 2,
      "input_dim": 1024,
      "n_embed": 2048,
      "projector_type": "mlp_gelu"
    }
  },
  "architectures": [
    "MultiModalityCausalLM"
  ],
  "gen_aligner_config": {
    "cls": "MlpProjector",
    "model_type": "gen_aligner",
    "params": {
      "depth": 2,
      "input_dim": 8,
      "n_embed": 2048,
      "projector_type": "mlp_gelu"
    }
  },
  "gen_head_config": {
    "cls": "vision_head",
    "model_type": "gen_head",
    "params": {
      "image_token_embed": 2048,
      "image_token_size": 16384,
      "n_embed": 2048
    }
  },
  "gen_vision_config": {
    "cls": "VQ-16",
    "model_type": "gen_vision",
    "params": {
      "image_token_size": 16384,
      "n_embed": 8
    }
  },
  "language_config": {
    "hidden_size": 2048,
    "intermediate_size": 5632,
    "max_position_embeddings": 16384,
    "model_type": "llama",
    "num_attention_heads": 16,
    "num_hidden_layers": 24,
    "num_key_value_heads": 16,
    "torch_dtype": "bfloat16",
    "vocab_size": 102400
  },
  "model_type": "multi_modality",
  "torch_dtype": "bfloat16",
  "transformers_version": "4.33.1",
  "vision_config": {
    "cls": "CLIPVisionTower",
    "model_type": "vision",
    "params": {
      "image_size": 384,
      "model_name": "siglip_large_patch16_384",
      "select_feature": "same",
      "select_layer": -1
    }
  }
}
janus_pro_1b_tokenizer/preprocessor_config.json
ADDED
@@ -0,0 +1,23 @@
{
  "background_color": [
    127,
    127,
    127
  ],
  "do_normalize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "VLMImageProcessor",
  "image_size": 384,
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "min_size": 14,
  "processor_class": "VLChatProcessor",
  "rescale_factor": 0.00392156862745098
}
janus_pro_1b_tokenizer/processor_config.json
ADDED
@@ -0,0 +1,9 @@
{
  "add_special_token": false,
  "ignore_id": -100,
  "image_tag": "<image_placeholder>",
  "mask_prompt": true,
  "num_image_tokens": 576,
  "processor_class": "VLChatProcessor",
  "sft_format": "deepseek"
}
janus_pro_1b_tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,16 @@
{
  "additional_special_tokens": [
    "<image_placeholder>",
    "<patch_placeholder>",
    "<|ref|>",
    "<|/ref|>",
    "<|det|>",
    "<|/det|>",
    "<|grounding|>",
    "<|User|>",
    "<|Assistant|>"
  ],
  "bos_token": "<|begin▁of▁sentence|>",
  "eos_token": "<|end▁of▁sentence|>",
  "pad_token": "<|▁pad▁|>"
}
janus_pro_1b_tokenizer/tokenizer.json
ADDED
The diff for this file is too large to render.
janus_pro_1b_tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "bos_token": "<|begin▁of▁sentence|>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|end▁of▁sentence|>",
  "model_max_length": 16384,
  "pad_token": null,
  "tokenizer_class": "LlamaTokenizer",
  "unk_token": null,
  "use_default_system_prompt": true
}
vit_axmodel/janus_warp_vit.axmodel
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:986d4444e88e3fcece749430abff504868eba25690e3a08dcb9568f7ad5ea0ab
size 348623368