diff --git a/README.md b/README.md index 1c59cee177b4fdae0c6a8a4c705026162d3951cb..524b75704fd6d4b38a127aa751fc147b410bcefc 100644 --- a/README.md +++ b/README.md @@ -72,5 +72,5 @@ ai@ai-bj ~/yongqiang/SmolVLM2-500M-Video-Instruct $ python3 infer_axmodel.py input prompt: Can you describe this image? -answer >> The image captures a close-up view of a pink flower, prominently featuring a bumblebee. The bumblebee, with its black and yellow stripes, is in the center of the frame, its body slightly tilted to the left. The flower, with its petals fully spread, is the main subject of the image. The background is blurred, drawing focus to the flower and the bumblebee. The blurred background suggests a garden or a field, providing a sense of depth to the image. The colors in the image are vibrant, with the pink of the flower contrasting against the green of the leaves and the brown of the stems. The image does not provide enough detail to confidently identify the specific location or landmark referred to as "sa_16743". -``` \ No newline at end of file +answer >> The image depicts a close-up view of a pink flower with a bee on it. The bee, which appears to be a bumblebee, is perched on the flower's center, which is surrounded by a cluster of other flowers. The bee is in the process of collecting nectar from the flower, which is a common behavior for bees. The flower itself has a yellow center with a cluster of yellow stamens surrounding it. The petals of the flower are a vibrant shade of pink, and the bee is positioned very close to the camera, making it the focal point of the image. The background of the image is slightly blurred, but it appears to be a garden or a field with other flowers and plants, contributing to the overall natural setting of the image. 
+``` diff --git a/infer_axmodel.py b/infer_axmodel.py index d823827de4fcacecf30e02cb2c0bd81db8c44e69..97288aabf14de3546dfa068c0bb9dbf9c5741192 100644 --- a/infer_axmodel.py +++ b/infer_axmodel.py @@ -9,16 +9,15 @@ from transformers import AutoConfig from typing import List, Tuple from axengine import InferenceSession from ml_dtypes import bfloat16 - - -device = "cuda" if torch.cuda.is_available() else "cpu" -embeddings = torch.load("./embeds/SmolVLMVisionEmbeddings.pkl", map_location=device, weights_only=False) -embeds = np.load(os.path.join("./smolvlm2_axmodel", "model.embed_tokens.weight.npy")) -# connector = torch.load("SmolVLMConnector.pkl", map_location=device, weights_only=False) -encoder = ort.InferenceSession(f'./vit_mdoel/vision_model.onnx', providers=["CPUExecutionProvider"]) +from utils.infer_func import InferManager +import argparse +from PIL import Image +from torchvision.transforms import Resize, ToTensor, Normalize, Compose +from transformers.image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD def run_vision_model( + encoder, pixel_values, patch_attention_mask=None, ): @@ -45,13 +44,15 @@ def run_vision_model( elif not self._use_flash_attention_2: patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) + # Save calibration data for quantizing the ViT encoder + # np.save("../model_convert/vit_encoder_calibrations/hidden_states_5.npy", hidden_states.detach().cpu().to(dtype=torch.float32).numpy()) encoder_outputs = encoder.run(None, {"input": hidden_states.detach().cpu().to(dtype=torch.float32).numpy()})[0] encoder_outputs = torch.from_numpy(encoder_outputs).to(device, dtype=hidden_states.dtype) return encoder_outputs -def get_image_features(pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None): +def get_image_features(encoder, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None): """ Encodes images into continuous embeddings that can be forwarded to the language model. 
@@ -90,7 +91,7 @@ def get_image_features(pixel_values: torch.FloatTensor, pixel_attention_mask: to patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() # Get sequence from the vision encoder - image_hidden_states = run_vision_model(pixel_values, patch_attention_mask) + image_hidden_states = run_vision_model(encoder, pixel_values, patch_attention_mask) # Modality projection & resampling # image_hidden_states = connector(image_hidden_states) # already fused into the onnx model @@ -132,51 +133,59 @@ def inputs_merger( return merged_embeds -def post_process(data, topk=1, topp=0.9, temperature=0.6): - def top_p(l: np.ndarray, p: float) -> np.ndarray: - index = np.argsort(l) - res = l.copy() - sum_p = 0 - for i in index[::-1]: - if sum_p >= p: - res[i] = 0 - sum_p += res[i] - return res / sum_p - - def softmax(l: np.ndarray) -> np.ndarray: - l_max = l - l.max() - l_exp = np.exp(l_max) - res = l_exp / np.sum(l_exp) - return res.astype(np.float64) - - r = data.astype(np.float32) - r = r.flatten() - candidate_index = np.argpartition(r, -topk)[-topk:] - candidate_value = r[candidate_index] - candidate_value /= temperature - candidate_soft = softmax(candidate_value) - candidate_soft = top_p(candidate_soft, topp) - candidate_soft = candidate_soft.astype(np.float64) / candidate_soft.sum() - pos = np.random.multinomial(1, candidate_soft).argmax() - next_token = candidate_index[pos] - return next_token, candidate_index, candidate_soft - - if __name__ == "__main__": - hf_model_path = "./smolvlm2_tokenizer/" - axmodel_path = "./smolvlm2_axmodel" - prompt = 'Can you describe this image?' + """ + python3 infer_axmodel.py -i ../assets/panda.jpg --vit_model ./vit-models/vision_model.axmodel + """ + + prompt = None + parser = argparse.ArgumentParser(description="Model configuration parameters") + parser.add_argument("--hf_model", type=str, default="./SmolVLM2-500M-Video-Instruct/", + help="Path to the HuggingFace model") + parser.add_argument("--axmodel_path", type=str, default="./SmolVLM2-500M-Video-Instruct_axmodel/", + help="Path to the compiled axmodel of the llama (LLM) decoder") + parser.add_argument("--vit_model", type=str, default='./vit-models/vision_model.axmodel', + help="Path to the compiled axmodel of the vision encoder (ViT)") + parser.add_argument("-i", "--images", type=str, default="../assets/bee.jpg", + help="Path to the test image.") + parser.add_argument("-q", "--question", type=str, default="Can you describe this image?", + help="The question you want to ask the model.") + args = parser.parse_args() + + hf_model_path = args.hf_model + axmodel_path = args.axmodel_path + images = args.images + prompt = args.question + + device = "cuda" if torch.cuda.is_available() else "cpu" + embeddings = torch.load("./embeds/SmolVLMVisionEmbeddings.pkl", map_location=device, weights_only=False) + embeds = np.load(os.path.join(axmodel_path, "model.embed_tokens.weight.npy")) + + encoder = InferenceSession(args.vit_model) processor = AutoProcessor.from_pretrained(hf_model_path) config = AutoConfig.from_pretrained(hf_model_path, trust_remote_code=True) tokenizer = processor.tokenizer + TARGET_IMAGE_SIZE = (512, 512) + image = Image.open(images).convert('RGB') + + # Fix the input image size to 512x512 + preprocess = Compose([ + Resize(TARGET_IMAGE_SIZE), + # ToTensor(), + # Normalize(mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD), + ]) + + preprocessed_image = preprocess(image) + messages = [ { "role": "user", "content": [ - {"type": "image", "url": "./assets/bee.jpg"}, + {"type": "image", "image": preprocessed_image}, # a PIL Image object can be used directly here + # 
{"type": "image", "url": images}, # 也可以使用 url {"type": "text", "text": prompt}, ] }, @@ -201,7 +210,7 @@ if __name__ == "__main__": """ miniforge-pypy3/envs/lerobot/lib/python3.10/site-packages/transformers/models/smolvlm/modeling_smolvlm.py(681)get_image_features() """ - image_hidden_states = get_image_features(pixel_values, pixel_attention_mask) + image_hidden_states = get_image_features(encoder, pixel_values, pixel_attention_mask) inputs_embeds = inputs_merger( input_ids=input_ids, @@ -213,104 +222,10 @@ if __name__ == "__main__": prefill_data = prefill_data.astype(bfloat16) token_ids = input_ids[0].cpu().numpy().tolist() token_len = len(token_ids) - - lastN = 2048 cfg = config.text_config - kv_dim = cfg.hidden_size // cfg.num_attention_heads * cfg.num_key_value_heads - k_caches = [ - np.zeros((1, lastN, kv_dim), dtype=bfloat16) - for _ in range(cfg.num_hidden_layers) - ] - v_caches = [ - np.zeros((1, lastN, kv_dim), dtype=bfloat16) - for _ in range(cfg.num_hidden_layers) - ] - - prefill_decoder_sessins = [] - for i in tqdm(range(cfg.num_hidden_layers), desc="Init InferenceSession"): - session = InferenceSession( - f"{axmodel_path}/llama_p1024_l{i}_together.axmodel" - ) - prefill_decoder_sessins.append(session) - post_process_session = InferenceSession( - f"{axmodel_path}/llama_post.axmodel" - ) - print("model load done!") + imer = InferManager(cfg, axmodel_path) - """ - prefill - """ - prefill_len = 1024 - - if prefill_len > 0: - indices = np.array(list(range(prefill_len)), np.uint32).reshape( - (1, prefill_len) - ) - indices[:, token_len:] = 0 - mask = np.zeros((1, prefill_len, prefill_len)) - 65536 - data = np.zeros((1, prefill_len, cfg.hidden_size)).astype(bfloat16) - data[:, 0:token_len] = prefill_data - for i, t in enumerate(token_ids): - mask[:, i, : i + 1] = 0 - mask = mask.astype(bfloat16) - for i in range(cfg.num_hidden_layers): - input_feed = { - "K_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16), - "V_cache": np.zeros((1, 1, cfg.hidden_size), dtype=bfloat16), - "indices": indices, - "input": data, - "mask": mask, - } - outputs = prefill_decoder_sessins[i].run(None, input_feed, shape_group=1) - k_caches[i][:, :token_len, :] = outputs[0][:, :token_len, :] - v_caches[i][:, :token_len, :] = outputs[1][:, :token_len, :] - data[:, :token_len] = outputs[2][:, :token_len, :] - - post_out = post_process_session.run(None, {"input": data[:, token_len - 1, :][None, ...]})[0] - next_token, posssible_tokens, possible_soft = post_process(post_out, topk=1) - posibles = [tokenizer.decode([t]) for t in posssible_tokens] - posible_soft = [str((t, s)) for t, s in zip(posibles, possible_soft)] - token_ids.append(next_token) - # print("prefill done!") - print(f"input prompt: {prompt}\n") - print("answer >>", tokenizer.decode(token_ids[token_len], skip_special_tokens=True), end='', flush=True) - - """ - decode - """ - mask = np.zeros((1, 1, lastN + 1), dtype=np.float32).astype(bfloat16) - mask[:, :, :lastN] -= 65536 - mask[:, :, :token_len] = 0 - for start_indice in range(lastN + 1): - if prefill_len > 0 and start_indice < token_len: - continue - next_token = token_ids[start_indice] - indices = np.array([start_indice], np.uint32).reshape((1, 1)) - data = embeds[next_token, :].reshape((1, 1, cfg.hidden_size)).astype(bfloat16) - - for i in range(cfg.num_hidden_layers): - input_feed = { - "K_cache": k_caches[i], - "V_cache": v_caches[i], - "indices": indices, - "input": data, - "mask": mask, - } - outputs = prefill_decoder_sessins[i].run(None, input_feed, shape_group=0) - k_caches[i][:, 
start_indice, :] = outputs[0][:, :, :] - v_caches[i][:, start_indice, :] = outputs[1][:, :, :] - data = outputs[2] - - mask[..., start_indice] = 0 - if start_indice < token_len - 1: - pass - else: - post_out = post_process_session.run(None, {"input": data})[0] - next_token, posssible_tokens, possible_soft = post_process(post_out) - token_ids.append(next_token) - print(tokenizer.decode(next_token, skip_special_tokens=True), end='', flush=True) - - if next_token == tokenizer.eos_token_id: - break + token_ids = imer.prefill(tokenizer, token_ids, prefill_data[0], slice_len=128) + imer.decode(tokenizer, token_ids, embeds, slice_len=128) print("\n") diff --git a/smolvlm2_axmodel/llama_p1024_l0_together.axmodel b/smolvlm2_axmodel/llama_p1024_l0_together.axmodel deleted file mode 100644 index bb7510e1f9979139277744b09d9d42a37446d29b..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l0_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:125ac7e80a94dbd3920fb0e0077ccad612abe8fabc2040dda09b19813ce96f68 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l10_together.axmodel b/smolvlm2_axmodel/llama_p1024_l10_together.axmodel deleted file mode 100644 index 50eef6df4dc313a017349e6ffc82afac85993021..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l10_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:12f5aa82a4dcc3a66aaad951b1ea87c50e618c93adade3a2d1a7b5614169f5a1 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l11_together.axmodel b/smolvlm2_axmodel/llama_p1024_l11_together.axmodel deleted file mode 100644 index 400aaa6ef1fdd6836105b1a8757e099c99f894b6..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l11_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba247ba036a831b6201b53a03bf9847e16be239b386846cf22980da6695cc0d6 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l12_together.axmodel b/smolvlm2_axmodel/llama_p1024_l12_together.axmodel deleted file mode 100644 index 2df1be6952a3c5fe0db1edacd8936b31ffcf2d60..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l12_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:011aea9b7e4fcadec5d1b2c386ff4a12e2f3f0e0e31eca634afc8acc9f0d343b -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l13_together.axmodel b/smolvlm2_axmodel/llama_p1024_l13_together.axmodel deleted file mode 100644 index f1e5389aa25c24b20de22094c6a9c4ba81d4b2c0..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l13_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9420f15bb5b591f258212242bc5fa5566ba45f4d697d0599999114961152d1fd -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l14_together.axmodel b/smolvlm2_axmodel/llama_p1024_l14_together.axmodel deleted file mode 100644 index a8823a865444818bcaef4a2fa8d4b63b2160298f..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l14_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:397511107011f700388029e604c2f5ec6d092f9cb6e09ab890a198932173193c -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l15_together.axmodel b/smolvlm2_axmodel/llama_p1024_l15_together.axmodel deleted file mode 100644 index 
4eb4dc970616849e283a864dbb35e7d7effed53d..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l15_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:689d9286ad7cf81345352f85bfbb8387934fe7ccb76d3f56563ded5f1d7cdb7b -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l16_together.axmodel b/smolvlm2_axmodel/llama_p1024_l16_together.axmodel deleted file mode 100644 index 54ec27b16a5263da3d9af5b023c2778870bd1d36..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l16_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b91fecc232c92c9faa5fca4ca1bff0802abc8351457f9b34ef55327ccdcbc85a -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l17_together.axmodel b/smolvlm2_axmodel/llama_p1024_l17_together.axmodel deleted file mode 100644 index ab5adf94fc8b75250a5b5d2fefcc70ecb9847837..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l17_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9404c81f4a02fe332ae1f4ed5361d2f68eea66a9550233cc4c1d4455afc95797 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l18_together.axmodel b/smolvlm2_axmodel/llama_p1024_l18_together.axmodel deleted file mode 100644 index 1ce680e9e3f89880c1d27afecc9a54574d9008b5..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l18_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ffa8d959498bd479d2bbb2c42e883a21bb173fbcb73f5d1bbdebe6c8365e8e21 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l19_together.axmodel b/smolvlm2_axmodel/llama_p1024_l19_together.axmodel deleted file mode 100644 index 4087e2ec3656c72d441598ce96f95141cb6fc9b4..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l19_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:66265cbf7cd8571f949c23ca6a5918f8c95fb3413e4349cb9c9f3ac18231ca21 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l1_together.axmodel b/smolvlm2_axmodel/llama_p1024_l1_together.axmodel deleted file mode 100644 index 54ff5cbd3f40d1a85d9b7196fe722aa832907c31..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l1_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9addcae5bad93adaf9f8df49d4cbfa82024be2d2e0b2e815537121a7417ecb88 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l20_together.axmodel b/smolvlm2_axmodel/llama_p1024_l20_together.axmodel deleted file mode 100644 index 2bd984f528bf6fe31731b24001bd16e0baa30c7b..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l20_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:69430a836a9eb0d46242419a999e761d61a0c4cc4d17eafbe373641551ac0a8b -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l21_together.axmodel b/smolvlm2_axmodel/llama_p1024_l21_together.axmodel deleted file mode 100644 index 57a1cdebf26d8cbf11f23177062a6a686f3e8fbd..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l21_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6a19009fd1a1d28c9414cb9421af4c66473088a0b3caea9157bde6aac071e1ce -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l22_together.axmodel 
b/smolvlm2_axmodel/llama_p1024_l22_together.axmodel deleted file mode 100644 index d26cea41ca52980b25ed62dcef85ff0b341fd253..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l22_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec30ac9fd2a52f281b76a037d0aa146b8144277aed3408a6c281e5a7df8ba62a -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l23_together.axmodel b/smolvlm2_axmodel/llama_p1024_l23_together.axmodel deleted file mode 100644 index fdd3a674b9715c419591ee5b560e1bbc90fddaee..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l23_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1093d36fa84d6248b1a4728d8ae2aadb1143894eaf3d960e12fd3753d3ab4da2 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l24_together.axmodel b/smolvlm2_axmodel/llama_p1024_l24_together.axmodel deleted file mode 100644 index 56f70791c12d8b53b803d4e04736de04b28d7b3a..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l24_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff63d4efb6dd75433205ce87e4d69d7850dad86555b2919864f04c5df3a8a844 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l25_together.axmodel b/smolvlm2_axmodel/llama_p1024_l25_together.axmodel deleted file mode 100644 index acab70b2f5d62c5450740cb98b29e658b0bdf58e..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l25_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:83d8b772f3aef6356234912a371baebcb6c0897faf3d524091b7ea2fc56f77bc -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l26_together.axmodel b/smolvlm2_axmodel/llama_p1024_l26_together.axmodel deleted file mode 100644 index 999b84c23c3ecf7acdb498c26ea84e1ec1ded098..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l26_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:033f9deb6fe2288347d1af507d7a31deb0633614dfb0efe9a3a9c962afbe44eb -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l27_together.axmodel b/smolvlm2_axmodel/llama_p1024_l27_together.axmodel deleted file mode 100644 index 8c0491a97ac7c047efb248464d9afaf78209c698..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l27_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c0c8c035eb371dd31d53844534c4d321efc933e1097ad3e9d87afd52dba74214 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l28_together.axmodel b/smolvlm2_axmodel/llama_p1024_l28_together.axmodel deleted file mode 100644 index a3d991d466dfe91f64e67fd4095af5729afa214e..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l28_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8d33cae03279cab06a856cfacc3e84414c615082a4a358bd09c4a5996c17c575 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l29_together.axmodel b/smolvlm2_axmodel/llama_p1024_l29_together.axmodel deleted file mode 100644 index 5523aedc98a6d45fe5dd019c36db9b43d3d98e24..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l29_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:84583f5ef60b629b34d47c7deeb3200c096d6d6bf3de3f6bec4da6ae005b5a1e -size 12002005 diff 
--git a/smolvlm2_axmodel/llama_p1024_l2_together.axmodel b/smolvlm2_axmodel/llama_p1024_l2_together.axmodel deleted file mode 100644 index 3d5c0366f001ee9e1d791b659570a9da6c869bc4..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l2_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4514475633a7317118fe4486200bbed73929bd4210c6da4041591797ad93fb3a -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l30_together.axmodel b/smolvlm2_axmodel/llama_p1024_l30_together.axmodel deleted file mode 100644 index 2132c3f6e1c9f64721eeceb84c7a841e40573280..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l30_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:39e1612aac9b1604146b61b4fc37eaada2299f62078260689bf03812c256c75b -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l31_together.axmodel b/smolvlm2_axmodel/llama_p1024_l31_together.axmodel deleted file mode 100644 index 2aba38541eafca847cc863945fe9cec626c56e69..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l31_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f2f54bcb7d01ea69a3177b72d49e3bdab2d0e0403e86085903389cc6839b5fd -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l3_together.axmodel b/smolvlm2_axmodel/llama_p1024_l3_together.axmodel deleted file mode 100644 index a2368131844a163a07ed3ee7b47f81c86b9ec59b..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l3_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a991d67e4c1dc4bf58689ce4a58362f6bcc73a87257bcb2982774a0b056ca720 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l4_together.axmodel b/smolvlm2_axmodel/llama_p1024_l4_together.axmodel deleted file mode 100644 index 62cda5209c5bfede353a4da7e851c41f01018935..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l4_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a43e6886989c31dfffeae70177fc9464322bded5bb69515e31aaade31b431b5 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l5_together.axmodel b/smolvlm2_axmodel/llama_p1024_l5_together.axmodel deleted file mode 100644 index 515cf387203018b105f2b0f84a75a5e469be9ce9..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l5_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ed59bef655c1eae8eb7af4566ef21fd874cfac72b67bbfd1a7279e1a1cffd2c8 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l6_together.axmodel b/smolvlm2_axmodel/llama_p1024_l6_together.axmodel deleted file mode 100644 index ae30db179c5bc037d861cfa841cf27885be417c4..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l6_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:848640700c17925475ef9f9edeaa0fccf235e90a5ad159430682ac389910d86b -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l7_together.axmodel b/smolvlm2_axmodel/llama_p1024_l7_together.axmodel deleted file mode 100644 index d953b19e7dda6994a8a063f16b2f618ca23b9276..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l7_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid 
sha256:46e4bce8f94d80d12e3b1a5ceae7ba62cbaa06f0ddf11f13999b1936a98bc0a1 -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l8_together.axmodel b/smolvlm2_axmodel/llama_p1024_l8_together.axmodel deleted file mode 100644 index 6c893dba8e98a2c1ce35b722fd95fd3c7df12913..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l8_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b3ba57d8f2cd4d932445600d161a04b0a1160f452425c5abd08f94bece56f23f -size 12002005 diff --git a/smolvlm2_axmodel/llama_p1024_l9_together.axmodel b/smolvlm2_axmodel/llama_p1024_l9_together.axmodel deleted file mode 100644 index ee8c9d5b437b9117f6574bdcc988780bb73bb0c6..0000000000000000000000000000000000000000 --- a/smolvlm2_axmodel/llama_p1024_l9_together.axmodel +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0464cccfdfb0566069bad977d98f70b9e15e8e0b642a6e01ca2b16b5f7eb170a -size 12002005 diff --git a/smolvlm2_axmodel/llama_p128_l0_together.axmodel b/smolvlm2_axmodel/llama_p128_l0_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..a3bc71bc34015a2ee8bc3e69c65845bb84e4bd23 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l0_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe798dad363285aa06db28f00cabf919db772d17d7bb842a48f5b76c4bb31f17 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l10_together.axmodel b/smolvlm2_axmodel/llama_p128_l10_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..f6e650201529512998a71b16c809cb6262900e8e --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l10_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:730853a20a5ff783ccc8b97568ebd7bb4320922bd2e28383005ebca8389d40df +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l11_together.axmodel b/smolvlm2_axmodel/llama_p128_l11_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..6c9e796e559a285cd8bbc3830264da5bdc8600e2 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l11_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9114384bfed1547e72099ed94e5f0d509170ac7872c8727e00b4d9e0a9c26a6 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l12_together.axmodel b/smolvlm2_axmodel/llama_p128_l12_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..11a9d357710b21c3b3469b86df3a69a20c348cea --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l12_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd4b6c59aac2909279181165c81cfd6aaa7e9765b2eb5d7eab6f28b15b638c47 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l13_together.axmodel b/smolvlm2_axmodel/llama_p128_l13_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..92faa293b8ac821d583e4c88dc30e7164cd53ce0 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l13_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21a64d4d6c47ee8c3e9784caa82037b00e380846809dda9d5f45463d6c9e259 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l14_together.axmodel b/smolvlm2_axmodel/llama_p128_l14_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..47177b98a0af8918dfb897101eae70bcc2e9e44d --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l14_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:d762756ab6cf454f60238687bdb49e48f74de405a190ce3f8baea2d63fd77e15 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l15_together.axmodel b/smolvlm2_axmodel/llama_p128_l15_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..cccd54e75d4c8ffbbc8ed0915bda9d7997e4b63d --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l15_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:137147c5d1a536e31ccbd01814a4e058c3a700f88bc73fab2417724c047d1c8a +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l16_together.axmodel b/smolvlm2_axmodel/llama_p128_l16_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..c0aaa02433fbfea2b5d11d4d7fa1b51f99d18055 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l16_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cff0f01c402c967927451782f6483830a35b9d5f247ec002c0531080f58a583 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l17_together.axmodel b/smolvlm2_axmodel/llama_p128_l17_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..cad5d322acbde7dae0df71e8fbfc1da763f61ebc --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l17_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7544a41af808812decbf454b484adc5d01317b7044616a3a6b921f81d2a07904 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l18_together.axmodel b/smolvlm2_axmodel/llama_p128_l18_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..7f0549f21e8659bcba6cc0283a83265e302b719d --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l18_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94a891f3a8427520be964d31f2e1323b63f9d0a942cb015f2b1712339feedee9 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l19_together.axmodel b/smolvlm2_axmodel/llama_p128_l19_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..cd92d9072e833c529858a4d9a683d89db1f4c07b --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l19_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c7f15489ea06f05b5a04a7437cb3da35bbd85a4dc92b41e235f5287c182cbbc +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l1_together.axmodel b/smolvlm2_axmodel/llama_p128_l1_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..107583e15e8399bf1fc0237f63540887abec13cc --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l1_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7548a3c40a85f10022123f21654e88534fc4041cc36f7e12f15812675d2d693 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l20_together.axmodel b/smolvlm2_axmodel/llama_p128_l20_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..7d4bfc1e25161a6dead06a771ae9bce0e0d2ced7 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l20_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72c500fdf67380a5d0e9bb81098b48cb172f8ee178ba5256c951a2334f079302 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l21_together.axmodel b/smolvlm2_axmodel/llama_p128_l21_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..0ccc17399dcdbc3a4826a232cf824a0b2c6ad774 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l21_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:960d618d2bf5b4d5cb78a782da41c0e3ceacf6e50684f7fdf1ccc3492c4b5044 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l22_together.axmodel b/smolvlm2_axmodel/llama_p128_l22_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..673cf835522ded755aff21f8dcc72d12e63c55ed --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l22_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87f52c18d8599e280e3479fb5d41f8a5efa9aeb44967b8a45e301af1f7dfc4bc +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l23_together.axmodel b/smolvlm2_axmodel/llama_p128_l23_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..2f4bfe44cacf3a2f5b8f30b0246d6272e5f61356 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l23_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95c57d34b7fcba5be2f3be9402fcf2f0819ac9c711033a8a975e84e80d8112d6 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l24_together.axmodel b/smolvlm2_axmodel/llama_p128_l24_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..cf3a2212f727132d3de90df60b2f5cf95627bf16 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l24_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fb89bdbd126e5b6b053eb2a8c0e253eb05ef46c49cfe612a9f7926c168e1b37 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l25_together.axmodel b/smolvlm2_axmodel/llama_p128_l25_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..0973bc0660c6c2bb738b74a4626192951b2b7f35 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l25_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7278b00282db9f988b095a42edb131b2c364f831fa90edda0a82457a2c519729 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l26_together.axmodel b/smolvlm2_axmodel/llama_p128_l26_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..e0295bc569102cb62beb6b665777761d63d2fb97 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l26_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5074683de5b5141e400af1648bb7d3b4e2f7d090643883457eecfac2c58030f +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l27_together.axmodel b/smolvlm2_axmodel/llama_p128_l27_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..07f1d2bcff0456f8ef0eec493c492f480b3a1ef0 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l27_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06bf197bfe241e75bb24136dcb77590bbf427d0b8c90d0f70149dde4dfba5297 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l28_together.axmodel b/smolvlm2_axmodel/llama_p128_l28_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..b73839f20c933f0ebfa6998c5aecae9be50ad63a --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l28_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34cf53ed688c9ebb05046dc6b4e4dde2fecef26051954c6b9d9f1da1c93aa026 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l29_together.axmodel b/smolvlm2_axmodel/llama_p128_l29_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..502a7dc2a0a1ae8b34bc5128df27c359011dc71d --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l29_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:65fad807f4dcf69ee8bcbabece6ccb96919390c3fa7578f7807b7c502063c083 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l2_together.axmodel b/smolvlm2_axmodel/llama_p128_l2_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..a497b4353e510aaff5b81d47b71289a02c792e57 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l2_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24fa6c23a025faf92d0336c3befa3ed07692f764fead57e2715139deca22aa97 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l30_together.axmodel b/smolvlm2_axmodel/llama_p128_l30_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..cd18a8eb76fb04f8a1d527d9f443a7b2b6b1bc8b --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l30_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:807fa4aba138665e167a276239db9dd09bd131f7f0497fc643b188badf0eb3bc +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l31_together.axmodel b/smolvlm2_axmodel/llama_p128_l31_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..cecef375df77e5a9190b2b15bf6b826b1a881c68 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l31_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04249cc7a1ebc354c693d8245ea25b07ea9ac17266249a28f8d1ceeb88c0d56c +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l3_together.axmodel b/smolvlm2_axmodel/llama_p128_l3_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..f63e1a3a1850ca25c1352ac67b609f027b3bd849 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l3_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:320bc1147c878a517f8e0ccac83fe6153b047b65bc2488d3df2f89ba5f00152a +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l4_together.axmodel b/smolvlm2_axmodel/llama_p128_l4_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..0bdfa64447734d4da9e4fad7ee2299ae7481b27b --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l4_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:604359e0fe6da515d316f346e01c390e1ddb92dfa9e780922e986fe62b456d0a +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l5_together.axmodel b/smolvlm2_axmodel/llama_p128_l5_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..7ad34f3a75290bf204c95e8243a85d5ebc13195a --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l5_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a0cc6b210ebbeaf0da840dc464d5e702ce1a0fac9ca937701977e0867eb0789 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l6_together.axmodel b/smolvlm2_axmodel/llama_p128_l6_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..3c5abcc03e7cb40f58b706ae4e3e8e13761637a4 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l6_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bf9d7c1e9194d972f4c01b2b57d7c07bb830e05c277bd441fb89f2238fd5d74 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l7_together.axmodel b/smolvlm2_axmodel/llama_p128_l7_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..84d792fe253927d14ab52d7f0dc667ff51a27d89 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l7_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e89026f634e84dfde6b6995d9a3e5e88614085834cdd7cf2e2eeabe6965a9348 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l8_together.axmodel b/smolvlm2_axmodel/llama_p128_l8_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..727102d65d2f5abe98960433effce3515978301c --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l8_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2284b1c81564dcfae9e56a27f195cb47b24046607858aab9f0b45885a69b2744 +size 14502053 diff --git a/smolvlm2_axmodel/llama_p128_l9_together.axmodel b/smolvlm2_axmodel/llama_p128_l9_together.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..766c5e477f1e33d1ffafd6454e9df6f24ce06a48 --- /dev/null +++ b/smolvlm2_axmodel/llama_p128_l9_together.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d3df2f619c2ae2d8f1ecbe42e02a57efb94bb41346f62dcd51b1f9542282e53 +size 14502053 diff --git a/smolvlm2_axmodel/llama_post.axmodel b/smolvlm2_axmodel/llama_post.axmodel index 49c0589ee3cf22904877d75a71cd7ab5b2df5fd2..e76642a2d7be8663ed05004d7ec9231b3bf83859 100644 --- a/smolvlm2_axmodel/llama_post.axmodel +++ b/smolvlm2_axmodel/llama_post.axmodel @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89e16c32d05a23b3449b298d8df16bc80edba5c719812c2567e074bdccafbd50 -size 51580706 +oid sha256:cc69bae0cfc8068a4386886559ad6dbadb939bdbade2df7d3d720f84d43a6c50 +size 51580713 diff --git a/utils/infer_func.py b/utils/infer_func.py new file mode 100644 index 0000000000000000000000000000000000000000..67efe05f71cea11eb76628663b90af6999d167f7 --- /dev/null +++ b/utils/infer_func.py @@ -0,0 +1,214 @@ +import torch +import numpy as np +from typing import List, Tuple +from tqdm import tqdm +from axengine import InferenceSession +from ml_dtypes import bfloat16 + + +class InferManager: + def __init__(self, config, model_dir): + + self.config = config + self.max_seq_len = 2559 + self.kv_dim = config.hidden_size // config.num_attention_heads * config.num_key_value_heads + + self.k_caches = [ + np.zeros((1, self.max_seq_len, self.kv_dim), dtype=bfloat16) + for _ in range(config.num_hidden_layers) + ] + self.v_caches = [ + np.zeros((1, self.max_seq_len, self.kv_dim), dtype=bfloat16) + for _ in range(config.num_hidden_layers) + ] + + self.decoder_sessions = [] + for layer_idx in tqdm(range(config.num_hidden_layers), desc="Init InferenceSession"): + session = InferenceSession( + f"{model_dir}/llama_p128_l{layer_idx}_together.axmodel" + ) + self.decoder_sessions.append(session) + self.post_process_session = InferenceSession( + f"{model_dir}/llama_post.axmodel" + ) + print("Model loaded successfully!") + + @staticmethod + def _top_p(probs: np.ndarray, p: float) -> np.ndarray: + sorted_indices = np.argsort(probs) + filtered = probs.copy() + cumulative = 0 + for idx in sorted_indices[::-1]: + if cumulative >= p: + filtered[idx] = 0 + cumulative += filtered[idx] + return filtered / cumulative + + @staticmethod + def _softmax(logits: np.ndarray) -> np.ndarray: + logits = logits - logits.max() + exp_logits = np.exp(logits) + return (exp_logits / np.sum(exp_logits)).astype(np.float64) + + def post_process(self, logits, top_k=1, top_p=0.9, temperature=0.6): + logits = logits.astype(np.float32).flatten() + candidate_indices = np.argpartition(logits, -top_k)[-top_k:] + candidate_logits = logits[candidate_indices] / temperature + candidate_probs = self._softmax(candidate_logits) + candidate_probs = self._top_p(candidate_probs, top_p) + 
candidate_probs = candidate_probs.astype(np.float64) / candidate_probs.sum() + chosen_idx = np.random.multinomial(1, candidate_probs).argmax() + next_token = candidate_indices[chosen_idx] + return next_token, candidate_indices, candidate_probs + + def gen_slice_indices(self, token_len, prefill=128, expand=128): + remaining = max(0, token_len - prefill) + extra_blocks = (remaining + expand - 1) // expand + return list(range(extra_blocks + 1)) + + def prefill( + self, + tokenizer, + token_ids, + embed_data, + slice_len=128, + ): + """ + Prefill step for chunked inference. + """ + seq_len = len(token_ids) + slice_indices = [i for i in range(seq_len // slice_len + 1)] + print(f"slice_indices: {slice_indices}") + # total_prefill_len = ( + # slice_len * slice_indices[-1] + # if slice_indices[-1] != 0 + # else slice_len + # ) + total_prefill_len = slice_len * (slice_indices[-1] + 1) + # slice_indices = self.gen_slice_indices(seq_len) + # import pdb; pdb.set_trace() + + if total_prefill_len > 0: + for slice_idx in slice_indices: + indices = np.arange( + slice_idx * slice_len, + (slice_idx + 1) * slice_len, + dtype=np.uint32 + ).reshape((1, slice_len)) + + mask = ( + np.zeros((1, slice_len, slice_len * (slice_idx + 1))) + - 65536 + ) + data = np.zeros((1, slice_len, self.config.hidden_size)).astype(bfloat16) + for i, t in enumerate( + range( + slice_idx * slice_len, + (slice_idx + 1) * slice_len, + ) + ): + if t < len(token_ids): + mask[:, i, : slice_idx * slice_len + i + 1] = 0 + data[:, i : i + 1, :] = ( + embed_data[t] + .reshape((1, 1, self.config.hidden_size)) + .astype(bfloat16) + ) + + remain_len = ( + seq_len - slice_idx * slice_len + if slice_idx == slice_indices[-1] + else slice_len + ) + mask = mask.astype(bfloat16) + for layer_idx in range(self.config.num_hidden_layers): + input_feed = { + "K_cache": ( + self.k_caches[layer_idx][:, 0 : slice_len * slice_idx, :] + if slice_idx + else np.zeros((1, 1, self.config.hidden_size), dtype=bfloat16) + ), + "V_cache": ( + self.v_caches[layer_idx][:, 0 : slice_len * slice_idx, :] + if slice_idx + else np.zeros((1, 1, self.config.hidden_size), dtype=bfloat16) + ), + "indices": indices, + "input": data, + "mask": mask, + } + # import pdb; pdb.set_trace() + outputs = self.decoder_sessions[layer_idx].run(None, input_feed, shape_group=slice_idx + 1) + self.k_caches[layer_idx][ + :, + slice_idx * slice_len : slice_idx * slice_len + remain_len, + :, + ] = outputs[0][:, :remain_len, :] + self.v_caches[layer_idx][ + :, + slice_idx * slice_len : slice_idx * slice_len + remain_len, + :, + ] = outputs[1][:, :remain_len, :] + data = outputs[2] + + print("Slice prefill done:", slice_idx) + post_out = self.post_process_session.run( + None, + { + "input": data[ + :, seq_len - (len(slice_indices) - 1) * slice_len - 1, None, : + ] + } + )[0] + next_token, possible_tokens, possible_probs = self.post_process(post_out) + possible_decoded = [tokenizer.decode([t]) for t in possible_tokens] + possible_probs_str = [str((t, p)) for t, p in zip(possible_decoded, possible_probs)] + token_ids.append(next_token) + return token_ids + + def decode( + self, + tokenizer, + token_ids, + embed_matrix, + prefill_len=128, + slice_len=128 + ): + # import pdb; pdb.set_trace() + print("answer >>", tokenizer.decode(token_ids[-1], skip_special_tokens=True), end='', flush=True) + self.max_seq_len = 2559 + mask = np.zeros((1, 1, self.max_seq_len + 1), dtype=np.float32).astype(bfloat16) + mask[:, :, :self.max_seq_len] -= 65536 + seq_len = len(token_ids) - 1 + if prefill_len > 0: + mask[:, :, 
:seq_len] = 0 + for step_idx in range(self.max_seq_len): + if prefill_len > 0 and step_idx < seq_len: + continue + # import pdb; pdb.set_trace() + cur_token = token_ids[step_idx] + indices = np.array([step_idx], np.uint32).reshape((1, 1)) + data = embed_matrix[cur_token, :].reshape((1, 1, self.config.hidden_size)).astype(bfloat16) + for layer_idx in range(self.config.num_hidden_layers): + input_feed = { + "K_cache": self.k_caches[layer_idx], + "V_cache": self.v_caches[layer_idx], + "indices": indices, + "input": data, + "mask": mask, + } + outputs = self.decoder_sessions[layer_idx].run(None, input_feed, shape_group=0) + self.k_caches[layer_idx][:, step_idx, :] = outputs[0][:, :, :] + self.v_caches[layer_idx][:, step_idx, :] = outputs[1][:, :, :] + data = outputs[2] + mask[..., step_idx] = 0 + if step_idx < seq_len - 1: + continue + else: + post_out = self.post_process_session.run(None, {"input": data})[0] + next_token, possible_tokens, possible_probs = self.post_process(post_out) + token_ids.append(next_token) + if next_token == tokenizer.eos_token_id and next_token > seq_len: + break + print(tokenizer.decode(next_token, skip_special_tokens=True), end='', flush=True) + diff --git a/vit_mdoel/vision_model.onnx b/vit_mdoel/vision_model.onnx deleted file mode 100644 index 7170a5f3bb9af71594641c69fedce5db0c62f1c5..0000000000000000000000000000000000000000 --- a/vit_mdoel/vision_model.onnx +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b5b317aa656fc27e49745a23253ee9adcd14ca90e3a9145bdd4568a5a18b2f41 -size 387531753 diff --git a/vit_model/vision_model.axmodel b/vit_model/vision_model.axmodel new file mode 100644 index 0000000000000000000000000000000000000000..2566e4251fd7be339369cec13644a01faefc966a --- /dev/null +++ b/vit_model/vision_model.axmodel @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:238527dc4922d54edffeafaec89663674a93c7388f7b2b7020ce2812df376c7c +size 301404785
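
A note on the sampler this diff factors out: the original `post_process` in `infer_axmodel.py` is reimplemented as `InferManager.post_process` in the new `utils/infer_func.py` (top-k selection, temperature scaling, softmax, top-p renormalization, then a multinomial draw). Below is a minimal, self-contained NumPy sketch that mirrors that sampling chain for reference; the `sample_next_token` helper and the dummy logits are illustrative only and are not part of the repository.

```python
import numpy as np


def sample_next_token(logits, top_k=5, top_p=0.9, temperature=0.6):
    """Sketch mirroring InferManager.post_process: top-k -> temperature -> softmax -> top-p -> draw."""
    logits = logits.astype(np.float32).flatten()
    # keep only the k largest logits
    candidate_indices = np.argpartition(logits, -top_k)[-top_k:]
    candidate_logits = logits[candidate_indices] / temperature
    # softmax over the surviving candidates
    shifted = candidate_logits - candidate_logits.max()
    probs = np.exp(shifted) / np.exp(shifted).sum()
    # top-p: walk candidates from most to least likely, zero out the tail
    cumulative = 0.0
    for idx in np.argsort(probs)[::-1]:
        if cumulative >= top_p:
            probs[idx] = 0.0
        cumulative += probs[idx]
    # renormalize and draw one token from the candidate distribution
    probs = probs.astype(np.float64) / probs.sum()
    choice = np.random.multinomial(1, probs).argmax()
    return int(candidate_indices[choice])


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    dummy_logits = rng.normal(size=(1, 49152))  # dummy vocabulary-sized logits
    print("sampled token id:", sample_next_token(dummy_logits))
```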