vikhyatk committed
Commit 235555c · verified · 1 Parent(s): 4f59175

Upload HfMoondream

Files changed (11):
  1. config.json +2 -2
  2. config.py +16 -8
  3. hf_moondream.py +46 -5
  4. image_crops.py +36 -13
  5. layers.py +109 -6
  6. lora.py +82 -0
  7. model.safetensors +2 -2
  8. moondream.py +330 -61
  9. region.py +50 -3
  10. text.py +34 -18
  11. vision.py +3 -3
config.json CHANGED
@@ -8,6 +8,6 @@
   },
   "config": {},
   "model_type": "moondream1",
-  "torch_dtype": "float16",
-  "transformers_version": "4.44.0"
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.52.4"
 }
config.py CHANGED
@@ -12,6 +12,7 @@ class TextConfig:
     n_heads: int = 32
     n_kv_heads: int = 32
     prefix_attn: int = 730
+    group_size: Optional[int] = None
 
 
 @dataclass(frozen=True)
@@ -37,22 +38,29 @@ class RegionConfig:
     size_feat_dim: int = 512
     size_out_dim: int = 2048
     inner_dim: int = 8192
+    group_size: Optional[int] = None
 
 
 @dataclass(frozen=True)
 class TokenizerConfig:
-    bos_id: int = 50256
-    eos_id: int = 50256
+    bos_id: int = 0
+    eos_id: int = 0
+    answer_id: int = 3
+    thinking_id: int = 4
+    coord_id: int = 5
+    size_id: int = 6
+    start_ground_points_id: int = 7
+    end_ground_id: int = 9
     templates: Dict[str, Optional[Dict[str, List[int]]]] = field(
         default_factory=lambda: {
            "caption": {
-                "short": [198, 198, 16438, 8305, 25],
-                "normal": [198, 198, 24334, 1159, 25],
-                "long": [198, 198, 14617, 8305, 25],
+                "short": [1, 32708, 2, 12492, 3],
+                "normal": [1, 32708, 2, 6382, 3],
+                "long": [1, 32708, 2, 4059, 3],
            },
-            "query": {"prefix": [198, 198, 24361, 25], "suffix": [198, 198, 33706, 25]},
-            "detect": {"prefix": [198, 198, 47504, 25], "suffix": [628]},
-            "point": {"prefix": [198, 198, 12727, 25], "suffix": [628]},
+            "query": {"prefix": [1, 15381, 2], "suffix": [3]},
+            "detect": {"prefix": [1, 7235, 476, 2], "suffix": [3]},
+            "point": {"prefix": [1, 2581, 2], "suffix": [3]},
        }
    )
 
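Note: a quick sketch (not part of this commit) of how the new TokenizerConfig templates are consumed at inference time, mirroring the prompt assembly in moondream.py; it assumes TokenizerConfig above is importable, and the question token IDs are placeholders:

    # Prompts are just prefix + encoded question + suffix.
    cfg = TokenizerConfig()
    question_ids = [123, 456]  # stand-in for tokenizer.encode(question).ids
    prompt = (
        cfg.templates["query"]["prefix"]
        + question_ids
        + cfg.templates["query"]["suffix"]
    )
    assert prompt == [1, 15381, 2, 123, 456, 3]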
hf_moondream.py CHANGED
@@ -1,4 +1,8 @@
+import torch
+import torch.nn as nn
+
 from transformers import PreTrainedModel, PretrainedConfig
+from typing import Union
 
 from .config import MoondreamConfig
 from .moondream import MoondreamModel
@@ -123,7 +127,7 @@ class HfMoondream(PreTrainedModel):
         )
 
         def generator():
-            for token in self.model._generate_text(
+            for token in self.model._generate_answer(
                 prompt_tokens,
                 image_embeds.kv_cache,
                 image_embeds.pos,
@@ -135,8 +139,45 @@ class HfMoondream(PreTrainedModel):
 
         return [answer]
 
-    def get_input_embeddings(self):
-        return super().get_input_embeddings()
+    def get_input_embeddings(self) -> nn.Embedding:
+        """
+        Lazily wrap the raw parameter `self.model.text.wte` in a real
+        `nn.Embedding` layer so that HF mix-ins recognise it. The wrapper
+        **shares** the weight tensor—no copy is made.
+        """
+        if not hasattr(self, "_input_embeddings"):
+            self._input_embeddings = nn.Embedding.from_pretrained(
+                self.model.text.wte,  # tensor created in text.py
+                freeze=True,  # set to False if you need it trainable
+            )
+        return self._input_embeddings
+
+    def set_input_embeddings(self, value: Union[nn.Embedding, nn.Module]) -> None:
+        """
+        Lets HF functions (e.g. `resize_token_embeddings`) replace or resize the
+        embeddings and keeps everything tied to `self.model.text.wte`.
+        """
+        # 1. point the low-level parameter to the new weight matrix
+        self.model.text.wte = value.weight
+        # 2. keep a reference for get_input_embeddings()
+        self._input_embeddings = value
 
-    def input_embeds(self, *args, **kwargs):
-        self._unsupported_exception()
+    def input_embeds(
+        self,
+        input_ids: Union[torch.LongTensor, list, tuple],
+        *,
+        device: torch.device | None = None
+    ) -> torch.FloatTensor:
+        """
+        Back-compat wrapper that turns token IDs into embeddings.
+
+        Example:
+            ids = torch.tensor([[1, 2, 3]])
+            embeds = model.input_embeds(ids)  # (1, 3, hidden_dim)
+        """
+        if not torch.is_tensor(input_ids):
+            input_ids = torch.as_tensor(input_ids)
+        if device is not None:
+            input_ids = input_ids.to(device)
 
+        return self.get_input_embeddings()(input_ids)
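Note: a hypothetical usage sketch (not from this commit) of the embedding accessors added above; the checkpoint id, remote-code loading, and hidden size are assumptions:

    import torch
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "vikhyatk/moondream2", trust_remote_code=True  # assumed checkpoint id
    )
    ids = torch.tensor([[1, 2, 3]])
    embeds = model.input_embeds(ids)  # looks tokens up in the shared wte table
    assert embeds.shape[:2] == (1, 3)  # (batch, seq, hidden_dim)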
image_crops.py CHANGED
@@ -1,10 +1,18 @@
 import math
 import numpy as np
 import torch
-import pyvips
 
 from typing import TypedDict
 
+try:
+    import pyvips
+
+    HAS_VIPS = True
+except:
+    from PIL import Image
+
+    HAS_VIPS = False
+
 
 def select_tiling(
     height: int, width: int, crop_size: int, max_crops: int
@@ -113,18 +121,33 @@ def overlap_crop_image(
         tiling[1] * crop_window_size + total_margin_pixels,
     )
 
-    # Convert to vips for resizing
-    vips_image = pyvips.Image.new_from_array(image)
-    scale_x = target_size[1] / image.shape[1]
-    scale_y = target_size[0] / image.shape[0]
-    resized = vips_image.resize(scale_x, vscale=scale_y)
-    image = resized.numpy()
-
-    # Create global crop
-    scale_x = base_size[1] / vips_image.width
-    scale_y = base_size[0] / vips_image.height
-    global_vips = vips_image.resize(scale_x, vscale=scale_y)
-    crops[0] = global_vips.numpy()
+    if HAS_VIPS:
+        # Convert to vips for resizing
+        vips_image = pyvips.Image.new_from_array(image)
+        scale_x = target_size[1] / image.shape[1]
+        scale_y = target_size[0] / image.shape[0]
+        resized = vips_image.resize(scale_x, vscale=scale_y)
+        image = resized.numpy()
+
+        # Create global crop
+        scale_x = base_size[1] / vips_image.width
+        scale_y = base_size[0] / vips_image.height
+        global_vips = vips_image.resize(scale_x, vscale=scale_y)
+        crops[0] = global_vips.numpy()
+    else:
+        # Fallback to PIL
+        pil_img = Image.fromarray(image)
+        resized = pil_img.resize(
+            (int(target_size[1]), int(target_size[0])),
+            resample=Image.Resampling.LANCZOS,
+        )
+        image = np.asarray(resized)
+
+        # Create global crop
+        global_pil = pil_img.resize(
+            (int(base_size[1]), int(base_size[0])), resample=Image.Resampling.LANCZOS
+        )
+        crops[0] = np.asarray(global_pil)
 
     for i in range(tiling[0]):
         for j in range(tiling[1]):
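Note: a minimal standalone sketch (not from the repo) of the PIL fallback path added above: when pyvips is unavailable, the resize is done with PIL's LANCZOS filter on the numpy array and converted back with np.asarray; the input shape and target size below are toy values:

    import numpy as np
    from PIL import Image

    image = np.zeros((480, 640, 3), dtype=np.uint8)
    target_size = (378, 378)  # (height, width), toy value

    resized = np.asarray(
        Image.fromarray(image).resize(
            (int(target_size[1]), int(target_size[0])),
            resample=Image.Resampling.LANCZOS,
        )
    )
    assert resized.shape == (378, 378, 3)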
layers.py CHANGED
@@ -1,8 +1,24 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
 from dataclasses import dataclass
-from typing import Literal
+from typing import Literal, Optional
 
-import torch
-from torch.nn import functional as F
+try:
+    from torchao import quantize_
+    from torchao.quantization import int4_weight_only
+except ImportError:
+
+    def quantize_(model, quant_mode):
+        raise ImportError(
+            "torchao is not installed. Please install it with `pip install torchao`."
+        )
+
+    def int4_weight_only(group_size):
+        raise ImportError(
+            "torchao is not installed. Please install it with `pip install torchao`."
+        )
 
 
 def gelu_approx(x):
@@ -19,6 +35,80 @@ def linear(x: torch.Tensor, w: LinearWeights) -> torch.Tensor:
     return F.linear(x, w.weight, w.bias)
 
 
+def dequantize_tensor(W_q, scale, zero, orig_shape, dtype=torch.bfloat16):
+    _step = W_q.shape[0]
+    W_r = torch.empty([2 * _step, W_q.shape[1]], dtype=dtype, device=W_q.device)
+    W_r[:_step] = (W_q & 0b11110000) >> 4
+    W_r[_step:] = W_q & 0b00001111
+    W_r.sub_(zero).mul_(scale)
+    return W_r.reshape(orig_shape)
+
+
+class QuantizedLinear(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        out_features: int,
+        dtype: torch.dtype,
+    ):
+        # TODO: Take group_size as an input instead of hardcoding it here.
+        super().__init__()
+        self.in_features = in_features
+        self.out_features = out_features
+        self.weight = nn.ParameterDict(
+            {
+                "packed": nn.Parameter(
+                    torch.empty(
+                        out_features * in_features // (128 * 2), 128, dtype=torch.uint8
+                    ),
+                    requires_grad=False,
+                ),
+                "scale": nn.Parameter(
+                    torch.empty(out_features * in_features // 128, 1),
+                    requires_grad=False,
+                ),
+                "zero_point": nn.Parameter(
+                    torch.empty(out_features * in_features // 128, 1),
+                    requires_grad=False,
+                ),
+            }
+        )
+        self.bias = nn.Parameter(torch.empty(out_features), requires_grad=False)
+        self.unpacked = False
+
+    def unpack(self):
+        if self.unpacked:
+            return
+
+        self.weight = nn.Parameter(
+            dequantize_tensor(
+                self.weight["packed"],
+                self.weight["scale"],
+                self.weight["zero_point"],
+                (self.out_features, self.in_features),
+                torch.bfloat16,
+            )
+        )
+        with torch.device("meta"):
+            self.linear = nn.Linear(
+                self.in_features, self.out_features, dtype=torch.bfloat16
+            )
+        self.linear.weight = self.weight
+        self.linear.bias = nn.Parameter(
+            self.bias.to(torch.bfloat16), requires_grad=False
+        )
+
+        del self.weight, self.bias
+        quantize_(self, int4_weight_only(group_size=128))
+        self.unpacked = True
+        torch.cuda.empty_cache()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        if not self.unpacked:
+            self.unpack()
+        return self.linear(x)
+
+
 @dataclass
 class LayerNormWeights:
     weight: torch.Tensor
@@ -36,10 +126,23 @@ class MLPWeights:
     act: Literal["gelu_approx"] = "gelu_approx"
 
 
-def mlp(x: torch.Tensor, w: MLPWeights) -> torch.Tensor:
-    x = w.fc1(x)
+def mlp(x: torch.Tensor, w: MLPWeights, lora: Optional[dict] = None) -> torch.Tensor:
+    x0 = w.fc1(x)
+    if lora is not None:
+        x1 = F.linear(F.linear(x, lora["fc1"]["A"]), lora["fc1"]["B"])
+        x = x0 + x1
+    else:
+        x = x0
+
     x = gelu_approx(x)
-    x = w.fc2(x)
+
+    x0 = w.fc2(x)
+    if lora is not None:
+        x1 = F.linear(F.linear(x, lora["fc2"]["A"]), lora["fc2"]["B"])
+        x = x0 + x1
+    else:
+        x = x0
+
     return x
 
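Note: an illustrative round-trip check (not part of the commit) for the 4-bit packing layout that dequantize_tensor() above expects: the high nibble of each uint8 byte holds one quantized value and the low nibble holds the value half a "step" further down the matrix. It assumes dequantize_tensor from the layers.py diff is in scope; pack_tensor and the toy shapes are made up for the example:

    import torch

    def pack_tensor(W_q_unpacked: torch.Tensor) -> torch.Tensor:
        # W_q_unpacked: integer values in [0, 15], shape (2 * step, cols)
        step = W_q_unpacked.shape[0] // 2
        high = W_q_unpacked[:step].to(torch.uint8) << 4
        low = W_q_unpacked[step:].to(torch.uint8)
        return high | low

    step, cols = 4, 8
    W_q_unpacked = torch.randint(0, 16, (2 * step, cols))
    packed = pack_tensor(W_q_unpacked)

    # With unit scale and zero offset, dequantize_tensor recovers the raw nibbles.
    scale = torch.ones(2 * step, 1)
    zero = torch.zeros(2 * step, 1)
    recovered = dequantize_tensor(
        packed, scale, zero, (2 * step, cols), dtype=torch.float32
    )
    assert torch.equal(recovered, W_q_unpacked.float())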
lora.py ADDED
@@ -0,0 +1,82 @@
+import functools
+import os
+import shutil
+import torch
+
+from pathlib import Path
+from urllib.request import Request, urlopen
+from typing import Optional
+
+
+def variant_cache_dir():
+    hf_hub_cache = os.environ.get("HF_HUB_CACHE")
+    if hf_hub_cache is not None:
+        return Path(hf_hub_cache) / "md_variants"
+
+    hf_home = os.environ.get("HF_HOME")
+    if hf_home is not None:
+        return Path(hf_home) / "hub" / "md_variants"
+
+    return Path("~/.cache/huggingface/hub").expanduser() / "md_variants"
+
+
+def cached_variant_path(variant_id: str):
+    variant, *rest = variant_id.split("/", 1)
+    step = rest[0] if rest else "final"
+
+    cache_dir = variant_cache_dir() / variant
+    os.makedirs(cache_dir, exist_ok=True)
+    dest = cache_dir / f"{step}.pt"
+    if dest.exists():
+        return dest
+
+    md_endpoint = os.getenv("MOONDREAM_ENDPOINT", "https://api.moondream.ai")
+
+    headers = {"User-Agent": "moondream-torch"}
+    api_key = os.getenv("MOONDREAM_API_KEY")
+    if api_key is not None:
+        headers["X-Moondream-Auth"] = api_key
+
+    req = Request(f"{md_endpoint}/v1/variants/{variant_id}/download", headers=headers)
+    with urlopen(req) as r, open(dest, "wb") as f:
+        shutil.copyfileobj(r, f)
+    return dest
+
+
+def nest(flat):
+    tree = {}
+    for k, v in flat.items():
+        parts = k.split(".")
+        d = tree
+        for p in parts[:-1]:
+            d = d.setdefault(p, {})
+        d[parts[-1]] = v
+    return tree
+
+
+@functools.lru_cache(maxsize=5)
+def variant_state_dict(variant_id: Optional[str] = None, device: str = "cpu"):
+    if variant_id is None:
+        return None
+
+    state_dict = torch.load(
+        cached_variant_path(variant_id), map_location=device, weights_only=True
+    )
+
+    # TODO: Move these into the training code that saves checkpoints...
+    rename_rules = [
+        ("text_model.transformer.h", "text.blocks"),
+        (".mixer", ".attn"),
+        (".out_proj", ".proj"),
+        (".Wqkv", ".qkv"),
+        (".parametrizations.weight.0", ""),
+    ]
+    new_state_dict = {}
+    for key, tensor in state_dict.items():
+        new_key = key
+        for old, new in rename_rules:
+            if old in new_key:
+                new_key = new_key.replace(old, new)
+        new_state_dict[new_key] = tensor
+
+    return nest(new_state_dict)
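Note: an illustrative sketch (keys, ranks, and dimensions are made up) of how nest() above turns flat checkpoint keys into the nested dict that text_decoder() later indexes as lora["text"]["blocks"][str(i)]["mlp"]["fc1"]["A"]:

    import torch

    flat = {
        "text.blocks.0.mlp.fc1.A": torch.zeros(8, 2048),
        "text.blocks.0.mlp.fc1.B": torch.zeros(8192, 8),
    }
    tree = nest(flat)
    assert tree["text"]["blocks"]["0"]["mlp"]["fc1"]["A"].shape == (8, 2048)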
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:96dce588e4a319fde7af3c70fbf27e726f4850e22522d0fdc4b165d5e6003ad5
-size 3854538376
+oid sha256:70a7d94c0c8349eb58ed2d9e636ef2d0916960f321ecabeac6354b8ba3d7403f
+size 3854538968
moondream.py CHANGED
@@ -11,9 +11,23 @@ from .config import MoondreamConfig
11
  from .image_crops import reconstruct_from_crops
12
  from .vision import vision_encoder, vision_projection, prepare_crops, build_vision_model
13
  from .text import build_text_model, text_encoder, lm_head, text_decoder
14
- from .region import decode_coordinate, encode_coordinate, decode_size, encode_size
 
 
 
 
 
 
 
 
 
15
  from .utils import remove_outlier_points
16
 
 
 
 
 
 
17
 
18
  TextSamplingSettings = TypedDict(
19
  "TextSamplingSettings",
@@ -21,16 +35,18 @@ TextSamplingSettings = TypedDict(
21
  "max_tokens": int,
22
  "temperature": float,
23
  "top_p": float,
 
24
  },
25
  total=False,
26
  )
27
 
28
  ObjectSamplingSettings = TypedDict(
29
  "ObjectSamplingSettings",
30
- {"max_objects": int},
31
  total=False,
32
  )
33
 
 
34
  DEFAULT_MAX_TOKENS = 768
35
  DEFAULT_TEMPERATURE = 0.5
36
  DEFAULT_TOP_P = 0.3
@@ -63,43 +79,47 @@ class KVCache(nn.Module):
63
 
64
 
65
  class MoondreamModel(nn.Module):
66
- def __init__(self, config: MoondreamConfig, dtype=torch.float16, setup_caches=True):
 
 
 
67
  super().__init__()
68
  self.config = config
69
 
70
- self.tokenizer = Tokenizer.from_pretrained(
71
- "vikhyatk/moondream2", revision="2025-01-09"
72
- )
73
  self.vision = build_vision_model(config.vision, dtype)
74
  self.text = build_text_model(config.text, dtype)
75
 
76
  # Region Model
 
 
 
77
  self.region = nn.ModuleDict(
78
  {
79
- "coord_encoder": nn.Linear(
80
  config.region.coord_feat_dim, config.region.dim, dtype=dtype
81
  ),
82
  "coord_decoder": nn.ModuleDict(
83
  {
84
- "fc1": nn.Linear(
85
  config.region.dim, config.region.inner_dim, dtype=dtype
86
  ),
87
- "fc2": nn.Linear(
88
  config.region.inner_dim,
89
  config.region.coord_out_dim,
90
  dtype=dtype,
91
  ),
92
  }
93
  ),
94
- "size_encoder": nn.Linear(
95
  config.region.size_feat_dim, config.region.dim, dtype=dtype
96
  ),
97
  "size_decoder": nn.ModuleDict(
98
  {
99
- "fc1": nn.Linear(
100
  config.region.dim, config.region.inner_dim, dtype=dtype
101
  ),
102
- "fc2": nn.Linear(
103
  config.region.inner_dim,
104
  config.region.size_out_dim,
105
  dtype=dtype,
@@ -151,17 +171,31 @@ class MoondreamModel(nn.Module):
151
  def _vis_proj(self, g: torch.Tensor, r: torch.Tensor):
152
  return vision_projection(g, r, self.vision, self.config.vision)
153
 
154
- def _prefill(self, x: torch.Tensor, attn_mask: torch.Tensor, pos_ids: torch.Tensor):
155
- return text_decoder(x, self.text, attn_mask, pos_ids, self.config.text)
 
 
 
 
 
 
156
 
157
  def _decode_one_tok(
158
- self, x: torch.Tensor, attn_mask: torch.Tensor, pos_ids: torch.Tensor
 
 
 
 
159
  ):
160
- hidden = text_decoder(x, self.text, attn_mask, pos_ids, self.config.text)
161
  logits = lm_head(hidden, self.text)
162
  return logits, hidden
163
 
164
  def compile(self):
 
 
 
 
165
  # TODO: vision_projection is not being compiled
166
  self._vis_enc = torch.compile(self._vis_enc, fullgraph=True)
167
  self._prefill = torch.compile(self._prefill, fullgraph=True)
@@ -171,6 +205,7 @@ class MoondreamModel(nn.Module):
171
 
172
  def _run_vision_encoder(self, image: Image.Image) -> torch.Tensor:
173
  all_crops, tiling = prepare_crops(image, self.config.vision, device=self.device)
 
174
  torch._dynamo.mark_dynamic(all_crops, 0)
175
 
176
  outputs = self._vis_enc(all_crops)
@@ -192,12 +227,22 @@ class MoondreamModel(nn.Module):
192
 
193
  return self._vis_proj(global_features, reconstructed)
194
 
195
- def encode_image(self, image: Union[Image.Image, EncodedImage]) -> EncodedImage:
 
 
 
 
196
  if isinstance(image, EncodedImage):
197
  return image
198
  elif not isinstance(image, Image.Image):
199
  raise ValueError("image must be a PIL Image or EncodedImage")
200
 
 
 
 
 
 
 
201
  # Run through text model in addition to the vision encoder, to minimize
202
  # re-computation if multiple queries are performed on this image.
203
  with torch.inference_mode():
@@ -209,7 +254,7 @@ class MoondreamModel(nn.Module):
209
  inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
210
  mask = self.attn_mask[:, :, 0 : inputs_embeds.size(1), :]
211
  pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.long)
212
- self._prefill(inputs_embeds, mask, pos_ids)
213
 
214
  return EncodedImage(
215
  pos=inputs_embeds.size(1),
@@ -233,31 +278,167 @@ class MoondreamModel(nn.Module):
233
  return next_probs
234
 
235
  def _prefill_prompt(
236
- self, prompt_tokens: torch.Tensor, pos: int, temperature: float, top_p: float
 
 
 
 
 
 
 
237
  ):
238
  with torch.inference_mode():
239
  prompt_emb = text_encoder(prompt_tokens, self.text)
 
 
 
 
 
 
 
 
 
 
 
240
  torch._dynamo.mark_dynamic(prompt_emb, 1)
241
- mask = self.attn_mask[:, :, pos : pos + prompt_emb.size(1), :]
 
 
 
 
242
  pos_ids = torch.arange(pos, pos + prompt_emb.size(1), dtype=torch.long)
243
- hidden = self._prefill(prompt_emb, mask, pos_ids)
244
- logits = lm_head(hidden, self.text)
245
 
246
  if temperature == 0:
247
- next_token = torch.argmax(logits, dim=-1).unsqueeze(1)
248
  else:
249
- probs = torch.softmax(logits / temperature, dim=-1)
250
  probs = self._apply_top_p(probs, top_p)
251
  next_token = torch.multinomial(probs, num_samples=1)
252
 
253
  pos = pos + prompt_emb.size(1)
254
- return logits, hidden, next_token, pos
255
 
256
- def _generate_text(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
  self,
258
  prompt_tokens: torch.Tensor,
259
  pos: int,
260
  settings: Optional[TextSamplingSettings] = None,
 
 
 
261
  ):
262
  max_tokens = (
263
  settings.get("max_tokens", DEFAULT_MAX_TOKENS)
@@ -270,9 +451,21 @@ class MoondreamModel(nn.Module):
270
  else DEFAULT_TEMPERATURE
271
  )
272
  top_p = settings.get("top_p", DEFAULT_TOP_P) if settings else DEFAULT_TOP_P
 
 
 
 
 
 
273
 
274
  _, _, next_token, pos = self._prefill_prompt(
275
- prompt_tokens, pos, temperature, top_p
 
 
 
 
 
 
276
  )
277
 
278
  def generator(next_token, pos):
@@ -287,7 +480,7 @@ class MoondreamModel(nn.Module):
287
 
288
  while (
289
  next_token_id := next_token.item()
290
- ) != self.config.tokenizer.eos_id and generated_tokens < max_tokens:
291
  # Add token to our cache
292
  token_cache.append(next_token_id)
293
 
@@ -307,7 +500,7 @@ class MoondreamModel(nn.Module):
307
  print_len += len(printable_text)
308
  if printable_text:
309
  yield printable_text
310
- # Otherwise, only print up to the last space to avoid cutting words
311
  else:
312
  last_space_idx = text.rfind(" ", print_len)
313
  if last_space_idx >= print_len:
@@ -319,13 +512,18 @@ class MoondreamModel(nn.Module):
319
  with torch.inference_mode():
320
  next_emb = text_encoder(next_token, self.text)
321
  mask[:, :, pos], pos_ids[0] = 1, pos
322
- logits, _ = self._decode_one_tok(next_emb, mask, pos_ids)
 
 
 
323
  pos += 1
324
 
325
  if temperature == 0:
326
- next_token = torch.argmax(logits, dim=-1).unsqueeze(1) # (1, 1)
 
 
327
  else:
328
- probs = torch.softmax(logits / temperature, dim=-1) # (1, V)
329
  probs = self._apply_top_p(probs, top_p)
330
  next_token = torch.multinomial(probs, num_samples=1) # (1, 1)
331
 
@@ -342,34 +540,82 @@ class MoondreamModel(nn.Module):
342
 
343
  def query(
344
  self,
345
- image: Union[Image.Image, EncodedImage],
346
- question: str,
 
 
347
  stream: bool = False,
348
  settings: Optional[TextSamplingSettings] = None,
349
  ):
350
  if self.config.tokenizer.templates["query"] is None:
351
  raise NotImplementedError("Model does not support querying.")
352
 
353
- image = self.encode_image(image)
354
- self.load_encoded_image(image)
355
 
356
- prompt_tokens = torch.tensor(
357
- [
358
- self.config.tokenizer.templates["query"]["prefix"]
359
- + self.tokenizer.encode(" " + question).ids
360
- + self.config.tokenizer.templates["query"]["suffix"]
361
- ],
362
- device=self.device,
363
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
364
 
365
  def generator():
366
- for token in self._generate_text(prompt_tokens, image.pos, settings):
 
 
367
  yield token
368
 
369
  if stream:
370
- return {"answer": generator()}
371
  else:
372
- return {"answer": "".join(list(generator()))}
373
 
374
  def load_encoded_image(self, encoded_image: EncodedImage):
375
  for b, (k, v) in zip(self.text.blocks, encoded_image.caches):
@@ -388,7 +634,7 @@ class MoondreamModel(nn.Module):
388
  if length not in self.config.tokenizer.templates["caption"]:
389
  raise ValueError(f"Model does not support caption length '{length}'.")
390
 
391
- image = self.encode_image(image)
392
  self.load_encoded_image(image)
393
 
394
  prompt_tokens = torch.tensor(
@@ -396,7 +642,7 @@ class MoondreamModel(nn.Module):
396
  )
397
 
398
  def generator():
399
- for token in self._generate_text(prompt_tokens, image.pos, settings):
400
  yield token
401
 
402
  if stream:
@@ -411,6 +657,7 @@ class MoondreamModel(nn.Module):
411
  pos: int,
412
  include_size: bool = True,
413
  max_objects: int = DEFAULT_MAX_OBJECTS,
 
414
  ):
415
  out = []
416
  mask = torch.zeros(1, 1, 2048, device=self.device, dtype=torch.bool)
@@ -430,7 +677,7 @@ class MoondreamModel(nn.Module):
430
 
431
  # Decode y-coordinate
432
  mask[:, :, pos], pos_ids[0] = 1, pos
433
- _, hidden = self._decode_one_tok(next_emb, mask, pos_ids)
434
  pos += 1
435
  y_logits = decode_coordinate(hidden, self.region)
436
  y_center = torch.argmax(y_logits, dim=-1) / y_logits.size(-1)
@@ -441,7 +688,7 @@ class MoondreamModel(nn.Module):
441
  # Decode size
442
  if include_size:
443
  mask[:, :, pos], pos_ids[0] = 1, pos
444
- logits, hidden = self._decode_one_tok(next_emb, mask, pos_ids)
445
  pos += 1
446
  size_logits = decode_size(hidden, self.region)
447
 
@@ -479,7 +726,7 @@ class MoondreamModel(nn.Module):
479
 
480
  # Decode next token (x-coordinate, or eos)
481
  mask[:, :, pos], pos_ids[0] = 1, pos
482
- logits, hidden = self._decode_one_tok(next_emb, mask, pos_ids)
483
  pos += 1
484
  next_token = torch.argmax(logits, dim=-1)
485
 
@@ -494,7 +741,7 @@ class MoondreamModel(nn.Module):
494
  if self.config.tokenizer.templates["detect"] is None:
495
  raise NotImplementedError("Model does not support object detection.")
496
 
497
- image = self.encode_image(image)
498
  self.load_encoded_image(image)
499
 
500
  prompt_tokens = torch.tensor(
@@ -506,8 +753,14 @@ class MoondreamModel(nn.Module):
506
  device=self.device,
507
  )
508
 
 
 
 
 
 
 
509
  _, hidden, next_token, pos = self._prefill_prompt(
510
- prompt_tokens, image.pos, temperature=0, top_p=0
511
  )
512
  hidden = hidden[:, -1:, :]
513
 
@@ -517,7 +770,12 @@ class MoondreamModel(nn.Module):
517
  else DEFAULT_MAX_OBJECTS
518
  )
519
  objects = self._generate_points(
520
- hidden, next_token, pos, include_size=True, max_objects=max_objects
 
 
 
 
 
521
  )
522
 
523
  return {"objects": objects}
@@ -531,7 +789,7 @@ class MoondreamModel(nn.Module):
531
  if self.config.tokenizer.templates["point"] is None:
532
  raise NotImplementedError("Model does not support pointing.")
533
 
534
- image = self.encode_image(image)
535
  self.load_encoded_image(image)
536
 
537
  prompt_tokens = torch.tensor(
@@ -543,8 +801,14 @@ class MoondreamModel(nn.Module):
543
  device=self.device,
544
  )
545
 
 
 
 
 
 
 
546
  _, hidden, next_token, pos = self._prefill_prompt(
547
- prompt_tokens, image.pos, temperature=0, top_p=0
548
  )
549
  hidden = hidden[:, -1:, :]
550
 
@@ -554,7 +818,12 @@ class MoondreamModel(nn.Module):
554
  else DEFAULT_MAX_OBJECTS
555
  )
556
  objects = self._generate_points(
557
- hidden, next_token, pos, include_size=False, max_objects=max_objects
 
 
 
 
 
558
  )
559
 
560
  return {"points": objects}
@@ -579,11 +848,11 @@ class MoondreamModel(nn.Module):
579
  self.text,
580
  )
581
  x_emb = encode_coordinate(
582
- torch.tensor([[[source[0]]]], device=self.device, dtype=torch.float16),
583
  self.region,
584
  )
585
  y_emb = encode_coordinate(
586
- torch.tensor([[[source[1]]]], device=self.device, dtype=torch.float16),
587
  self.region,
588
  )
589
 
@@ -595,7 +864,7 @@ class MoondreamModel(nn.Module):
595
  pos_ids = torch.arange(
596
  image.pos, image.pos + prompt_emb.size(1), dtype=torch.long
597
  )
598
- hidden = self._prefill(prompt_emb, mask, pos_ids)
599
  logits = lm_head(hidden, self.text)
600
  next_token = torch.argmax(logits, dim=-1)
601
  pos = image.pos + prompt_emb.size(1)
 
11
  from .image_crops import reconstruct_from_crops
12
  from .vision import vision_encoder, vision_projection, prepare_crops, build_vision_model
13
  from .text import build_text_model, text_encoder, lm_head, text_decoder
14
+ from .region import (
15
+ decode_coordinate,
16
+ encode_coordinate,
17
+ decode_size,
18
+ encode_size,
19
+ encode_spatial_refs,
20
+ SpatialRefs,
21
+ )
22
+ from .layers import QuantizedLinear
23
+ from .lora import variant_state_dict
24
  from .utils import remove_outlier_points
25
 
26
+ ImageEncodingSettings = TypedDict(
27
+ "ImageEncodingSettings",
28
+ {"variant": str},
29
+ total=False,
30
+ )
31
 
32
  TextSamplingSettings = TypedDict(
33
  "TextSamplingSettings",
 
35
  "max_tokens": int,
36
  "temperature": float,
37
  "top_p": float,
38
+ "variant": str,
39
  },
40
  total=False,
41
  )
42
 
43
  ObjectSamplingSettings = TypedDict(
44
  "ObjectSamplingSettings",
45
+ {"max_objects": int, "variant": str},
46
  total=False,
47
  )
48
 
49
+
50
  DEFAULT_MAX_TOKENS = 768
51
  DEFAULT_TEMPERATURE = 0.5
52
  DEFAULT_TOP_P = 0.3
 
79
 
80
 
81
  class MoondreamModel(nn.Module):
82
+
83
+ def __init__(
84
+ self, config: MoondreamConfig, dtype=torch.bfloat16, setup_caches=True
85
+ ):
86
  super().__init__()
87
  self.config = config
88
 
89
+ self.tokenizer = Tokenizer.from_pretrained("moondream/starmie-v1")
 
 
90
  self.vision = build_vision_model(config.vision, dtype)
91
  self.text = build_text_model(config.text, dtype)
92
 
93
  # Region Model
94
+ linear_cls = (
95
+ QuantizedLinear if config.region.group_size is not None else nn.Linear
96
+ )
97
  self.region = nn.ModuleDict(
98
  {
99
+ "coord_encoder": linear_cls(
100
  config.region.coord_feat_dim, config.region.dim, dtype=dtype
101
  ),
102
  "coord_decoder": nn.ModuleDict(
103
  {
104
+ "fc1": linear_cls(
105
  config.region.dim, config.region.inner_dim, dtype=dtype
106
  ),
107
+ "fc2": linear_cls(
108
  config.region.inner_dim,
109
  config.region.coord_out_dim,
110
  dtype=dtype,
111
  ),
112
  }
113
  ),
114
+ "size_encoder": linear_cls(
115
  config.region.size_feat_dim, config.region.dim, dtype=dtype
116
  ),
117
  "size_decoder": nn.ModuleDict(
118
  {
119
+ "fc1": linear_cls(
120
  config.region.dim, config.region.inner_dim, dtype=dtype
121
  ),
122
+ "fc2": linear_cls(
123
  config.region.inner_dim,
124
  config.region.size_out_dim,
125
  dtype=dtype,
 
171
  def _vis_proj(self, g: torch.Tensor, r: torch.Tensor):
172
  return vision_projection(g, r, self.vision, self.config.vision)
173
 
174
+ def _prefill(
175
+ self,
176
+ x: torch.Tensor,
177
+ attn_mask: torch.Tensor,
178
+ pos_ids: torch.Tensor,
179
+ lora: Optional[torch.Tensor],
180
+ ):
181
+ return text_decoder(x, self.text, attn_mask, pos_ids, self.config.text, lora)
182
 
183
  def _decode_one_tok(
184
+ self,
185
+ x: torch.Tensor,
186
+ attn_mask: torch.Tensor,
187
+ pos_ids: torch.Tensor,
188
+ lora: Optional[torch.Tensor],
189
  ):
190
+ hidden = text_decoder(x, self.text, attn_mask, pos_ids, self.config.text, lora)
191
  logits = lm_head(hidden, self.text)
192
  return logits, hidden
193
 
194
  def compile(self):
195
+ for module in self.modules():
196
+ if isinstance(module, QuantizedLinear):
197
+ module.unpack()
198
+
199
  # TODO: vision_projection is not being compiled
200
  self._vis_enc = torch.compile(self._vis_enc, fullgraph=True)
201
  self._prefill = torch.compile(self._prefill, fullgraph=True)
 
205
 
206
  def _run_vision_encoder(self, image: Image.Image) -> torch.Tensor:
207
  all_crops, tiling = prepare_crops(image, self.config.vision, device=self.device)
208
+
209
  torch._dynamo.mark_dynamic(all_crops, 0)
210
 
211
  outputs = self._vis_enc(all_crops)
 
227
 
228
  return self._vis_proj(global_features, reconstructed)
229
 
230
+ def encode_image(
231
+ self,
232
+ image: Union[Image.Image, EncodedImage],
233
+ settings: Optional[ImageEncodingSettings] = None,
234
+ ) -> EncodedImage:
235
  if isinstance(image, EncodedImage):
236
  return image
237
  elif not isinstance(image, Image.Image):
238
  raise ValueError("image must be a PIL Image or EncodedImage")
239
 
240
+ lora = (
241
+ variant_state_dict(settings["variant"], device=self.device)
242
+ if settings is not None and settings["variant"] is not None
243
+ else None
244
+ )
245
+
246
  # Run through text model in addition to the vision encoder, to minimize
247
  # re-computation if multiple queries are performed on this image.
248
  with torch.inference_mode():
 
254
  inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
255
  mask = self.attn_mask[:, :, 0 : inputs_embeds.size(1), :]
256
  pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.long)
257
+ self._prefill(inputs_embeds, mask, pos_ids, lora)
258
 
259
  return EncodedImage(
260
  pos=inputs_embeds.size(1),
 
278
  return next_probs
279
 
280
  def _prefill_prompt(
281
+ self,
282
+ prompt_tokens: torch.Tensor,
283
+ pos: int,
284
+ temperature: float,
285
+ top_p: float,
286
+ spatial_refs: Optional[SpatialRefs] = None,
287
+ attn_mask: Optional[torch.Tensor] = None,
288
+ lora: Optional[dict] = None,
289
  ):
290
  with torch.inference_mode():
291
  prompt_emb = text_encoder(prompt_tokens, self.text)
292
+
293
+ if spatial_refs:
294
+ encoded_refs = encode_spatial_refs(spatial_refs, self.region)
295
+ prompt_emb[prompt_tokens == self.config.tokenizer.coord_id] = (
296
+ encoded_refs["coords"]
297
+ )
298
+ if encoded_refs["sizes"] is not None:
299
+ prompt_emb[prompt_tokens == self.config.tokenizer.size_id] = (
300
+ encoded_refs["sizes"]
301
+ )
302
+
303
  torch._dynamo.mark_dynamic(prompt_emb, 1)
304
+
305
+ if attn_mask is None:
306
+ attn_mask = self.attn_mask
307
+
308
+ mask = attn_mask[:, :, pos : pos + prompt_emb.size(1), :]
309
  pos_ids = torch.arange(pos, pos + prompt_emb.size(1), dtype=torch.long)
310
+ hidden_BC = self._prefill(prompt_emb, mask, pos_ids, lora)
311
+ logits_BV = lm_head(hidden_BC, self.text)
312
 
313
  if temperature == 0:
314
+ next_token = torch.argmax(logits_BV, dim=-1).unsqueeze(1)
315
  else:
316
+ probs = torch.softmax(logits_BV / temperature, dim=-1)
317
  probs = self._apply_top_p(probs, top_p)
318
  next_token = torch.multinomial(probs, num_samples=1)
319
 
320
  pos = pos + prompt_emb.size(1)
321
+ return logits_BV, hidden_BC, next_token, pos
322
 
323
+ def _generate_reasoning(
324
+ self,
325
+ prompt_tokens,
326
+ pos,
327
+ settings: Optional[TextSamplingSettings] = None,
328
+ spatial_refs: Optional[SpatialRefs] = None,
329
+ attn_mask: Optional[torch.Tensor] = None,
330
+ ) -> Tuple[int, str, List[dict]]:
331
+ max_tokens = (
332
+ settings.get("max_tokens", DEFAULT_MAX_TOKENS)
333
+ if settings
334
+ else DEFAULT_MAX_TOKENS
335
+ )
336
+ temperature = (
337
+ settings.get("temperature", DEFAULT_TEMPERATURE)
338
+ if settings
339
+ else DEFAULT_TEMPERATURE
340
+ )
341
+ lora = (
342
+ variant_state_dict(settings["variant"], device=self.device)
343
+ if settings is not None and "variant" in settings
344
+ else None
345
+ )
346
+
347
+ top_p = settings.get("top_p", DEFAULT_TOP_P) if settings else DEFAULT_TOP_P
348
+ eos_id = self.config.tokenizer.answer_id
349
+
350
+ _, last_hidden_BC, next_token, pos = self._prefill_prompt(
351
+ prompt_tokens,
352
+ pos,
353
+ temperature,
354
+ top_p,
355
+ spatial_refs,
356
+ attn_mask=attn_mask,
357
+ lora=lora,
358
+ )
359
+
360
+ text_token_chunks = [[]]
361
+ grounding_chunks = [[]]
362
+
363
+ mask = torch.zeros(1, 1, 2048, device=self.device, dtype=torch.bool)
364
+ mask[:, :, :pos] = 1
365
+ pos_ids = torch.tensor([pos], device=self.device, dtype=torch.long)
366
+ generated_tokens = 0
367
+
368
+ while (
369
+ next_token_id := next_token.item()
370
+ ) != eos_id and generated_tokens < max_tokens:
371
+ if (
372
+ next_token_id == self.config.tokenizer.start_ground_points_id
373
+ or next_token_id == self.config.tokenizer.end_ground_id
374
+ ):
375
+ text_token_chunks.append([])
376
+ grounding_chunks.append([])
377
+
378
+ text_token_chunks[-1].append(next_token_id)
379
+
380
+ with torch.inference_mode():
381
+ if next_token_id == self.config.tokenizer.coord_id:
382
+ coord_logits = decode_coordinate(last_hidden_BC, self.region)
383
+ coord = torch.argmax(coord_logits, dim=-1) / coord_logits.size(-1)
384
+ grounding_chunks[-1].append(coord.item())
385
+
386
+ next_emb = encode_coordinate(
387
+ coord.to(dtype=coord_logits.dtype), self.region
388
+ ).unsqueeze(0)
389
+ else:
390
+ next_emb = text_encoder(next_token, self.text)
391
+
392
+ mask[:, :, pos], pos_ids[0] = 1, pos
393
+
394
+ logits_BV, last_hidden_BC = self._decode_one_tok(
395
+ next_emb, mask, pos_ids, lora
396
+ )
397
+ logits_BV[:, self.config.tokenizer.eos_id] = float("-inf")
398
+ logits_BV[:, self.config.tokenizer.size_id] = float("-inf")
399
+
400
+ pos += 1
401
+
402
+ if temperature == 0:
403
+ next_token = torch.argmax(logits_BV, dim=-1).unsqueeze(1) # (1, 1)
404
+ else:
405
+ probs = torch.softmax(logits_BV / temperature, dim=-1) # (1, V)
406
+ probs = self._apply_top_p(probs, top_p)
407
+ next_token = torch.multinomial(probs, num_samples=1) # (1, 1)
408
+
409
+ generated_tokens += 1
410
+
411
+ text_chunks = [
412
+ self.tokenizer.decode(chunk_tokens) for chunk_tokens in text_token_chunks
413
+ ]
414
+ text = "".join(text_chunks)
415
+
416
+ start_idx = 0
417
+ grounding = []
418
+ for text_chunk, grounding_chunk in zip(text_chunks, grounding_chunks):
419
+ if len(grounding_chunk) > 1:
420
+ points = []
421
+ for i in range(0, len(grounding_chunk) - (len(grounding_chunk) % 2), 2):
422
+ points.append((grounding_chunk[i], grounding_chunk[i + 1]))
423
+ grounding.append(
424
+ {
425
+ "start_idx": start_idx,
426
+ "end_idx": start_idx + len(text_chunk),
427
+ "points": points,
428
+ }
429
+ )
430
+ start_idx += len(text_chunk)
431
+
432
+ return pos, text, grounding
433
+
434
+ def _generate_answer(
435
  self,
436
  prompt_tokens: torch.Tensor,
437
  pos: int,
438
  settings: Optional[TextSamplingSettings] = None,
439
+ spatial_refs: Optional[SpatialRefs] = None,
440
+ eos_id: Optional[int] = None,
441
+ attn_mask: Optional[torch.Tensor] = None,
442
  ):
443
  max_tokens = (
444
  settings.get("max_tokens", DEFAULT_MAX_TOKENS)
 
451
  else DEFAULT_TEMPERATURE
452
  )
453
  top_p = settings.get("top_p", DEFAULT_TOP_P) if settings else DEFAULT_TOP_P
454
+ eos_id = eos_id if eos_id is not None else self.config.tokenizer.eos_id
455
+ lora = (
456
+ variant_state_dict(settings["variant"], device=self.device)
457
+ if settings is not None and "variant" in settings
458
+ else None
459
+ )
460
 
461
  _, _, next_token, pos = self._prefill_prompt(
462
+ prompt_tokens,
463
+ pos,
464
+ temperature,
465
+ top_p,
466
+ spatial_refs,
467
+ attn_mask=attn_mask,
468
+ lora=lora,
469
  )
470
 
471
  def generator(next_token, pos):
 
480
 
481
  while (
482
  next_token_id := next_token.item()
483
+ ) != eos_id and generated_tokens < max_tokens:
484
  # Add token to our cache
485
  token_cache.append(next_token_id)
486
 
 
500
  print_len += len(printable_text)
501
  if printable_text:
502
  yield printable_text
503
+ # Otherwise, only yield up to the last space to avoid cutting words
504
  else:
505
  last_space_idx = text.rfind(" ", print_len)
506
  if last_space_idx >= print_len:
 
512
  with torch.inference_mode():
513
  next_emb = text_encoder(next_token, self.text)
514
  mask[:, :, pos], pos_ids[0] = 1, pos
515
+
516
+ logits_BV, _ = self._decode_one_tok(next_emb, mask, pos_ids, lora)
517
+ logits_BV[:, self.config.tokenizer.answer_id] = float("-inf")
518
+
519
  pos += 1
520
 
521
  if temperature == 0:
522
+ next_token = torch.argmax(logits_BV, dim=-1).unsqueeze(
523
+ 1
524
+ ) # (1, 1)
525
  else:
526
+ probs = torch.softmax(logits_BV / temperature, dim=-1) # (1, V)
527
  probs = self._apply_top_p(probs, top_p)
528
  next_token = torch.multinomial(probs, num_samples=1) # (1, 1)
529
 
 
540
 
541
  def query(
542
  self,
543
+ image: Optional[Union[Image.Image, EncodedImage]] = None,
544
+ question: str = None,
545
+ reasoning: bool = False,
546
+ spatial_refs: Optional[SpatialRefs] = None,
547
  stream: bool = False,
548
  settings: Optional[TextSamplingSettings] = None,
549
  ):
550
  if self.config.tokenizer.templates["query"] is None:
551
  raise NotImplementedError("Model does not support querying.")
552
 
553
+ if question is None:
554
+ raise ValueError("question must be provided.")
555
 
556
+ if spatial_refs and image is None:
557
+ raise ValueError("spatial_refs can only be used with an image.")
558
+
559
+ attn_mask = self.attn_mask
560
+ if image is not None:
561
+ image = self.encode_image(image, settings)
562
+ self.load_encoded_image(image)
563
+ pos = image.pos
564
+ prompt_toks = self.config.tokenizer.templates["query"]["prefix"]
565
+ else:
566
+ self._setup_caches()
567
+ pos = 0
568
+ prompt_toks = [
569
+ self.config.tokenizer.bos_id
570
+ ] + self.config.tokenizer.templates["query"]["prefix"]
571
+ max_context = self.config.text.max_context
572
+ attn_mask = torch.tril(
573
+ torch.ones(1, 1, max_context, max_context, dtype=torch.bool)
574
+ ).to(self.device)
575
+
576
+ spatial_toks = []
577
+ if spatial_refs:
578
+ for ref in spatial_refs:
579
+ coord_id = self.config.tokenizer.coord_id
580
+ size_id = self.config.tokenizer.size_id
581
+ if len(ref) == 2:
582
+ spatial_toks.extend([coord_id, coord_id])
583
+ else:
584
+ spatial_toks.extend([coord_id, coord_id, size_id])
585
+
586
+ prompt_tokens = [
587
+ prompt_toks
588
+ + spatial_toks
589
+ + self.tokenizer.encode(question).ids
590
+ + self.config.tokenizer.templates["query"]["suffix"]
591
+ ]
592
+
593
+ if reasoning:
594
+ prompt_tokens[0] += [self.config.tokenizer.thinking_id]
595
+ prompt_tokens = torch.tensor(prompt_tokens, device=self.device)
596
+ pos, reasoning_text, reasoning_grounding = self._generate_reasoning(
597
+ prompt_tokens, pos, settings, spatial_refs, attn_mask=attn_mask
598
+ )
599
+ prompt_tokens = [self.config.tokenizer.templates["query"]["suffix"]]
600
+ reasoning_dict = {
601
+ "reasoning": {"text": reasoning_text, "grounding": reasoning_grounding}
602
+ }
603
+ else:
604
+ prompt_tokens[0] += self.config.tokenizer.templates["query"]["suffix"]
605
+ reasoning_dict = {}
606
+
607
+ prompt_tokens = torch.tensor(prompt_tokens, device=self.device)
608
 
609
  def generator():
610
+ for token in self._generate_answer(
611
+ prompt_tokens, pos, settings, spatial_refs, attn_mask=attn_mask
612
+ ):
613
  yield token
614
 
615
  if stream:
616
+ return {**reasoning_dict, "answer": generator()}
617
  else:
618
+ return {**reasoning_dict, "answer": "".join(list(generator()))}
619
 
620
  def load_encoded_image(self, encoded_image: EncodedImage):
621
  for b, (k, v) in zip(self.text.blocks, encoded_image.caches):
 
634
  if length not in self.config.tokenizer.templates["caption"]:
635
  raise ValueError(f"Model does not support caption length '{length}'.")
636
 
637
+ image = self.encode_image(image, settings)
638
  self.load_encoded_image(image)
639
 
640
  prompt_tokens = torch.tensor(
 
642
  )
643
 
644
  def generator():
645
+ for token in self._generate_answer(prompt_tokens, image.pos, settings):
646
  yield token
647
 
648
  if stream:
 
657
  pos: int,
658
  include_size: bool = True,
659
  max_objects: int = DEFAULT_MAX_OBJECTS,
660
+ lora: Optional[dict] = None,
661
  ):
662
  out = []
663
  mask = torch.zeros(1, 1, 2048, device=self.device, dtype=torch.bool)
 
677
 
678
  # Decode y-coordinate
679
  mask[:, :, pos], pos_ids[0] = 1, pos
680
+ _, hidden = self._decode_one_tok(next_emb, mask, pos_ids, lora)
681
  pos += 1
682
  y_logits = decode_coordinate(hidden, self.region)
683
  y_center = torch.argmax(y_logits, dim=-1) / y_logits.size(-1)
 
688
  # Decode size
689
  if include_size:
690
  mask[:, :, pos], pos_ids[0] = 1, pos
691
+ logits, hidden = self._decode_one_tok(next_emb, mask, pos_ids, lora)
692
  pos += 1
693
  size_logits = decode_size(hidden, self.region)
694
 
 
726
 
727
  # Decode next token (x-coordinate, or eos)
728
  mask[:, :, pos], pos_ids[0] = 1, pos
729
+ logits, hidden = self._decode_one_tok(next_emb, mask, pos_ids, lora)
730
  pos += 1
731
  next_token = torch.argmax(logits, dim=-1)
732
 
 
741
  if self.config.tokenizer.templates["detect"] is None:
742
  raise NotImplementedError("Model does not support object detection.")
743
 
744
+ image = self.encode_image(image, settings)
745
  self.load_encoded_image(image)
746
 
747
  prompt_tokens = torch.tensor(
 
753
  device=self.device,
754
  )
755
 
756
+ lora = (
757
+ variant_state_dict(settings["variant"], device=self.device)
758
+ if settings is not None and "variant" in settings
759
+ else None
760
+ )
761
+
762
  _, hidden, next_token, pos = self._prefill_prompt(
763
+ prompt_tokens, image.pos, temperature=0, top_p=0, lora=lora
764
  )
765
  hidden = hidden[:, -1:, :]
766
 
 
770
  else DEFAULT_MAX_OBJECTS
771
  )
772
  objects = self._generate_points(
773
+ hidden,
774
+ next_token,
775
+ pos,
776
+ include_size=True,
777
+ max_objects=max_objects,
778
+ lora=lora,
779
  )
780
 
781
  return {"objects": objects}
 
789
  if self.config.tokenizer.templates["point"] is None:
790
  raise NotImplementedError("Model does not support pointing.")
791
 
792
+ image = self.encode_image(image, settings)
793
  self.load_encoded_image(image)
794
 
795
  prompt_tokens = torch.tensor(
 
801
  device=self.device,
802
  )
803
 
804
+ lora = (
805
+ variant_state_dict(settings["variant"], device=self.device)
806
+ if settings is not None and "variant" in settings
807
+ else None
808
+ )
809
+
810
  _, hidden, next_token, pos = self._prefill_prompt(
811
+ prompt_tokens, image.pos, temperature=0, top_p=0, lora=lora
812
  )
813
  hidden = hidden[:, -1:, :]
814
 
 
818
  else DEFAULT_MAX_OBJECTS
819
  )
820
  objects = self._generate_points(
821
+ hidden,
822
+ next_token,
823
+ pos,
824
+ include_size=False,
825
+ max_objects=max_objects,
826
+ lora=lora,
827
  )
828
 
829
  return {"points": objects}
 
848
  self.text,
849
  )
850
  x_emb = encode_coordinate(
851
+ torch.tensor([[[source[0]]]], device=self.device, dtype=torch.bfloat16),
852
  self.region,
853
  )
854
  y_emb = encode_coordinate(
855
+ torch.tensor([[[source[1]]]], device=self.device, dtype=torch.bfloat16),
856
  self.region,
857
  )
858
 
 
864
  pos_ids = torch.arange(
865
  image.pos, image.pos + prompt_emb.size(1), dtype=torch.long
866
  )
867
+ hidden = self._prefill(prompt_emb, mask, pos_ids, lora=None)
868
  logits = lm_head(hidden, self.text)
869
  next_token = torch.argmax(logits, dim=-1)
870
  pos = image.pos + prompt_emb.size(1)
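Note: a hypothetical usage sketch of the extended query() API added in this file (the model variable is assumed to be a loaded MoondreamModel and the image path is a placeholder): reasoning=True returns a grounded reasoning trace alongside the answer, and spatial_refs passes normalized points or boxes that are encoded through the region model:

    from PIL import Image

    image = Image.open("example.jpg")  # placeholder path
    result = model.query(
        image,
        "What is at this location?",
        reasoning=True,
        spatial_refs=[(0.5, 0.5)],  # one normalized (x, y) point
        settings={"temperature": 0.0},
    )
    print(result["reasoning"]["text"])
    print(result["answer"])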
region.py CHANGED
@@ -2,7 +2,11 @@ import torch
 import torch.nn as nn
 import math
 
-from .layers import linear, mlp
+from typing import List, Tuple, Union
+
+from .layers import mlp
+
+SpatialRefs = List[Union[Tuple[float, float], Tuple[float, float, float, float]]]
 
 
 def fourier_features(x: torch.Tensor, w: torch.Tensor) -> torch.Tensor:
@@ -36,7 +40,7 @@ def encode_coordinate(coord: torch.Tensor, w: nn.Module) -> torch.Tensor:
     Returns:
         Encoded hidden states tensor for input to text model
     """
-    return linear(fourier_features(coord, w.coord_features), w.coord_encoder)
+    return w.coord_encoder(fourier_features(coord, w.coord_features))
 
 
 def decode_coordinate(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
@@ -64,7 +68,7 @@ def encode_size(size: torch.Tensor, w: nn.Module) -> torch.Tensor:
     Returns:
         Encoded hidden states tensor for input to text model
     """
-    return linear(fourier_features(size, w.size_features), w.size_encoder)
+    return w.size_encoder(fourier_features(size, w.size_features))
 
 
 def decode_size(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
@@ -87,3 +91,46 @@ def decode_size(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
         Shape is (2, 1024) where the first dimension corresponds to width and height.
     """
     return mlp(hidden_state, w.size_decoder).view(2, -1)
+
+
+def encode_spatial_refs(spatial_refs: SpatialRefs, w: nn.Module) -> torch.Tensor:
+    """
+    Takes a list of spatial references (points or regions) and encodes them into
+    hidden states for input to the text model.
+
+    Args:
+        spatial_refs: List of spatial references (points or boxes)
+            - Points are represented as normalized (x, y) tuples
+            - Boxes are represented as normalized (x_min, y_min, x_max, y_max) tuples
+
+    Returns:
+        {"coords": torch.Tensor, "sizes": Optional[torch.Tensor]}
+    """
+    coords, sizes = [], []
+    for ref in spatial_refs:
+        if len(ref) == 2:
+            coords.append(ref[0])
+            coords.append(ref[1])
+        else:
+            x_c = (ref[0] + ref[2]) / 2
+            y_c = (ref[1] + ref[3]) / 2
+            width = ref[2] - ref[0]
+            height = ref[3] - ref[1]
+            coords.append(x_c)
+            coords.append(y_c)
+            sizes.append([width, height])
+
+    coords = torch.tensor(
+        coords, device=w.coord_features.device, dtype=w.coord_features.dtype
+    ).view(-1, 1)
+    coords = encode_coordinate(coords, w)
+
+    if sizes:
+        sizes = torch.tensor(
+            sizes, device=w.size_features.device, dtype=w.size_features.dtype
+        )
+        sizes = encode_size(sizes, w)
+    else:
+        sizes = None
+
+    return {"coords": coords, "sizes": sizes}
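Note: a quick check (toy numbers, chosen as exact binary fractions) of the box-to-(center, size) conversion performed by encode_spatial_refs() above; a normalized box contributes one (x_c, y_c) coordinate pair plus one (width, height) size entry:

    box = (0.25, 0.5, 0.75, 1.0)  # (x_min, y_min, x_max, y_max)
    x_c, y_c = (box[0] + box[2]) / 2, (box[1] + box[3]) / 2
    width, height = box[2] - box[0], box[3] - box[1]
    assert (x_c, y_c, width, height) == (0.5, 0.75, 0.5, 0.5)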
text.py CHANGED
@@ -2,8 +2,9 @@ import torch
 import torch.nn as nn
 
 from torch.nn import functional as F
+from typing import Optional
 
-from .layers import layer_norm, mlp
+from .layers import layer_norm, mlp, QuantizedLinear
 from .rope import apply_rotary_emb, precompute_freqs_cis
 from .config import TextConfig
 
@@ -21,25 +22,22 @@ def attn(
     n_heads: int,
     n_kv_heads: int,
     position_ids: torch.Tensor,
+    lora: Optional[dict],
 ):
     bsz, q_len, d_model = x.shape
     head_dim = d_model // n_heads
 
     qkv_out = w.qkv(x)  # shape: (bsz, q_len, (n_heads + 2*n_kv_heads)*head_dim)
+    if lora is not None:
+        qkv_out += F.linear(F.linear(x, lora["qkv"]["A"]), lora["qkv"]["B"])
    q_dim = n_heads * head_dim
    kv_dim = n_kv_heads * head_dim
+    q, k, v = qkv_out.split([q_dim, kv_dim, kv_dim], dim=-1)
+    del qkv_out
 
-    q = qkv_out[..., :q_dim].view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
-    k = (
-        qkv_out[..., q_dim : q_dim + kv_dim]
-        .view(bsz, q_len, n_kv_heads, head_dim)
-        .transpose(1, 2)
-    )
-    v = (
-        qkv_out[..., q_dim + kv_dim :]
-        .view(bsz, q_len, n_kv_heads, head_dim)
-        .transpose(1, 2)
-    )
+    q = q.view(bsz, q_len, n_heads, head_dim).transpose(1, 2)
+    k = k.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
+    v = v.view(bsz, q_len, n_kv_heads, head_dim).transpose(1, 2)
 
     q = apply_rotary_emb(q, freqs_cis, position_ids, n_heads)
     k = apply_rotary_emb(k, freqs_cis, position_ids, n_kv_heads)
@@ -51,7 +49,14 @@
         q, k, v, attn_mask=attn_mask, enable_gqa=n_heads != n_kv_heads
     )
     out = out.transpose(1, 2).reshape(bsz, q_len, d_model)
-    out = w.proj(out)
+
+    out0 = w.proj(out)
+    if lora is not None:
+        out1 = F.linear(F.linear(x, lora["proj"]["A"]), lora["proj"]["B"])
+        out = out0 + out1
+    else:
+        out = out0
+
     return out
 
 
@@ -126,8 +131,17 @@
     attn_mask: torch.Tensor,
     position_ids: torch.Tensor,
     config: TextConfig,
+    lora: Optional[dict],
 ):
     for i, block in enumerate(w.blocks):
+        if lora is not None:
+            layer_lora = lora["text"]["blocks"][str(i)]
+            mlp_lora = layer_lora["mlp"]
+            attn_lora = layer_lora["attn"]
+        else:
+            mlp_lora = None
+            attn_lora = None
+
         l_in = layer_norm(x, block.ln)
         l_attn = attn(
             l_in,
@@ -138,8 +152,9 @@
             n_heads=config.n_heads,
             n_kv_heads=config.n_kv_heads,
             position_ids=position_ids,
+            lora=attn_lora,
         )
-        l_mlp = mlp(l_in, block.mlp)
+        l_mlp = mlp(l_in, block.mlp, lora=mlp_lora)
         x = x + l_attn + l_mlp
 
     return x
@@ -160,6 +175,7 @@ def _lm_head(hidden_BTC: torch.Tensor, w: nn.Module):
 
 def build_text_model(config: TextConfig, dtype: torch.dtype) -> nn.Module:
     qkv_dim = int(config.dim * (1 + 2 * config.n_kv_heads / config.n_heads))
+    linear_cls = QuantizedLinear if config.group_size is not None else nn.Linear
 
     text = nn.ModuleDict(
         {
@@ -170,18 +186,18 @@
                             "ln": nn.LayerNorm(config.dim, dtype=dtype),
                             "attn": nn.ModuleDict(
                                 {
-                                    "qkv": nn.Linear(config.dim, qkv_dim, dtype=dtype),
-                                    "proj": nn.Linear(
+                                    "qkv": linear_cls(config.dim, qkv_dim, dtype=dtype),
+                                    "proj": linear_cls(
                                         config.dim, config.dim, dtype=dtype
                                     ),
                                 }
                             ),
                             "mlp": nn.ModuleDict(
                                 {
-                                    "fc1": nn.Linear(
+                                    "fc1": linear_cls(
                                         config.dim, config.ff_dim, dtype=dtype
                                     ),
-                                    "fc2": nn.Linear(
+                                    "fc2": linear_cls(
                                         config.ff_dim, config.dim, dtype=dtype
                                     ),
                                 }
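Note: the LoRA pattern used throughout attn() and mlp() above, shown in isolation with toy shapes (this snippet is illustrative, not from the repo): the adapter contributes B @ (A @ x) on top of the frozen base projection, which is equivalent to projecting with the merged weight W + B @ A:

    import torch
    import torch.nn.functional as F

    dim, rank, out_dim = 16, 4, 32
    x = torch.randn(1, 3, dim)
    W = torch.randn(out_dim, dim)   # frozen base weight
    A = torch.randn(rank, dim)      # lora["..."]["A"]
    B = torch.randn(out_dim, rank)  # lora["..."]["B"]

    base = F.linear(x, W)
    delta = F.linear(F.linear(x, A), B)
    assert torch.allclose(base + delta, F.linear(x, W + B @ A), atol=1e-4)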
vision.py CHANGED
@@ -6,7 +6,7 @@ import numpy as np
 from typing import Union, Tuple
 from PIL import Image
 
-from .layers import attn, layer_norm, linear, mlp
+from .layers import attn, layer_norm, mlp
 from .image_crops import overlap_crop_image
 from .config import VisionConfig
 
@@ -33,7 +33,7 @@ def prepare_crops(
     all_crops = np.transpose(all_crops, (0, 3, 1, 2))
     all_crops = (
         torch.from_numpy(all_crops)
-        .to(device=device, dtype=torch.float16)
+        .to(device=device, dtype=torch.bfloat16)
         .div_(255.0)
         .sub_(0.5)
         .div_(0.5)
@@ -64,7 +64,7 @@ def create_patches(x, patch_size):
 def vision_encoder(input_BCHW: torch.Tensor, w: nn.Module, config: VisionConfig):
     x = create_patches(input_BCHW, config.enc_patch_size)
 
-    x = linear(x, w.patch_emb)
+    x = w.patch_emb(x)
     x = x + w.pos_emb
     for block in w.blocks:
         x = x + attn(layer_norm(x, block.ln1), block.attn, n_heads=config.enc_n_heads)