HV-Khurdula committed
Commit 17e2272 · verified · 1 Parent(s): 09597f3

Update region.py


fix: decode_size.

Files changed (1): region.py +12 -10

region.py CHANGED
@@ -73,20 +73,22 @@ def encode_size(size: torch.Tensor, w: nn.Module) -> torch.Tensor:
 
 def decode_size(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
     """
-    Original contract: returns (2, C) for width/height logits.
-    This keeps all downstream code in moondream.py working as-is.
+    Takes as input the last hidden state from the text model and outputs logits
+    for 1024 bins representing width and height in log-scale.
+
+    Returns logits shaped (..., 2, C) so batched code can handle it directly.
     """
-    # w.size_decoder is your 2*C-projection MLP/Linear
-    x = w.size_decoder(hidden_state)  # (..., 2*C) in practice called on the last token
-    if x.dim() != 1:
-        # Most of the original code paths call this on a single hidden vector.
-        # If a higher-rank tensor slips in, collapse it conservatively.
-        x = x.reshape(-1)[-x.shape[-1]:]  # take the final vector
+    # Run the two-layer MLP that projects to 2*C (width+height) bins
+    x = mlp(hidden_state, w.size_decoder)  # shape: (..., 2*C)
+
     last = x.shape[-1]
     if last % 2 != 0:
         raise RuntimeError(f"size_out_dim must be even, got {last}")
-    # (2, C)
-    return x.view(2, last // 2)
+
+    C = last // 2
+    # Keep any leading batch/seq dims intact and split the last dim into (2, C)
+    return x.view(*x.shape[:-1], 2, C)
 
 
 
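For context on what the new contract buys callers, a minimal runnable sketch is below. It is a stand-in under stated assumptions, not the real region.py: SizeWeights, the trivial mlp() body, and the HIDDEN_DIM/C values are all hypothetical; only decode_size mirrors the committed code.

import torch
import torch.nn as nn

HIDDEN_DIM, C = 2048, 1024  # assumed dims; region.py's real config may differ

class SizeWeights(nn.Module):
    # Hypothetical stand-in for the module that holds size_decoder.
    def __init__(self):
        super().__init__()
        self.size_decoder = nn.Linear(HIDDEN_DIM, 2 * C)

def mlp(x: torch.Tensor, module: nn.Module) -> torch.Tensor:
    # Stand-in for region.py's mlp() helper; here it just applies the projection.
    return module(x)

def decode_size(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
    x = mlp(hidden_state, w.size_decoder)  # (..., 2*C)
    last = x.shape[-1]
    if last % 2 != 0:
        raise RuntimeError(f"size_out_dim must be even, got {last}")
    return x.view(*x.shape[:-1], 2, last // 2)  # (..., 2, C)

w = SizeWeights()
single = decode_size(torch.randn(HIDDEN_DIM), w)      # (2, C): old contract still holds
batched = decode_size(torch.randn(4, HIDDEN_DIM), w)  # (4, 2, C): leading dims preserved
print(single.shape, batched.shape)

The key design point is that x.view(*x.shape[:-1], 2, C) reshapes only the final dimension: a single hidden vector still yields the original (2, C), while batched inputs keep their leading batch/seq dims instead of being collapsed to one vector by the old reshape(-1) fallback.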