Update region.py
Browse files
region.py
CHANGED
|
@@ -72,9 +72,22 @@ def encode_size(size: torch.Tensor, w: nn.Module) -> torch.Tensor:
|
|
| 72 |
|
| 73 |
|
| 74 |
def decode_size(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
|
|
|
|
| 72 |
|
| 73 |
|
| 74 |
def decode_size(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
    """Project a hidden state to width/height size logits.

    Original contract: returns a ``(2, C)`` tensor of width/height logits,
    which keeps all downstream code in moondream.py working as-is.

    Args:
        hidden_state: Hidden vector for the token being decoded. Normally a
            single 1-D vector; higher-rank inputs are collapsed to the final
            vector along the leading dimensions.
        w: Module carrying ``size_decoder``, a Linear/MLP projecting to 2*C.

    Returns:
        Tensor of shape ``(2, C)``: row 0 = width logits, row 1 = height logits.

    Raises:
        RuntimeError: if the size head's output dimension is not even.
    """
    # w.size_decoder projects to 2*C; in practice this is called on the
    # last token's hidden vector, so x is usually 1-D already.
    x = w.size_decoder(hidden_state)

    # Capture the last-dim size *before* any reshape. (The previous version
    # relied on `x.reshape(-1)[-x.shape[-1]:]`, which only worked because
    # `x.shape[-1]` was evaluated on the old binding of `x` — an
    # evaluation-order trap. Make the dependency explicit instead.)
    last = x.shape[-1]

    if x.dim() != 1:
        # Most call sites pass a single hidden vector. If a higher-rank
        # tensor slips in, collapse it conservatively by keeping only the
        # final vector along the last dimension.
        x = x.reshape(-1, last)[-1]

    if last % 2 != 0:
        raise RuntimeError(f"size_out_dim must be even, got {last}")

    # Split the 2*C projection into the (2, C) width/height layout.
    return x.view(2, last // 2)
|
| 91 |
|
| 92 |
|
| 93 |
|