---
task_categories:
  - visual-question-answering
language:
  - en
tags:
  - remyx
  - SpatialReasoning
  - spatial-reasoning
  - test-time-compute
  - thinking
  - reasoning
  - multimodal
  - vlm
  - vision-language
  - distance-estimation
  - quantitative-spatial-reasoning
pretty_name: SpaceOm
license: apache-2.0
datasets:
  - remyxai/SpaceThinker
base_model:
  - UCSC-VLAA/VLAA-Thinker-Qwen2.5VL-3B
pipeline_tag: image-text-to-text
library_name: transformers
model-index:
  - name: SpaceOm
    results:
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: 3DSRBench
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.5419
        results_by_subcategory:
          - name: 3D Positional Relation / Orientation
            success_rate: 0.4877
          - name: Object Localization / 3D Localization
            success_rate: 0.6337
          - name: Object Properties / Size
            success_rate: 0.5043
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: BLINK
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.599
        results_by_subcategory:
          - name: 3D Positional Relation / Orientation
            success_rate: 0.7972
          - name: Counting / Object Counting
            success_rate: 0.6167
          - name: Depth and Distance / Relative
            success_rate: 0.621
          - name: Object Localization / 2D Localization
            success_rate: 0.582
          - name: Point and Object Tracking / Point Correspondence
            success_rate: 0.3779
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: MMIU
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.388
        results_by_subcategory:
          - name: Camera and Image Transformation / 2D Transformation
            success_rate: 0.255
          - name: Camera and Image Transformation / 3D Camera Pose
            success_rate: 0.4
          - name: Camera and Image Transformation / Camera Motion
            success_rate: 0.4436
          - name: Depth and Distance / Absolute
            success_rate: 0.265
          - name: Object Localization / 3D Localization
            success_rate: 0.3625
          - name: Point and Object Tracking / 3D Tracking
            success_rate: 0.725
          - name: Point and Object Tracking / Point Correspondence
            success_rate: 0.265
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: MMVP
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.5833
        results_by_subcategory:
          - name: Others / Miscellaneous
            success_rate: 0.5833
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: QSpatialBench-Plus
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.4455
        results_by_subcategory:
          - name: Depth and Distance / Absolute
            success_rate: 0.4455
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: QSpatialBench-ScanNet
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.4876
        results_by_subcategory:
          - name: Depth and Distance / Absolute
            success_rate: 0.464
          - name: Object Properties / Size
            success_rate: 0.5111
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: RealWorldQA
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.6105
        results_by_subcategory:
          - name: Others / Miscellaneous
            success_rate: 0.6105
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: SpatialSense
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.7043
        results_by_subcategory:
          - name: 3D Positional Relation / Orientation
            success_rate: 0.7043
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: VGBench
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.3504
        results_by_subcategory:
          - name: Camera and Image Transformation / 2D Transformation
            success_rate: 0.2568
          - name: Camera and Image Transformation / 3D Camera Pose
            success_rate: 0.4371
          - name: Depth and Distance / Absolute
            success_rate: 0.3339
          - name: Depth and Distance / Relative
            success_rate: 0.32
          - name: Object Localization / 3D Localization
            success_rate: 0.4283
          - name: Point and Object Tracking / 3D Tracking
            success_rate: 0.3264
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: VSI-Bench_8
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.2558
        results_by_subcategory:
          - name: 3D Positional Relation / Orientation
            success_rate: 0.3998
          - name: Counting / Object Counting
            success_rate: 0.229
          - name: Depth and Distance / Absolute
            success_rate: 0.1562
          - name: Depth and Distance / Relative
            success_rate: 0.3648
          - name: Object Properties / Size
            success_rate: 0.1645
          - name: Others / Miscellaneous
            success_rate: 0.2204
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: VSR-ZeroShot
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.8085
        results_by_subcategory:
          - name: 3D Positional Relation / Orientation
            success_rate: 0.8085
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: cvbench
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.6839
        results_by_subcategory:
          - name: Counting / Object Counting
            success_rate: 0.6294
          - name: Depth and Distance / Relative
            success_rate: 0.7408
          - name: Object Localization / 3D Localization
            success_rate: 0.6815
      - task:
          type: visual-question-answering
          name: Spatial Reasoning
        dataset:
          name: spatialbench
          type: benchmark
        metrics:
          - type: success_rate
            name: Overall Success Rate
            value: 0.6553
        results_by_subcategory:
          - name: 3D Positional Relation / Orientation
            success_rate: 0.6765
          - name: Counting / Object Counting
            success_rate: 0.75
          - name: Object Properties / Existence
            success_rate: 0.925
          - name: Object Properties / Reachability
            success_rate: 0.55
          - name: Object Properties / Size
            success_rate: 0.375
---


# SpaceOm

## 📚 Contents

- [Model Overview](#model-overview)
- [Running SpaceOm](#running-spaceom)
- [Dataset Info](#dataset-info)
- [Training SpaceOm](#training-spaceom)
- [Model Evaluation](#model-evaluation)
- [Limitations](#limitations)
- [Citation](#citation)

## Model Overview

SpaceOm improves over SpaceThinker by adding:

- the target module `o_proj` in LoRA fine-tuning
- the SpaceOm dataset for longer reasoning traces
- the Robo2VLM-Reasoning dataset for more robotics-domain and MCVQA examples

The choice to include `o_proj` among the target modules in LoRA fine-tuning was inspired by the study here, which argues for the importance of this module in reasoning models.

The reasoning traces in the SpaceThinker dataset average ~200 "thinking" tokens, so we've now included longer reasoning traces in the training data to help the model use more tokens when reasoning.

To improve alignment for robotics applications, we've also trained on synthetic reasoning traces derived from the Robo2VLM-1 dataset.

## Running SpaceOm

### Ollama

To launch with ollama, run:

```bash
ollama run hf.co/remyxai/SpaceOm:latest
```

or:

```bash
ollama run remyxai/spaceom
```

### llama.cpp

To run locally with llama.cpp, install and build this branch and download the `.gguf` weights here.

```bash
./llama-qwen2vl-cli -m spaceom-F16.gguf \
    --mmproj spaceom-vision.gguf \
    --image images/example_1.jpg --threads 24 -ngl 9 \
    -p "Does the man in blue shirt working have a greater height compared to the wooden pallet with boxes on floor?"
```

### Transformers

Run locally using Transformers:

```python
import torch
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
import requests
from io import BytesIO

# Configuration
model_id = "remyxai/SpaceOm"
image_path = "images/example_1.jpg"  # local path or URL
prompt = "What can you infer from this image about the environment?"
system_message = (
    "You are VL-Thinking 🤔, a helpful assistant with excellent reasoning ability. "
    "You should first think about the reasoning process and then provide the answer. "
    "Use <think>...</think> and <answer>...</answer> tags."
)

# Load model and processor
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)
processor = AutoProcessor.from_pretrained(model_id)

# Load and preprocess image
if image_path.startswith("http"):
    image = Image.open(BytesIO(requests.get(image_path).content)).convert("RGB")
else:
    image = Image.open(image_path).convert("RGB")
if image.width > 512:
    ratio = image.height / image.width
    image = image.resize((512, int(512 * ratio)), Image.Resampling.LANCZOS)

# Format input
chat = [
    {"role": "system", "content": [{"type": "text", "text": system_message}]},
    {"role": "user", "content": [{"type": "image", "image": image},
                                 {"type": "text", "text": prompt}]},
]
text_input = processor.apply_chat_template(chat, tokenize=False,
                                           add_generation_prompt=True)

# Tokenize
inputs = processor(text=[text_input], images=[image],
                   return_tensors="pt").to(model.device)

# Generate response (drop the prompt tokens before decoding)
generated_ids = model.generate(**inputs, max_new_tokens=1024)
new_tokens = generated_ids[:, inputs["input_ids"].shape[1]:]
output = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]

print("Response:\n", output)
```

## Dataset Info

The SpaceThinker dataset includes over 12K samples synthesized using VQASynth on a subset of images from the localized narratives split of the Cauldron. SpaceThinker is formatted similarly to the Llama-Nemotron-Post-Training-Dataset-v1 so that reasoning can be toggled on and off.

The SpaceOm dataset includes ~1K samples synthesized using VQASynth with longer reasoning traces.

The Robo2VLM-Reasoning dataset is a subset of the original Robo2VLM dataset, modified to include reasoning traces.

These datasets were combined to create the final training data for this model.

The model builds upon the ideas from SpatialVLM (Chen et al., 2024), introducing synthetic reasoning traces grounded in a 3D scene reconstruction pipeline using Molmo, VGGT, and SAM2.
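
To inspect the training data, here is a minimal sketch using the `datasets` library. Only the SpaceThinker dataset id is listed in this card's metadata; split and column names are inspected at runtime rather than assumed:

```python
from datasets import load_dataset

# SpaceThinker is the dataset id listed in this model card's metadata.
ds = load_dataset("remyxai/SpaceThinker")
print(ds)  # available splits and their sizes

split = next(iter(ds))            # first available split
print(ds[split].column_names)     # fields of each sample
print(ds[split][0])               # one example record
```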

## Training SpaceOm

### PEFT Configuration

- Architecture: Qwen2.5-VL-3B
- Base model: UCSC-VLAA/VLAA-Thinker-Qwen2.5VL-3B
- Method: LoRA fine-tuning (PEFT)
- LoRA Alpha: 256
- LoRA Rank: 128
- Target Modules: `q_proj`, `v_proj`, `o_proj`
- Optimizer: AdamW (lr = 2e-5), batch size = 1, epochs = 3
- Max input length: 1024 tokens

Reproduce LoRA SFT training with the included script:

```bash
python train.py
```
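
For reference, the hyperparameters listed above roughly correspond to a PEFT `LoraConfig` like the one below. This is a sketch for illustration only, not the exact contents of `train.py`:

```python
from peft import LoraConfig

# Adapter configuration implied by the settings listed above;
# the actual train.py may differ in details (dropout, bias handling, etc.).
lora_config = LoraConfig(
    r=128,                                          # LoRA rank
    lora_alpha=256,                                 # LoRA alpha
    target_modules=["q_proj", "v_proj", "o_proj"],  # includes o_proj, as discussed above
    task_type="CAUSAL_LM",
)
```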

## Model Evaluation

### OmniSpatial

Benchmark leaderboard with SpaceOm highlighted.

| Model | Avg | Manip | Motion | Traffic | Locate | Geospatial | Pattern | Geometric | Ego | Allo | Hypo |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 🥇 o3-2025-04-16 | 56.33 | 71.89 | 66.18 | 61.18 | 68.57 | 65.45 | 40.21 | 29.68 | 77.06 | 48.40 | 48.19 |
| 🥈 Gemini-2.5-pro-preview-05-06 | 55.19 | 67.57 | 71.39 | 62.35 | 75.24 | 64.55 | 43.30 | 34.84 | 74.51 | 38.03 | 37.35 |
| 🥉 Gemini-2.5-flash-thinking-05-20 | 53.16 | 70.27 | 64.74 | 61.18 | 72.38 | 58.18 | 35.05 | 36.13 | 74.12 | 40.96 | 32.53 |
| o4-mini-04-16 | 52.77 | 72.97 | 59.83 | 60.00 | 73.33 | 61.82 | 34.02 | 36.77 | 73.53 | 40.69 | 40.96 |
| Gemini-2.5-flash-preview-05-20 | 52.12 | 67.57 | 62.72 | 68.24 | 73.33 | 60.91 | 38.14 | 34.19 | 75.49 | 35.90 | 33.73 |
| GPT-4.1-2025-04-14 | 51.78 | 66.22 | 64.74 | 60.00 | 65.33 | 60.18 | 31.75 | 30.06 | 70.98 | 40.64 | 39.04 |
| o1-2024-12-17 | 50.36 | 71.62 | 60.98 | 57.65 | 63.81 | 60.00 | 39.18 | 27.10 | 71.57 | 38.03 | 36.14 |
| InternVL3-78B | 49.33 | 63.78 | 63.12 | 56.24 | 59.24 | 51.45 | 27.63 | 30.19 | 74.51 | 38.46 | 35.90 |
| GPT-4.1-mini-2025-04-14 | 48.87 | 64.32 | 56.53 | 59.06 | 60.19 | 56.36 | 29.28 | 30.19 | 72.55 | 39.57 | 39.28 |
| Claude-3-7-thinking-20250219 | 48.62 | 57.21 | 59.73 | 53.73 | 67.94 | 57.27 | 30.24 | 28.17 | 68.63 | 37.94 | 36.95 |
| InternVL3-38B | 48.48 | 63.42 | 63.58 | 54.59 | 58.29 | 50.55 | 29.90 | 28.52 | 72.16 | 36.76 | 33.49 |
| Gemini-2.0-flash-exp | 48.40 | 61.89 | 56.01 | 51.76 | 63.43 | 59.09 | 20.82 | 33.81 | 72.75 | 39.20 | 39.28 |
| Qwen-VL2.5-72B | 47.85 | 58.38 | 60.12 | 50.12 | 59.81 | 53.64 | 26.19 | 33.03 | 71.37 | 36.81 | 36.39 |
| GPT-4o-2024-11-20 | 47.81 | 65.54 | 57.23 | 56.47 | 52.38 | 54.09 | 26.29 | 25.48 | 75.98 | 39.49 | 39.76 |
| Claude-3-7-sonnet-20250219 | 47.53 | 57.57 | 55.95 | 56.71 | 63.81 | 59.09 | 29.48 | 28.39 | 72.16 | 36.06 | 36.63 |
| Qwen-VL2.5-32B | 47.36 | 63.06 | 55.09 | 51.76 | 66.29 | 56.91 | 26.39 | 27.48 | 68.04 | 37.50 | 40.24 |
| Claude-3-5-sonnet-20241022 | 46.86 | 54.05 | 54.57 | 58.12 | 68.38 | 53.09 | 26.60 | 31.74 | 70.00 | 34.79 | 39.52 |
| InternVL3-14B | 45.94 | 54.32 | 60.17 | 50.35 | 51.81 | 51.45 | 28.04 | 28.26 | 68.04 | 35.37 | 34.46 |
| LLaVA-onevision-qwen2-72B | 45.66 | 62.16 | 50.29 | 54.12 | 60.95 | 56.36 | 22.68 | 25.81 | 76.47 | 37.23 | 33.73 |
| SoFar-Qwen2.5-3B | 45.14 | 56.49 | 51.16 | 54.12 | 53.14 | 52.73 | 31.75 | 22.88 | 71.60 | 36.56 | 41.69 |
| Gemma-3-27B | 44.75 | 56.76 | 55.78 | 57.65 | 50.48 | 52.73 | 27.84 | 29.03 | 64.71 | 33.51 | 32.53 |
| Gemini-2.0-flash-lite | 44.03 | 59.19 | 46.71 | 60.24 | 49.52 | 53.27 | 21.65 | 31.23 | 66.47 | 36.81 | 38.80 |
| Gemma-3-12B | 43.71 | 54.05 | 54.91 | 54.12 | 47.62 | 45.45 | 16.49 | 30.32 | 63.73 | 36.70 | 33.73 |
| GPT-4o-mini-2024-07-18 | 42.64 | 55.95 | 50.29 | 54.59 | 43.43 | 44.91 | 22.47 | 29.42 | 61.57 | 36.76 | 34.22 |
| GPT-4.1-nano-2025-04-14 | 42.62 | 50.90 | 53.85 | 54.90 | 40.95 | 42.42 | 24.40 | 30.11 | 53.59 | 37.23 | 33.73 |
| 🧘‍♂️ **SpaceOm** | 41.79 | 51.89 | 47.98 | 50.82 | 39.62 | 43.64 | 27.63 | 27.61 | 70.00 | 35.74 | 33.73 |
| InternVL3-8B | 41.60 | 52.43 | 40.87 | 48.94 | 51.05 | 44.77 | 24.95 | 28.63 | 64.20 | 38.62 | 40.96 |
| SpaceThinker-Qwen2.5-3B | 40.42 | 47.84 | 53.06 | 43.29 | 35.43 | 38.73 | 24.33 | 28.00 | 58.04 | 35.11 | 31.08 |
| Qwen-VL2.5-3B | 40.30 | 55.41 | 47.51 | 46.12 | 42.29 | 44.73 | 32.16 | 23.87 | 59.41 | 33.30 | 30.84 |
| SpaceQwen2.5-VL-3B | 40.25 | 58.11 | 39.88 | 41.18 | 40.95 | 40.91 | 29.90 | 25.81 | 63.73 | 38.83 | 39.76 |
| Gemma-3-4B | 39.79 | 41.89 | 49.71 | 56.47 | 27.62 | 36.36 | 23.71 | 24.52 | 59.80 | 36.17 | 38.55 |
| Qwen-VL2.5-7B | 39.18 | 58.38 | 35.09 | 50.12 | 45.33 | 44.00 | 31.13 | 29.42 | 64.51 | 33.19 | 37.35 |
| InternVL3-2B | 37.98 | 50.00 | 40.58 | 43.29 | 40.00 | 40.55 | 21.86 | 28.52 | 55.49 | 35.11 | 33.01 |
| SpaceMantis-13B | 36.36 | 47.03 | 36.59 | 40.94 | 34.86 | 33.09 | 22.27 | 24.39 | 49.22 | 38.25 | 39.28 |
| RoboPoint-vicuna-7B | 35.85 | 57.03 | 28.61 | 34.82 | 37.33 | 40.55 | 29.90 | 22.71 | 50.20 | 38.72 | 40.96 |
| LLaVA-onevision-qwen2-7B | 35.68 | 43.24 | 38.15 | 32.94 | 29.52 | 41.82 | 28.87 | 22.58 | 47.06 | 36.17 | 37.35 |
| SpatialBot-3B | 35.68 | 43.24 | 38.15 | 32.94 | 29.52 | 41.82 | 28.87 | 22.58 | 47.06 | 36.17 | 37.35 |
| LLaVA-1.5-vicuna-7B | 34.97 | 54.46 | 31.23 | 35.29 | 36.19 | 33.94 | 29.01 | 24.18 | 55.60 | 34.66 | 36.14 |
| RoboPoint-vicuna-13B | 34.60 | 55.68 | 28.15 | 42.82 | 32.19 | 32.55 | 24.12 | 27.74 | 49.02 | 37.66 | 33.49 |

See the full SpaceOm results for the OmniSpatial benchmark here.

### SpatialScore

Top scores in each category are bolded in this partial table of 3B/4B models.

| Model | Overall | Count. | Obj.-Loc. | Pos.-Rel. | Dist. | Obj.-Prop. | Cam.&IT. | Tracking | Others |
|---|---|---|---|---|---|---|---|---|---|
| InternVL2.5-4B | **49.82** | **53.32** | **62.02** | **62.82** | **42.30** | 27.00 | 32.49 | 37.02 | **48.95** |
| SpaceOm | 48.15 | 47.84 | 55.24 | 61.83 | 41.48 | 30.97 | 32.94 | **37.20** | 43.74 |
| Qwen2.5-VL-3B | 47.90 | 46.62 | 55.55 | 62.23 | 37.53 | 32.59 | **35.85** | 36.90 | 42.19 |
| SpaceQwen2.5-VL-3B | 42.31 | 45.01 | 49.78 | 57.88 | 27.36 | **34.11** | 26.34 | 26.44 | 43.58 |
| SpatialBot-Phi2-3B | 41.65 | 53.25 | 54.32 | 55.40 | 27.12 | 26.10 | 24.21 | 27.57 | 41.66 |

See all results for evaluating SpaceOm on the SpatialScore benchmark.

### SpaCE-10

Top scores in each category are bolded in this partial table of 3B/4B models.


| Model | Overall | EQ | SQ | SA | OO | OS | EP | FR | SP | Source |
|---|---|---|---|---|---|---|---|---|---|---|
| InternVL2.5-4B | **36.01** | **34.30** | 34.40 | 43.60 | 44.40 | 16.50 | **31.10** | **50.10** | **33.70** | Table |
| SpaceThinker | 32.72 | 32.73 | 24.81 | 47.26 | 50.33 | 33.63 | 9.25 | 37.54 | 26.25 | GPT Eval |
| SpaceOm | 32.32 | 32.47 | 24.81 | **47.63** | 50.00 | 32.52 | 9.12 | 37.04 | 25.00 | GPT Eval |
| SpaceQwen | 31.98 | 31.19 | 25.89 | 41.61 | **51.98** | **35.18** | 10.97 | 36.54 | 22.50 | GPT Eval |
| Qwen2.5-VL-3B-Instruct | 30.00 | 31.70 | **45.50** | 39.00 | 43.00 | 25.30 | 11.50 | 22.80 | 21.20 | Table |

Legend:

- EQ: Entity Quantification
- SQ: Scene Quantification
- SA: Size Assessment
- OO: Object-Object spatial relations
- OS: Object-Scene spatial relations
- EP: Entity Presence
- FR: Functional Reasoning
- SP: Spatial Planning

ℹ️ **Note:** Scores for SpaceQwen, SpaceThinker, and SpaceOm are generated via `gpt_eval_score` on single-choice (`*-single`) versions of the SpaCE-10 benchmark tasks. Other entries reflect leaderboard accuracy scores from the official SpaCE-10 evaluation table.

Read more about the SpaCE-10 benchmark or see the full results here.

## Limitations

- Performance may degrade in cluttered environments or with unusual camera perspectives.
- This model was fine-tuned using synthetic reasoning over an internet image dataset.
- Multimodal biases inherent to the base model (Qwen2.5-VL) may persist.
- Not intended for use in safety-critical or legal decision-making.

Users are encouraged to evaluate outputs critically and to consider fine-tuning for domain-specific safety and performance. Distance estimates from an autoregressive transformer can support higher-order reasoning for planning and behavior, but they are not a substitute for measurements from high-precision sensors, calibrated stereo vision systems, or specialist monocular depth estimation models that deliver more accurate, pixel-wise predictions in real time.

## Citation

```bibtex
@article{chen2024spatialvlm,
  title   = {SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning Capabilities},
  author  = {Chen, Boyuan and Xu, Zhuo and Kirmani, Sean and Ichter, Brian and Driess, Danny and Florence, Pete and Sadigh, Dorsa and Guibas, Leonidas and Xia, Fei},
  journal = {arXiv preprint arXiv:2401.12168},
  year    = {2024},
  url     = {https://arxiv.org/abs/2401.12168}
}

@misc{qwen2.5-VL,
  title  = {Qwen2.5-VL},
  author = {Qwen Team},
  url    = {https://qwenlm.github.io/blog/qwen2.5-vl/},
  month  = {January},
  year   = {2025}
}

@misc{vl-thinking2025,
  title        = {SFT or RL? An Early Investigation into Training R1-Like Reasoning Large Vision-Language Models},
  author       = {Hardy Chen and Haoqin Tu and Fali Wang and Hui Liu and Xianfeng Tang and Xinya Du and Yuyin Zhou and Cihang Xie},
  year         = {2025},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/UCSC-VLAA/VLAA-Thinking}}
}

@article{wu2025spatialscore,
  title   = {SpatialScore: Towards Unified Evaluation for Multimodal Spatial Understanding},
  author  = {Wu, Haoning and Huang, Xiao and Chen, Yaohui and Zhang, Ya and Wang, Yanfeng and Xie, Weidi},
  journal = {arXiv preprint arXiv:2505.17012},
  year    = {2025}
}
```