---
task_categories:
- visual-question-answering
language:
- en
tags:
- remyx
- SpatialReasoning
- spatial-reasoning
- test-time-compute
- thinking
- reasoning
- multimodal
- vlm
- vision-language
- distance-estimation
- quantitative-spatial-reasoning
pretty_name: SpaceOm
license: apache-2.0
datasets:
- remyxai/SpaceThinker
base_model:
- UCSC-VLAA/VLAA-Thinker-Qwen2.5VL-3B
pipeline_tag: image-text-to-text
library_name: transformers
model-index:
- name: SpaceOm
  results:
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: 3DSRBench
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.5419
    results_by_subcategory:
    - name: 3D Positional Relation / Orientation
      success_rate: 0.4877
    - name: Object Localization / 3D Localization
      success_rate: 0.6337
    - name: Object Properties / Size
      success_rate: 0.5043
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: BLINK
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.599
    results_by_subcategory:
    - name: 3D Positional Relation / Orientation
      success_rate: 0.7972
    - name: Counting / Object Counting
      success_rate: 0.6167
    - name: Depth and Distance / Relative
      success_rate: 0.621
    - name: Object Localization / 2D Localization
      success_rate: 0.582
    - name: Point and Object Tracking / Point Correspondence
      success_rate: 0.3779
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: MMIU
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.388
    results_by_subcategory:
    - name: Camera and Image Transformation / 2D Transformation
      success_rate: 0.255
    - name: Camera and Image Transformation / 3D Camera Pose
      success_rate: 0.4
    - name: Camera and Image Transformation / Camera Motion
      success_rate: 0.4436
    - name: Depth and Distance / Absolute
      success_rate: 0.265
    - name: Object Localization / 3D Localization
      success_rate: 0.3625
    - name: Point and Object Tracking / 3D Tracking
      success_rate: 0.725
    - name: Point and Object Tracking / Point Correspondence
      success_rate: 0.265
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: MMVP
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.5833
    results_by_subcategory:
    - name: Others / Miscellaneous
      success_rate: 0.5833
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: QSpatialBench-Plus
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.4455
    results_by_subcategory:
    - name: Depth and Distance / Absolute
      success_rate: 0.4455
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: QSpatialBench-ScanNet
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.4876
    results_by_subcategory:
    - name: Depth and Distance / Absolute
      success_rate: 0.464
    - name: Object Properties / Size
      success_rate: 0.5111
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: RealWorldQA
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.6105
    results_by_subcategory:
    - name: Others / Miscellaneous
      success_rate: 0.6105
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: SpatialSense
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.7043
    results_by_subcategory:
    - name: 3D Positional Relation / Orientation
      success_rate: 0.7043
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: VGBench
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.3504
    results_by_subcategory:
    - name: Camera and Image Transformation / 2D Transformation
      success_rate: 0.2568
    - name: Camera and Image Transformation / 3D Camera Pose
      success_rate: 0.4371
    - name: Depth and Distance / Absolute
      success_rate: 0.3339
    - name: Depth and Distance / Relative
      success_rate: 0.32
    - name: Object Localization / 3D Localization
      success_rate: 0.4283
    - name: Point and Object Tracking / 3D Tracking
      success_rate: 0.3264
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: VSI-Bench_8
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.2558
    results_by_subcategory:
    - name: 3D Positional Relation / Orientation
      success_rate: 0.3998
    - name: Counting / Object Counting
      success_rate: 0.229
    - name: Depth and Distance / Absolute
      success_rate: 0.1562
    - name: Depth and Distance / Relative
      success_rate: 0.3648
    - name: Object Properties / Size
      success_rate: 0.1645
    - name: Others / Miscellaneous
      success_rate: 0.2204
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: VSR-ZeroShot
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.8085
    results_by_subcategory:
    - name: 3D Positional Relation / Orientation
      success_rate: 0.8085
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: cvbench
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.6839
    results_by_subcategory:
    - name: Counting / Object Counting
      success_rate: 0.6294
    - name: Depth and Distance / Relative
      success_rate: 0.7408
    - name: Object Localization / 3D Localization
      success_rate: 0.6815
  - task:
      type: visual-question-answering
      name: Spatial Reasoning
    dataset:
      name: spatialbench
      type: benchmark
    metrics:
    - type: success_rate
      name: Overall Success Rate
      value: 0.6553
    results_by_subcategory:
    - name: 3D Positional Relation / Orientation
      success_rate: 0.6765
    - name: Counting / Object Counting
      success_rate: 0.75
    - name: Object Properties / Existence
      success_rate: 0.925
    - name: Object Properties / Reachability
      success_rate: 0.55
    - name: Object Properties / Size
      success_rate: 0.375
---
[![Official](https://img.shields.io/badge/Official-%239a0018.svg?logo=data:image/svg+xml;base64,PG5zMDpzdmcgeG1sbnM6bnMwPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgYmFzZVByb2ZpbGU9ImZ1bGwiIGhlaWdodD0iMjQiIHZlcnNpb249IjEuMSIgdmlld0JveD0iNiAyMiA1MiAyMiIgd2lkdGg9IjI0Ij48bnMwOmRlZnMgLz48bnMwOnBhdGggZD0iTTEzLDMzIEw5LDM3IEw5LDUwIEwxMCw1MSBMMTMsNTEgTDE0LDUwIEwxNCwzOCBMMTUsMzcgTDE2LDM4IEwxNiw1MCBMMTcsNTEgTDE5LDUxIEwyMCw1MCBMMjAsMzggTDIxLDM3IEwyMiwzOCBMMjIsMzkgTDIzLDQwIEwyMyw1MSBMMjYsNTEgTDI3LDUwIEwyNyw0MyBMMjgsNDIgTDMyLDQ2IEwzMiw1MSBMMzUsNTEgTDM2LDUwIEwzNiw0NSBMNDEsNDAgTDQyLDQxIEw0Miw0MyBMNDEsNDQgTDQxLDUwIEw0Miw1MSBMNDQsNTEgTDQ1LDUwIEw0NSw0NSBMNDYsNDQgTDQ5LDQ0IEw1MCw0NSBMNTAsNTAgTDUxLDUxIEw1Myw1MSBMNTQsNTAgTDU0LDQ0IEw1Myw0MyBMNTMsNDIgTDUyLDQxIEw1NCwzOSBMNTQsMzQgTDUzLDM0IEw1MiwzMyBMNTEsMzMgTDUwLDM0IEw1MCwzOSBMNDksNDAgTDQ2LDQwIEw0NSwzOSBMNDUsMzQgTDQ0LDM0IEw0MywzMyBMNDIsMzMgTDQxLDM0IEw0MCwzNCBMMzksMzMgTDM4LDMzIEwzNywzNCBMMzYsMzQgTDM2LDQwIEwzNSw0MSBMMzMsNDEgTDMxLDM5IEwzMSwzNCBMMzAsMzMgTDI5LDMzIEwyOCwzNCBMMjcsMzQgTDI3LDM2IEwyNiwzNyBMMjUsMzYgTDI1LDM1IEwyNCwzNCBMMjMsMzQgTDIyLDMzIFoiIGZpbGw9IiNmZmZmZmYiIC8+PG5zMDpwYXRoIGQ9Ik0xMSwxNCBMMTAsMTUgTDEwLDI5IEwxMSwzMCBMMTUsMzAgTDE2LDI5IEwxNiwyNSBMMTcsMjQgTDIzLDI0IEwyNSwyNiBMMjUsMjggTDI3LDMwIEwzMSwzMCBMMzEsMjcgTDMyLDI2IEwzMywyNyBMMzMsMjggTDM0LDI5IEwzNSwyOSBMMzYsMzAgTDUyLDMwIEw1MywyOSBMNTMsMjcgTDUyLDI2IEwzOSwyNiBMMzgsMjUgTDM5LDI0IEw1MSwyNCBMNTIsMjMgTDUzLDIzIEw1MywyMSBMNTIsMjAgTDM5LDIwIEwzOCwxOSBMNDAsMTcgTDUzLDE3IEw1MywxNCBMMzYsMTQgTDMzLDE3IEwzMywxOCBMMzIsMTkgTDMxLDE4IEwzMSwxNyBMMjgsMTQgWiIgZmlsbD0iI2ZmZmZmZiIgLz48bnMwOnBhdGggZD0iTTQsMSBaIiBmaWxsPSIjZmZmZmZmIiAvPjwvbnMwOnN2Zz4=)](https://remyx.ai/?model_id=SpaceThinker-Qwen2.5VL-3B&sha256=abc123def4567890abc123def4567890abc123def4567890abc123def4567890)

# SpaceOm

## 📚 Contents

- [🧠 Model Overview](#model-overview)
- [📊 Evaluation & Benchmarks](#model-evaluation)
- [🏃‍♀️ Running SpaceOm](#running-spaceom)
- [🏋️‍♂️ Training Configuration](#training-spaceom)
- [📂 Dataset Info](#dataset-info)
- [⚠️ Limitations](#limitations)
- [📜 Citation](#citation)

## Model Overview

**SpaceOm** improves over **SpaceThinker** by adding:

* the target module `o_proj` in LoRA fine-tuning
* the **SpaceOm** [dataset](https://huggingface.co/datasets/salma-remyx/SpaceOm) for longer reasoning traces
* the **Robo2VLM-Reasoning** [dataset](https://huggingface.co/datasets/salma-remyx/Robo2VLM-Reasoning) for more robotics-domain and MCVQA examples

The choice to include `o_proj` among the LoRA target modules was inspired by [this study](https://arxiv.org/pdf/2505.20993v1), which argues for the importance of this module in reasoning models.

The reasoning traces in the SpaceThinker dataset average ~200 "thinking" tokens, so we've now included longer reasoning traces in the training data to help the model use more tokens when reasoning.

To improve alignment for robotics applications, we've also trained on synthetic reasoning traces derived from the **Robo2VLM-1** [dataset](https://huggingface.co/datasets/keplerccc/Robo2VLM-1).
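For concreteness, the adapter setup described here (and detailed under [Training Configuration](#training-spaceom)) corresponds roughly to the `peft` sketch below. This is a minimal illustration, not the packaged `train.py`; the dtype and `task_type` choices are assumptions.

```python
# Hedged sketch of the LoRA adapter config: rank 128, alpha 256, with o_proj
# added alongside the usual q/v projections. See train.py for the real setup.
import torch
from peft import LoraConfig, get_peft_model
from transformers import Qwen2_5_VLForConditionalGeneration

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "UCSC-VLAA/VLAA-Thinker-Qwen2.5VL-3B", torch_dtype=torch.bfloat16
)

lora_config = LoraConfig(
    r=128,                                          # LoRA rank
    lora_alpha=256,                                 # LoRA alpha
    target_modules=["q_proj", "v_proj", "o_proj"],  # o_proj included for reasoning
    task_type="CAUSAL_LM",                          # assumption for this sketch
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # only the adapter weights are trained
```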
## Running SpaceOm

### Ollama

To launch with Ollama, run:

```bash
ollama run hf.co/remyxai/SpaceOm:latest
```

or

```bash
ollama run remyxai/spaceom
```

### llama.cpp

To run locally with **llama.cpp**, install and build this [branch](https://github.com/HimariO/llama.cpp.qwen2.5vl/tree/qwen25-vl) and download the [.gguf weights here](https://huggingface.co/remyxai/SpaceThinker-Qwen2.5VL-3B/tree/main/gguf):

```bash
./llama-qwen2vl-cli -m spaceom-F16.gguf --mmproj spaceom-vision.gguf \
  --image images/example_1.jpg --threads 24 -ngl 9 \
  -p "Does the man in blue shirt working have a greater height compared to the wooden pallet with boxes on floor?"
```

### Transformers

Run locally using **Transformers**:

```python
import torch
from PIL import Image
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
import requests
from io import BytesIO

# Configuration
model_id = "remyxai/SpaceOm"
image_path = "images/example_1.jpg"  # local path or URL
prompt = "What can you infer from this image about the environment?"
system_message = (
    "You are VL-Thinking 🤔, a helpful assistant with excellent reasoning ability. "
    "You should first think about the reasoning process and then provide the answer. "
    "Use <think>...</think> and <answer>...</answer> tags."
)

# Load model and processor
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16
)
processor = AutoProcessor.from_pretrained(model_id)

# Load and preprocess image
if image_path.startswith("http"):
    image = Image.open(BytesIO(requests.get(image_path).content)).convert("RGB")
else:
    image = Image.open(image_path).convert("RGB")
if image.width > 512:
    ratio = image.height / image.width
    image = image.resize((512, int(512 * ratio)), Image.Resampling.LANCZOS)

# Format input
chat = [
    {"role": "system", "content": [{"type": "text", "text": system_message}]},
    {"role": "user", "content": [{"type": "image", "image": image}, {"type": "text", "text": prompt}]},
]
text_input = processor.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

# Tokenize
inputs = processor(text=[text_input], images=[image], return_tensors="pt").to("cuda")

# Generate response
generated_ids = model.generate(**inputs, max_new_tokens=1024)
output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print("Response:\n", output)
```

## Dataset Info

The [SpaceThinker](https://huggingface.co/datasets/remyxai/SpaceThinker) dataset includes over 12K samples synthesized using VQASynth on a subset of images from the localized narratives split of [the Cauldron](https://huggingface.co/datasets/HuggingFaceM4/the_cauldron). **SpaceThinker** is formatted similarly to the [Llama-Nemotron-Post-Training-Dataset-v1](https://huggingface.co/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset) so that reasoning can be toggled.

The [SpaceOm](https://huggingface.co/datasets/remyxai/SpaceOm) dataset includes ~1K samples synthesized using VQASynth with longer reasoning traces.

The [Robo2VLM-Reasoning](https://huggingface.co/datasets/remyxai/Robo2VLM-Reasoning) dataset is a subset of the original [Robo2VLM-1](https://huggingface.co/datasets/keplerccc/Robo2VLM-1) dataset, modified to include reasoning traces.

These datasets were combined to create the final training data for this model, as sketched below.
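A minimal sketch of that combination step with the 🤗 `datasets` library might look like the following; the `train` split names and the column-intersection step are illustrative assumptions, not the packaged preprocessing code:

```python
# Hypothetical sketch of building one SFT mix from the three sources.
# Split names and column alignment are assumptions; see the dataset cards for real schemas.
from datasets import load_dataset, concatenate_datasets

space_thinker = load_dataset("remyxai/SpaceThinker", split="train")
space_om = load_dataset("remyxai/SpaceOm", split="train")
robo2vlm_reasoning = load_dataset("remyxai/Robo2VLM-Reasoning", split="train")

# Keep only the columns shared by all three so concatenation is well-defined.
shared_cols = (
    set(space_thinker.column_names)
    & set(space_om.column_names)
    & set(robo2vlm_reasoning.column_names)
)
parts = [
    ds.remove_columns([c for c in ds.column_names if c not in shared_cols])
    for ds in (space_thinker, space_om, robo2vlm_reasoning)
]

train_mix = concatenate_datasets(parts).shuffle(seed=42)
print(train_mix)
```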
The model builds upon the ideas from [SpatialVLM (Chen et al., 2024)](https://spatial-vlm.github.io/), introducing synthetic reasoning traces grounded in a 3D scene reconstruction pipeline using **Molmo, VGGT, and SAM2**.

## Training SpaceOm

**PEFT Configuration**

- Architecture: Qwen2.5-VL-3B
- Base model: UCSC-VLAA/VLAA-Thinker-Qwen2.5VL-3B
- Method: LoRA finetuning (PEFT)
- LoRA Alpha: 256
- LoRA Rank: 128
- Target Modules: q_proj, v_proj, o_proj
- Optimizer: AdamW (lr=2e-5), batch size = 1, epochs = 3
- Max input length: 1024 tokens

Reproduce LoRA SFT training with the included script:

```bash
python train.py
```

## Model Evaluation

### OmniSpatial

Benchmark leaderboard with **SpaceOm** highlighted.

| Model | Avg | Manip | Motion | Traffic | Locate | Geospatial | Pattern | Geometric | Ego | Allo | Hypo |
|-----------------------------|--------|--------|--------|---------|--------|------------|---------|-----------|--------|--------|--------|
| 🥇 o3-2025-04-16 | 56.33 | 71.89 | 66.18 | 61.18 | 68.57 | 65.45 | 40.21 | 29.68 | 77.06 | 48.40 | 48.19 |
| 🥈 Gemini-2.5-pro-preview-05-06 | 55.19 | 67.57 | 71.39 | 62.35 | 75.24 | 64.55 | 43.30 | 34.84 | 74.51 | 38.03 | 37.35 |
| 🥉 Gemini-2.5-flash-thinking-05-20 | 53.16 | 70.27 | 64.74 | 61.18 | 72.38 | 58.18 | 35.05 | 36.13 | 74.12 | 40.96 | 32.53 |
| o4-mini-04-16 | 52.77 | 72.97 | 59.83 | 60.00 | 73.33 | 61.82 | 34.02 | 36.77 | 73.53 | 40.69 | 40.96 |
| Gemini-2.5-flash-preview-05-20 | 52.12 | 67.57 | 62.72 | 68.24 | 73.33 | 60.91 | 38.14 | 34.19 | 75.49 | 35.90 | 33.73 |
| GPT-4.1-2025-04-14 | 51.78 | 66.22 | 64.74 | 60.00 | 65.33 | 60.18 | 31.75 | 30.06 | 70.98 | 40.64 | 39.04 |
| o1-2024-12-17 | 50.36 | 71.62 | 60.98 | 57.65 | 63.81 | 60.00 | 39.18 | 27.10 | 71.57 | 38.03 | 36.14 |
| InternVL3-78B | 49.33 | 63.78 | 63.12 | 56.24 | 59.24 | 51.45 | 27.63 | 30.19 | 74.51 | 38.46 | 35.90 |
| GPT-4.1-mini-2025-04-14 | 48.87 | 64.32 | 56.53 | 59.06 | 60.19 | 56.36 | 29.28 | 30.19 | 72.55 | 39.57 | 39.28 |
| Claude-3-7-thinking-20250219 | 48.62 | 57.21 | 59.73 | 53.73 | 67.94 | 57.27 | 30.24 | 28.17 | 68.63 | 37.94 | 36.95 |
| InternVL3-38B | 48.48 | 63.42 | 63.58 | 54.59 | 58.29 | 50.55 | 29.90 | 28.52 | 72.16 | 36.76 | 33.49 |
| Gemini-2.0-flash-exp | 48.40 | 61.89 | 56.01 | 51.76 | 63.43 | 59.09 | 20.82 | 33.81 | 72.75 | 39.20 | 39.28 |
| Qwen-VL2.5-72B | 47.85 | 58.38 | 60.12 | 50.12 | 59.81 | 53.64 | 26.19 | 33.03 | 71.37 | 36.81 | 36.39 |
| GPT-4o-2024-11-20 | 47.81 | 65.54 | 57.23 | 56.47 | 52.38 | 54.09 | 26.29 | 25.48 | 75.98 | 39.49 | 39.76 |
| Claude-3-7-sonnet-20250219 | 47.53 | 57.57 | 55.95 | 56.71 | 63.81 | 59.09 | 29.48 | 28.39 | 72.16 | 36.06 | 36.63 |
| Qwen-VL2.5-32B | 47.36 | 63.06 | 55.09 | 51.76 | 66.29 | 56.91 | 26.39 | 27.48 | 68.04 | 37.50 | 40.24 |
| Claude-3-5-sonnet-20241022 | 46.86 | 54.05 | 54.57 | 58.12 | 68.38 | 53.09 | 26.60 | 31.74 | 70.00 | 34.79 | 39.52 |
| InternVL3-14B | 45.94 | 54.32 | 60.17 | 50.35 | 51.81 | 51.45 | 28.04 | 28.26 | 68.04 | 35.37 | 34.46 |
| LLaVA-onevision-qwen2-72B | 45.66 | 62.16 | 50.29 | 54.12 | 60.95 | 56.36 | 22.68 | 25.81 | 76.47 | 37.23 | 33.73 |
| SoFar-Qwen2.5-3B | 45.14 | 56.49 | 51.16 | 54.12 | 53.14 | 52.73 | 31.75 | 22.88 | 71.60 | 36.56 | 41.69 |
| Gemma-3-27B | 44.75 | 56.76 | 55.78 | 57.65 | 50.48 | 52.73 | 27.84 | 29.03 | 64.71 | 33.51 | 32.53 |
| Gemini-2.0-flash-lite | 44.03 | 59.19 | 46.71 | 60.24 | 49.52 | 53.27 | 21.65 | 31.23 | 66.47 | 36.81 | 38.80 |
| Gemma-3-12B | 43.71 | 54.05 | 54.91 | 54.12 | 47.62 | 45.45 | 16.49 | 30.32 | 63.73 | 36.70 | 33.73 |
| GPT-4o-mini-2024-07-18 | 42.64 | 55.95 | 50.29 | 54.59 | 43.43 | 44.91 | 22.47 | 29.42 | 61.57 | 36.76 | 34.22 |
| GPT-4.1-nano-2025-04-14 | 42.62 | 50.90 | 53.85 | 54.90 | 40.95 | 42.42 | 24.40 | 30.11 | 53.59 | 37.23 | 33.73 |
| 🧘‍♂️ **SpaceOm** | 41.79 | 51.89 | 47.98 | 50.82 | 39.62 | 43.64 | 27.63 | 27.61 | 70.00 | 35.74 | 33.73 |
| InternVL3-8B | 41.60 | 52.43 | 40.87 | 48.94 | 51.05 | 44.77 | 24.95 | 28.63 | 64.20 | 38.62 | 40.96 |
| SpaceThinker-Qwen2.5-3B | 40.42 | 47.84 | 53.06 | 43.29 | 35.43 | 38.73 | 24.33 | 28.00 | 58.04 | 35.11 | 31.08 |
| Qwen-VL2.5-3B | 40.30 | 55.41 | 47.51 | 46.12 | 42.29 | 44.73 | 32.16 | 23.87 | 59.41 | 33.30 | 30.84 |
| SpaceQwen2.5-VL-3B | 40.25 | 58.11 | 39.88 | 41.18 | 40.95 | 40.91 | 29.90 | 25.81 | 63.73 | 38.83 | 39.76 |
| Gemma-3-4B | 39.79 | 41.89 | 49.71 | 56.47 | 27.62 | 36.36 | 23.71 | 24.52 | 59.80 | 36.17 | 38.55 |
| Qwen-VL2.5-7B | 39.18 | 58.38 | 35.09 | 50.12 | 45.33 | 44.00 | 31.13 | 29.42 | 64.51 | 33.19 | 37.35 |
| InternVL3-2B | 37.98 | 50.00 | 40.58 | 43.29 | 40.00 | 40.55 | 21.86 | 28.52 | 55.49 | 35.11 | 33.01 |
| SpaceMantis-13B | 36.36 | 47.03 | 36.59 | 40.94 | 34.86 | 33.09 | 22.27 | 24.39 | 49.22 | 38.25 | 39.28 |
| RoboPoint-vicuna-7B | 35.85 | 57.03 | 28.61 | 34.82 | 37.33 | 40.55 | 29.90 | 22.71 | 50.20 | 38.72 | 40.96 |
| LLaVA-onevision-qwen2-7B | 35.68 | 43.24 | 38.15 | 32.94 | 29.52 | 41.82 | 28.87 | 22.58 | 47.06 | 36.17 | 37.35 |
| SpatialBot-3B | 35.68 | 43.24 | 38.15 | 32.94 | 29.52 | 41.82 | 28.87 | 22.58 | 47.06 | 36.17 | 37.35 |
| LLaVA-1.5-vicuna-7B | 34.97 | 54.46 | 31.23 | 35.29 | 36.19 | 33.94 | 29.01 | 24.18 | 55.60 | 34.66 | 36.14 |
| RoboPoint-vicuna-13B | 34.60 | 55.68 | 28.15 | 42.82 | 32.19 | 32.55 | 24.12 | 27.74 | 49.02 | 37.66 | 33.49 |

See the full **SpaceOm** [results here](https://huggingface.co/datasets/salma-remyx/SpaceOm_OmniSpatial/blob/main/OmniSpatial_spaceom_results.json) for the **OmniSpatial** [benchmark](https://qizekun.github.io/omnispatial/).

### SpatialScore

Top scores in each category are **bolded** in this partial table of 3B/4B models.

| **Model** | **Overall** | **Count.** | **Obj.-Loc.** | **Pos.-Rel.** | **Dist.** | **Obj.-Prop.** | **Cam.&IT.** | **Tracking** | **Others** |
|------------------------|-------------|------------|----------------|----------------|-----------|----------------|---------------|---------------|------------|
| InternVL2.5-4B | 49.82 | **53.32** | **62.02** | **62.82** | **42.30** | 27.00 | 32.49 | 37.02 | **48.95** |
| **SpaceOm** | 48.15 | 47.84 | 55.24 | 61.83 | 41.48 | 30.97 | 32.94 | **37.20** | 43.74 |
| Qwen2.5-VL-3B | 47.90 | 46.62 | 55.55 | 62.23 | 37.53 | 32.59 | **35.85** | 36.90 | 42.19 |
| SpaceQwen2.5-VL-3B | 42.31 | 45.01 | 49.78 | 57.88 | 27.36 | **34.11** | 26.34 | 26.44 | 43.58 |
| SpatialBot-Phi2-3B | 41.65 | 53.25 | 54.32 | 55.40 | 27.12 | 26.10 | 24.21 | 27.57 | 41.66 |

See [all results](https://huggingface.co/datasets/salma-remyx/SpaceOm_SpatialScore) from evaluating **SpaceOm** on the **SpatialScore** [benchmark](https://haoningwu3639.github.io/SpatialScore/).

### SpaCE-10

Top scores in each category are **bolded** in this partial table of 3B/4B models.
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1YpIOjJFZ-Zaomg77ImeQHSqYBLB8T1Ce?usp=sharing)

| **Model** | **Overall** | **EQ** | **SQ** | **SA** | **OO** | **OS** | **EP** | **FR** | **SP** | **Source** |
|--------------------------|-------------|----------|----------|----------|----------|----------|----------|----------|----------|-------------|
| InternVL2.5-4B | **36.01** | **34.30** | 34.40 | 43.60 | 44.40 | 16.50 | **31.10** | **50.10** | **33.70** | Table |
| SpaceThinker | 32.72 | 32.73 | 24.81 | 47.26 | 50.33 | 33.63 | 9.25 | 37.54 | 26.25 | GPT Eval |
| **SpaceOm** | 32.32 | 32.47 | 24.81 | **47.63** | 50.00 | 32.52 | 9.12 | 37.04 | 25.00 | GPT Eval |
| SpaceQwen | 31.98 | 31.19 | 25.89 | 41.61 | **51.98** | **35.18** | 10.97 | 36.54 | 22.50 | GPT Eval |
| Qwen2.5-VL-3B-Instruct | 30.00 | 31.70 | **45.50** | 39.00 | 43.00 | 25.30 | 11.50 | 22.80 | 21.20 | Table |

**Legend:**

- EQ: Entity Quantification
- SQ: Scene Quantification
- SA: Size Assessment
- OO: Object-Object spatial relations
- OS: Object-Scene spatial relations
- EP: Entity Presence
- FR: Functional Reasoning
- SP: Spatial Planning

> ℹ️ Note: Scores for SpaceQwen, SpaceThinker, and SpaceOm are generated via `gpt_eval_score` on the single-choice (`*-single`) versions of the SpaCE-10 benchmark tasks. Other entries reflect leaderboard accuracy scores from the official SpaCE-10 evaluation table.

Read more about the [SpaCE-10 benchmark](https://arxiv.org/pdf/2506.07966v1) or see the [results here](https://huggingface.co/datasets/salma-remyx/SpaceOm_SpaCE-10_Results/blob/main/20250611_041721_results.json).

## Limitations

- Performance may degrade in cluttered environments or under unusual camera perspectives.
- This model was fine-tuned using synthetic reasoning over an internet image dataset.
- Multimodal biases inherent to the base model (Qwen2.5-VL) may persist.
- Not intended for use in safety-critical or legal decision-making.

> Users are encouraged to evaluate outputs critically and consider fine-tuning for domain-specific safety and performance. Distances estimated using autoregressive
> transformers may help in higher-order reasoning for planning and behavior but may not be suitable replacements for measurements taken with high-precision sensors,
> calibrated stereo vision systems, or specialist monocular depth estimation models capable of more accurate, pixel-wise predictions and real-time performance.

## Citation

```
@article{chen2024spatialvlm,
  title   = {SpatialVLM: Endowing Vision-Language Models with Spatial Reasoning Capabilities},
  author  = {Chen, Boyuan and Xu, Zhuo and Kirmani, Sean and Ichter, Brian and Driess, Danny and Florence, Pete and Sadigh, Dorsa and Guibas, Leonidas and Xia, Fei},
  journal = {arXiv preprint arXiv:2401.12168},
  year    = {2024},
  url     = {https://arxiv.org/abs/2401.12168},
}

@misc{qwen2.5-VL,
  title  = {Qwen2.5-VL},
  url    = {https://qwenlm.github.io/blog/qwen2.5-vl/},
  author = {Qwen Team},
  month  = {January},
  year   = {2025}
}

@misc{vl-thinking2025,
  title        = {SFT or RL? An Early Investigation into Training R1-Like Reasoning Large Vision-Language Models},
  author       = {Hardy Chen and Haoqin Tu and Fali Wang and Hui Liu and Xianfeng Tang and Xinya Du and Yuyin Zhou and Cihang Xie},
  year         = {2025},
  publisher    = {GitHub},
  journal      = {GitHub repository},
  howpublished = {\url{https://github.com/UCSC-VLAA/VLAA-Thinking}},
}

@article{wu2025spatialscore,
  author  = {Wu, Haoning and Huang, Xiao and Chen, Yaohui and Zhang, Ya and Wang, Yanfeng and Xie, Weidi},
  title   = {SpatialScore: Towards Unified Evaluation for Multimodal Spatial Understanding},
  journal = {arXiv preprint arXiv:2505.17012},
  year    = {2025},
}
```