Journey to Reproduce Search-R1
Seungyoun Shin · 2025
Just wrapped up a deep-dive project to reproduce (and slightly beat) the original Search-R1 retrieval-augmented approach. Along the way, I uncovered a key insight: the use of reflective reasoning phrases (e.g., "think again," "re-verify") is strongly correlated with improvements in benchmark performance. Here's the ride.
What I did
- RL-tuned Qwen 2.5-3B-Instruct on a Wikipedia corpus with GRPO inside VERL.
- Implemented and tracked the usage of reflective reasoning phrases (e.g., "think again," "re-verify"); a minimal sketch of the tracking logic follows this list.
- Logged every run in Weights & Biases for full transparency (wandb report).
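For reference, here is a minimal sketch of how a reflective-phrase ratio could be computed over sampled rollouts. The phrase set and the `reflective_ratio` helper are illustrative assumptions, not the exact instrumentation used in the run.

```python
import re
from typing import Iterable

# Hypothetical phrase set; the list tracked in the actual run may differ.
REFLECTIVE = re.compile(
    r"think again|re-?verify|let'?s reconsider|upon close inspection",
    re.IGNORECASE,
)

def reflective_ratio(rollouts: Iterable[str]) -> float:
    """Fraction of rollouts that contain at least one reflective phrase."""
    texts = list(rollouts)
    if not texts:
        return 0.0
    hits = sum(1 for t in texts if REFLECTIVE.search(t))
    return hits / len(texts)
```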
Results (Pass@1)
| Dataset | Paper (Search-R1, Qwen2.5-3B-it) | This run |
| --- | --- | --- |
| NQ | 39.7 % | 40.6 % |
| TriviaQA | 56.5 % | 58.2 % |
| PopQA | 39.1 % | 42.0 % |
| HotpotQA | 33.1 % | 33.8 % |
| 2Wiki | 31.0 % | 33.2 % |
| Musique | 12.4 % | 11.1 % |
| Bamboogle | 23.2 % | 29.6 % |
Retrieved passages are masked from the loss, so the model learns when to search rather than memorising answers.
This work achieves an average score of 0.349, roughly a 1.29× improvement over the RAG baseline (0.270). Notably, it also performs slightly better than the original Search-R1 model.
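As a rough illustration of that masking (not the actual VERL code path, which applies the mask inside the RL objective rather than a plain LM loss), the sketch below sets the label of every token that falls inside a <tool_response> span to -100 so it is ignored by the loss; `mask_tool_responses` is an assumed helper name.

```python
import re

IGNORE_INDEX = -100  # positions with this label are skipped by the loss

def mask_tool_responses(text: str, tokenizer) -> dict:
    """Tokenize `text` and ignore every token overlapping a <tool_response> span."""
    enc = tokenizer(text, return_offsets_mapping=True, return_tensors="pt")
    labels = enc["input_ids"].clone()
    spans = [m.span() for m in re.finditer(r"<tool_response>.*?</tool_response>", text, re.S)]
    for i, (start, end) in enumerate(enc["offset_mapping"][0].tolist()):
        if any(s <= start and end <= e for s, e in spans):
            labels[0, i] = IGNORE_INDEX
    return {"input_ids": enc["input_ids"], "labels": labels}
```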
Reasoning style
<think> → <tool_call> → <tool_response> → <answer>
Reflective Behaviour
Reflective phrases (e.g., "think again," "re-verify," "let's reconsider") gradually increase in usage over the course of training. The figure compares the average score across 7 benchmarks (NQ, TriviaQA, PopQA, Musique, HotpotQA, Bamboogle, and 2wikimultihopqa) against the reflective-phrase ratio. Notably, the reflective-phrase ratio and the average score track each other closely, indicating that more reflective language corresponds with better benchmark performance.
You can observe self-verification lines such as "Upon close inspection" and "Think again," showing the model's built-in tendency to re-evaluate its own claims before answering.
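To put a number on that relationship, one could export the two logged series (reflective-phrase ratio and average score per checkpoint) from the wandb run as a CSV and compute a Pearson correlation. The file name and column names below are assumptions for illustration.

```python
import csv

def pearson(xs: list[float], ys: list[float]) -> float:
    """Plain Pearson correlation coefficient, no external dependencies."""
    n = len(xs)
    mx, my = sum(xs) / n, sum(ys) / n
    cov = sum((x - mx) * (y - my) for x, y in zip(xs, ys))
    sx = sum((x - mx) ** 2 for x in xs) ** 0.5
    sy = sum((y - my) ** 2 for y in ys) ** 0.5
    return cov / (sx * sy)

# Hypothetical export of per-checkpoint metrics from the wandb run.
with open("run_metrics.csv", newline="") as f:
    rows = list(csv.DictReader(f))
ratio = [float(r["reflective_phrase_ratio"]) for r in rows]
score = [float(r["avg_score"]) for r in rows]
print(f"Pearson r = {pearson(ratio, score):.3f}")
```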
Full inference script (drop next to the weights & run)
```python
#!/usr/bin/env python3
"""
Minimal multi-turn tool-calling demo for the Qwen2.5-3B Search-R1 model.
"""
from __future__ import annotations

import json
import re
import sys
from typing import List

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from duckduckgo_search import DDGS

SYS = "You are a helpful and harmless assistant."
USR_PRE = ("Answer the given question. You must conduct reasoning inside <think> and "
           "</think> first every time you get new information. After reasoning, if you "
           "lack knowledge, call a search engine by <tool_call> query </tool_call>. "
           "It returns results between <tool_response> and </tool_response>. "
           "Provide the final answer inside <answer>. For example, <answer> Beijing </answer>. Question: ")
MODEL = "Seungyoun/qwen2.5-3b-it_searchR1-like-multiturn"
MAX_TURNS, MAX_NEW = 4, 512
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Tool schema advertised to the chat template so the model emits structured calls.
SEARCH_SCHEMA = {
    "type": "function",
    "function": {
        "name": "search",
        "description": "DuckDuckGo web search",
        "parameters": {
            "type": "object",
            "properties": {
                "query_list": {
                    "type": "array",
                    "items": {"type": "string"},
                    "description": "Fully-formed semantic queries.",
                }
            },
            "required": ["query_list"],
        },
    },
}


def prompt(q: str) -> List[dict]:
    """Build the two-message chat that seeds every episode."""
    return [{"role": "system", "content": SYS},
            {"role": "user", "content": USR_PRE + q}]


def ddg(q: str, k: int = 5) -> str:
    """Run a DuckDuckGo text search and flatten the top-k hits into one string."""
    with DDGS() as ddgs:
        hits = list(ddgs.text(q, safesearch="moderate", max_results=k))
    return "\n".join(f"{i+1}. {h['title']} - {h['body']} ({h['href']})"
                     for i, h in enumerate(hits))


def parse_queries(raw: str) -> List[str]:
    """Extract query_list from a JSON tool call; fall back to the raw string."""
    try:
        p = json.loads(raw)
        if isinstance(p, dict) and p.get("name") == "search":
            return p.get("arguments", {}).get("query_list", [])
    except json.JSONDecodeError:
        pass
    return [raw]


def main() -> None:
    q = sys.argv[1] if len(sys.argv) > 1 else "How is the weather in Seoul?"
    tok = AutoTokenizer.from_pretrained(MODEL, padding_side="left")
    model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16, device_map="auto")

    hist = tok.apply_chat_template(prompt(q), tools=[SEARCH_SCHEMA],
                                   add_generation_prompt=True, tokenize=False)
    pat = re.compile(r"<tool_call>\s*(.*?)\s*</tool_call>", re.S)

    for turn in range(MAX_TURNS):
        enc = tok(hist, return_tensors="pt").to(DEVICE)
        out = model.generate(**enc, max_new_tokens=MAX_NEW, temperature=0.7, do_sample=True)
        delta = tok.decode(out[0][enc.input_ids.shape[1]:], skip_special_tokens=True)
        print(f"\n===== Assistant (turn {turn+1}) =====\n{delta}\n")
        hist += delta

        m = pat.search(delta)
        if not m:  # no tool call means the model has produced its final answer
            break
        # Execute every requested query and feed the results back as a tool response.
        results = "\n---\n".join(ddg(s, 5) for s in parse_queries(m.group(1)))
        hist += f"<tool_response>\n{results}\n</tool_response>"


if __name__ == "__main__":
    main()
```
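Assuming the script is saved as, say, `run_search_r1.py` (the filename is arbitrary) next to the downloaded weights, `python run_search_r1.py "your question here"` starts a multi-turn session; without an argument it falls back to the built-in Seoul-weather example.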
Running it prints each assistant turn: the <think> reasoning, any <tool_call> queries, the DuckDuckGo results injected as <tool_response>, and the final <answer>.
Try it yourself
Weights + script: https://huggingface.co/Seungyoun/qwen2.5-3b-it_searchR1-like-multiturn
Thanks
Qwen team, Search-R1 authors, VERL maintainers, and the open-source community that makes reproducibility possible.