Upload folder using huggingface_hub

Browse files

Files changed (15) hide show

__init__.py +6 -0
__pycache__/__init__.cpython-310.pyc +0 -0
inference/__init__.py +0 -0
inference/__pycache__/__init__.cpython-310.pyc +0 -0
inference/__pycache__/o1_searcher.cpython-310.pyc +0 -0
inference/__pycache__/r1_searcher.cpython-310.pyc +0 -0
inference/__pycache__/re_call.cpython-310.pyc +0 -0
inference/__pycache__/simpledeepsearch.cpython-310.pyc +0 -0
inference/__pycache__/zerosearch.cpython-310.pyc +0 -0
inference/o1_searcher.py +481 -0
inference/oss.py +195 -0
inference/r1_searcher.py +344 -0
inference/re_call.py +980 -0
inference/simpledeepsearch.py +417 -0
inference/zerosearch.py +249 -0

__init__.py ADDED Viewed

	@@ -0,0 +1,6 @@

+from .inference.re_call import ReCall
+from .inference.r1_searcher import R1Searcher, R1SearchConfig
+from .inference.zerosearch import ZeroSearchInference, ZeroSearchConfig
+from .inference.o1_searcher import O1Cfg, O1Searcher
+from .inference.simpledeepsearch import SDSCfg, SDSearcher
+__all__ = ["ReCall", "R1Searcher", "ZeroSearchInference", "ZeroSearchConfig", "R1SearchConfig", "O1Cfg", "O1Searcher", "SDSCfg", "SDSearcher"]

__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (613 Bytes). View file

inference/__init__.py ADDED Viewed

File without changes

inference/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (187 Bytes). View file

inference/__pycache__/o1_searcher.cpython-310.pyc ADDED Viewed

Binary file (15.9 kB). View file

inference/__pycache__/r1_searcher.cpython-310.pyc ADDED Viewed

Binary file (11.1 kB). View file

inference/__pycache__/re_call.cpython-310.pyc ADDED Viewed

Binary file (27.6 kB). View file

inference/__pycache__/simpledeepsearch.cpython-310.pyc ADDED Viewed

Binary file (13.7 kB). View file

inference/__pycache__/zerosearch.cpython-310.pyc ADDED Viewed

Binary file (7.86 kB). View file

inference/o1_searcher.py ADDED Viewed

	@@ -0,0 +1,481 @@

+#!/usr/bin/env python3
+"""o1_searcher_inference.py — Serper‑based Search‑o1 re‑implementation
+with *original* in‑house summarisation workflow, step‑replacement logic and
+bug‑fixes for duplicate queries / ValueError.
+"""
+from __future__ import annotations
+import os, re, json, time, string, pathlib
+from dataclasses import dataclass
+from typing import List, Dict, Optional, Tuple
+import requests, trafilatura
+import threading
+from openai import OpenAI, APIStatusError
+# -----------------------------------------------------------------------------
+# Optional NLTK sentence tokenizer (fallback to regex) -------------------------
+try:
+    from nltk.tokenize import sent_tokenize as _nltk_sent_tokenize  # type: ignore
+except Exception:  # NLTK not installed (or its import is broken)
+    _nltk_sent_tokenize = None
+def sent_tokenize(x: str):
+    """Split *x* into sentences, preferring NLTK and falling back to a regex.
+
+    NLTK reports missing ``punkt`` data with a ``LookupError`` raised when the
+    tokenizer is *called*, not when it is imported, so the fallback has to be
+    decided per call rather than only around the import.
+
+    Args:
+        x: Text to split.
+
+    Returns:
+        A list of sentence strings.
+    """
+    if _nltk_sent_tokenize is not None:
+        try:
+            return _nltk_sent_tokenize(x)
+        except LookupError:
+            # Tokenizer data is not downloaded – use the regex splitter below.
+            pass
+    return re.split(r"(?<=[.!?]) +", x)
+def _oa() -> OpenAI:
+    """Return the OpenAI client cached on the current thread object.
+
+    The client is created on first use and stored as the ``_oa`` attribute of
+    ``threading.current_thread()``, so each thread gets its own instance.
+    """
+    current = threading.current_thread()
+    try:
+        return current._oa
+    except AttributeError:
+        current._oa = OpenAI()
+        return current._oa
+# -----------------------------------------------------------------------------
+# Special tags & constants -----------------------------------------------------
+# Tags the thinker emits around a search query (see `O1Searcher._extract_query`).
+BEGIN_SEARCH_QUERY  = "<|begin_search_query|>"
+END_SEARCH_QUERY    = "<|end_search_query|>"
+# Tags wrapped around the summarised document that is fed back to the thinker.
+BEGIN_DOCUMENT_QUERY    = "<|begin_of_document|>"
+END_DOCUMENT_QUERY    = "<|end_of_document|>"
+# Reasoning, end-of-turn and final-answer delimiters.
+THINK_OPEN, THINK_CLOSE = "<think>", "</think>"
+EOS_TOKEN  = "<|im_end|>"
+ANSWER_OPEN, ANSWER_CLOSE = "<answer>", "</answer>"
+# Stop strings sent with every generation request.
+STOP_STRINGS = [END_SEARCH_QUERY, ANSWER_CLOSE, EOS_TOKEN, "<|endoftext|>"]
+# NOTE(review): not referenced by any active code visible here (only by the commented-out CLI).
+ALLOWED_DATASETS = {"musique", "frames", "simpleqa", "browsercomp"}
+# Local directory the tokenizer is loaded from.
+# NOTE(review): machine-specific absolute path – confirm before running elsewhere.
+TOKENIZER_DIR = "/home/fractal_admin/shreyas/models/Qwen3-4B"
+# ─────────────────────────  BASIC UTILS  ──────────────────────────────
+def retry(max_attempts: int = 4, sleep: int = 1, fallback=None):
+    """Build a decorator that retries a callable with a fixed delay.
+
+    Args:
+        max_attempts: How many times the wrapped callable is tried in total.
+        sleep: Seconds to wait between two attempts (not after the last one).
+        fallback: Value returned when every attempt raised, or when
+            ``max_attempts`` is not positive.
+
+    Returns:
+        A decorator. The wrapped callable never propagates ``Exception``
+        subclasses; it returns *fallback* instead.
+    """
+    import functools  # local import: this module does not import functools at top level
+    def decorator(func):
+        @functools.wraps(func)  # keep the wrapped function's name and docstring
+        def wrapper(*args, **kwargs):
+            for attempt in range(max_attempts):
+                try:
+                    return func(*args, **kwargs)
+                except Exception:
+                    # Deliberate best-effort: swallow the error and try again.
+                    if attempt < max_attempts - 1:
+                        time.sleep(sleep)
+            return fallback
+        return wrapper
+    return decorator
+# ───────────────────────── tokenizer ────────────────────────────────────────
+# Load the tokenizer from TOKENIZER_DIR once at import time; `O1Searcher` uses
+# it to count prompt tokens before each generation request.
+# NOTE(review): on failure this calls sys.exit(), which raises SystemExit in
+# whatever process imports this module – confirm that is intended for library use.
+try:
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
+except Exception as e:
+    import sys
+    sys.exit(f"❌  Could not load Qwen3 tokenizer: {e}")
+# -----------------------------------------------------------------------------
+# Helper functions -------------------------------------------------------------
+def remove_punc(t: str) -> str:
+    """Return *t* with every character of ``string.punctuation`` removed."""
+    strip_table = str.maketrans(dict.fromkeys(string.punctuation))
+    return t.translate(strip_table)
+# legacy alias kept for older checkpoints  ------------------------------------
+_nopunc = remove_punc
+def f1(a: set, b: set) -> float:
+    """Return the overlap score ``2·|a ∩ b| / (|a| + |b|)`` of two sets.
+
+    The score is ``0.0`` when the sets share no element (including when
+    either set is empty).
+    """
+    shared = len(a.intersection(b))
+    if not shared:
+        return 0.0
+    return 2 * shared / (len(a) + len(b))
+# legacy alias
+_f1 = f1
+def extract_snippet_ctx(text: str, snippet: str, win: int = 2500) -> str:
+    """Return the context around the sentence of *text* that best matches *snippet*.
+
+    Only the first 50 000 characters of *text* are considered. Each sentence is
+    scored against the snippet with `f1` on lower-cased, punctuation-free word
+    sets. If the best score exceeds 0.20, *win* characters on either side of
+    that sentence are returned; otherwise the first ``2 * win`` characters.
+    """
+    clipped = text[:50_000]
+    wanted = set(remove_punc(snippet.lower()).split())
+    scored = [
+        (f1(wanted, set(remove_punc(candidate.lower()).split())), candidate)
+        for candidate in sent_tokenize(clipped)
+    ]
+    # max() keeps the first of equally scored sentences, like a strict ">" scan.
+    top_score, top_sent = max(scored, key=lambda pair: pair[0], default=(0.0, None))
+    if top_score > 0.20 and top_sent:
+        start = clipped.find(top_sent)
+        return clipped[max(0, start - win): start + len(top_sent) + win]
+    return clipped[: 2 * win]
+# -----------------------------------------------------------------------------
+# Config dataclass -------------------------------------------------------------
+@dataclass
+class O1Cfg:
+    """Settings for `O1Searcher`: search backend, page fetching and sampling.
+
+    The Serper credential is no longer embedded in the source. It defaults to
+    the ``SERPER_API_KEY`` environment variable (read once, when this module is
+    imported) and can still be passed explicitly.
+    """
+    # Serper.dev API key, sent as the X-API-KEY header.
+    serper_api_key: str = os.environ.get("SERPER_API_KEY", "")
+    # Country / language codes forwarded to Serper as "gl" / "hl".
+    gl: str = "us"
+    hl: str = "en"
+    # Number of organic results requested per query.
+    top_k: int = 10
+    # Context window (in characters, each side) passed to extract_snippet_ctx.
+    max_doc_len: int = 3000
+    # Upper bounds on distinct searches and on generation turns in O1Searcher.run.
+    max_search: int = 10
+    max_turn: int = 15
+    # Try the reader proxy built from jina_tpl before fetching the page directly.
+    use_jina: bool = True
+    jina_tpl: str = "https://r.jina.ai/http://{}"
+    # generation params
+    temperature: float = 0.7
+    top_p: float = 0.8
+    # NOTE(review): top_k_sampling is not sent in any request visible in this file.
+    top_k_sampling: int = 20
+    rep_pen: float = 1.05
+    # Token budget from which the prompt length is subtracted before generating.
+    thinker_max_tokens: int = 32768
+    # Model name used by O1Searcher._summarise_openai.
+    summariser_model: str = "gpt-4o-mini"
+# -----------------------------------------------------------------------------
+# Serper search + page fetch ---------------------------------------------------
+def serper_search(q: str, num: int, key: str, gl="us", hl="en") -> List[Dict]:
+    """Query the Serper.dev search endpoint and return its ``organic`` results.
+
+    Raises whatever ``requests`` raises for transport errors or a non-2xx
+    status. Returns an empty list when the response has no ``organic`` key.
+    """
+    response = requests.post(
+        "https://google.serper.dev/search",
+        json={"q": q, "num": num, "gl": gl, "hl": hl},
+        headers={"X-API-KEY": key, "Content-Type": "application/json"},
+        timeout=20,
+    )
+    response.raise_for_status()
+    return response.json().get("organic", [])
+def fetch_page(url: str, cfg: O1Cfg, snippet: str = "") -> str:
+    """Fetch the readable text of *url*, optionally narrowed to *snippet*'s context.
+
+    The reader proxy (``cfg.jina_tpl``) is tried first when ``cfg.use_jina`` is
+    set. If it fails, errors out or returns 100 characters or fewer, the page
+    is fetched directly and extracted with trafilatura. Previously an exception
+    from the proxy request aborted the whole function and skipped the direct
+    fetch.
+
+    Returns:
+        The page text (cut down with `extract_snippet_ctx` when *snippet* is
+        given), or ``""`` on any failure – this is deliberately best-effort.
+    """
+    txt = ""
+    if cfg.use_jina:
+        try:
+            r = requests.get(cfg.jina_tpl.format(url), timeout=15)
+            if r.ok and len(r.text.strip()) > 100:
+                txt = r.text.strip()
+        except Exception:
+            # Proxy unavailable – fall through to the direct fetch below.
+            txt = ""
+    try:
+        if txt == "":
+            r = requests.get(url, timeout=15)
+            r.raise_for_status()
+            txt = trafilatura.extract(r.text, output_format="txt") or ""
+        if snippet:
+            txt = extract_snippet_ctx(txt, snippet, cfg.max_doc_len)
+        return txt
+    except Exception:
+        return ""
+# -----------------------------------------------------------------------------
+# replace_recent_steps  --------------------------------------------------------
+def replace_recent_steps(origin: str, patch: str) -> str:
+    """Merge the numbered ``Step N:`` entries of *patch* into those of *origin*.
+
+    Both inputs are split into steps at lines starting with ``Step N:``; text
+    before the first such line is dropped. A patch step containing
+    ``DELETE THIS STEP`` removes that step number, any other patch step
+    replaces or adds it. The surviving step bodies (without their ``Step N:``
+    prefix) are returned in ascending step order, separated by blank lines.
+    """
+    header = re.compile(r"Step\s+(\d+):\s*")
+    def collect(text: str) -> Dict[int, str]:
+        # Map step number -> raw lines; a repeated number restarts that step.
+        gathered = {}
+        active = None
+        for raw in text.splitlines():
+            hit = header.match(raw)
+            if hit is not None:
+                active = int(hit.group(1))
+                gathered[active] = [raw[hit.end():].strip()]
+            elif active is not None:
+                gathered[active].append(raw)
+        return {num: "\n".join(lines).strip() for num, lines in gathered.items()}
+    merged = collect(origin)
+    for num, body in collect(patch).items():
+        if "DELETE THIS STEP" in body:
+            merged.pop(num, None)
+        else:
+            merged[num] = body
+    return "\n\n".join(merged[num] for num in sorted(merged))
+# -----------------------------------------------------------------------------
+# Prompts ----------------------------------------------------------------------
+# from prompts import get_webpage_to_reasonchain_instruction  # keep original helper
+# -----------------------------------------------------------------------------
+# Main agent -------------------------------------------------------------------
+class O1Searcher:
+    """Search-o1 style agent: a thinker LLM that interleaves reasoning and web search.
+
+    `run` repeatedly asks the thinker endpoint for text. Whenever the text
+    contains a ``<|begin_search_query|> … <|end_search_query|>`` pair, the query
+    is searched with Serper, the first fetchable page is summarised by a second
+    model, and the summary is appended to the prompt as a user turn. The loop
+    ends on ``</answer>``, when no query is produced, or when the search or
+    turn limits of the config are reached.
+    """
+    # Endpoint of the summariser model used by `_generate_summary`.
+    SUMMARY_URL = "http://0.0.0.0:1241"
+    # Token ids the server may report as the matched stop, with their text form
+    # (presumably ids from the Qwen tokenizer – verify against the served model).
+    _STOP_TOKEN_IDS = {151645: "<|im_end|>", 151643: "<|endoftext|>"}
+    # Prompt template for `_summarise`; fields: prev_reasoning, search_query, document.
+    get_webpage_to_reasonchain_instruction = """**Task Instruction:**
+    You are tasked with reading and analyzing web pages based on the following inputs: **Previous Reasoning Steps**, **Current Search Query**, and **Searched Web Pages**. Your objective is to extract relevant and helpful information for **Current Search Query** from the **Searched Web Pages** and seamlessly integrate this information into the **Previous Reasoning Steps** to continue reasoning for the original question.
+    **Guidelines:**
+    1. **Analyze the Searched Web Pages:**
+    - Carefully review the content of each searched web page.
+    - Identify factual information that is relevant to the **Current Search Query** and can aid in the reasoning process for the original question.
+    2. **Extract Relevant Information:**
+    - Select the information from the Searched Web Pages that directly contributes to advancing the **Previous Reasoning Steps**.
+    - Ensure that the extracted information is accurate and relevant.
+    3. **Output Format:**
+    - **If the web pages provide helpful information for current search query:** Present the information beginning with **Final Information** as shown below.
+    **Final Information**
+    [Helpful information]
+    - **If the web pages do not provide any helpful information for current search query:** Output the following text.
+    **Final Information**
+    No helpful information found.
+    **Inputs:**
+    - **Previous Reasoning Steps:**
+    {prev_reasoning}
+    - **Current Search Query:**
+    {search_query}
+    - **Searched Web Pages:**
+    {document}
+    Now you should analyze each web page and find helpful information based on the current search query {search_query} and previous reasoning steps.
+    Return the Helpful information in the <information></information> tags
+    """
+    # Prompt template for `_summarise_openai`; fields: search_query, document.
+    # Written as adjacent literals: the earlier triple-quoted form leaked stray
+    # quote characters and source indentation into the prompt.
+    SUMMARY_PROMPT = (
+        "## Task Description:\n"
+        "Given the search query and the content of the searched webpage, "
+        "extract information relevant to the query and write one summary paragraph.\n\n"
+        "## Guidelines:\n"
+        "(1) The extracted content should be relevant to the query.\n"
+        "(2) The form of the extracted content **must be a summary paragraph** rather than a direct answer.\n"
+        "(3) If the webpage content is unrelated to the query, output \"None\".\n\n"
+        "## Output Format:\n"
+        "[Exacted Content]: <summary‑paragraph‑or‑None>\n\n"
+        "## Inputs:\n"
+        "[Search Query]\n{search_query}\n\n"
+        "[Webpage Content]\n{document}\n\n"
+        "## Output:\n"
+    )
+    # System prompt for the thinker.
+    # NOTE(review): the text promises 16 searches while O1Cfg.max_search defaults to 10.
+    sys_prompt_multiqa = (
+        "You are a reasoning assistant with the ability to perform web searches to help "
+        "you answer the user's question accurately. You have special tools:\n\n"
+        "- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
+        "Then, the system will search and analyze relevant web pages, then provide you with helpful information in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n\n"
+        "You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to 16.\n\n"
+        "Once you have all the information you need, continue your reasoning.\n\n"
+        "Example:\n"
+        "Question: \"Alice David is the voice of Lara Croft in a video game developed by which company?\"\n"
+        "Assistant thinking steps:\n"
+        "- I need to find out who voices Lara Croft in the video game.\n"
+        "- Then, I need to determine which company developed that video game.\n\n"
+        "Assistant:\n"
+        "<|begin_search_query|>Alice David Lara Croft voice<|end_search_query|>\n\n"
+        "(System returns processed information from relevant web pages)\n\n"
+        "Assistant thinks: The search results indicate that Alice David is the voice of Lara Croft in a specific video game. Now, I need to find out which company developed that game.\n\n"
+        "Assistant:\n"
+        "<|begin_search_query|>video game developed by Alice David Lara Croft<|end_search_query|>\n\n"
+        "(System returns processed information from relevant web pages)\n\n"
+        "Assistant continues reasoning with the new information...\n\n"
+        "Remember:\n"
+        "- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
+        "- When done searching, continue your reasoning.\n\n"
+        "Always give you final answer between <answer></answer> tags"
+    )
+    def __init__(self, cfg: O1Cfg, thinker_url: str):
+        """Store the config and thinker endpoint and set up empty caches.
+
+        Raises:
+            ValueError: If ``cfg.serper_api_key`` is empty.
+        """
+        if not cfg.serper_api_key:
+            raise ValueError("SERPER_API_KEY required")
+        self.cfg, self.model_url = cfg, thinker_url.rstrip("/")
+        # query -> Serper hits, and (url, snippet) -> fetched page text
+        self.search_cache: Dict[str, List[Dict]] = {}
+        self.page_cache: Dict[Tuple[str, str], str] = {}
+        self.openai = _oa()
+    # --- low‑level generation call ------------------------------------------
+    @staticmethod
+    def _restore_stop(generated: str, finish_reason: dict, suffix: str = "") -> str:
+        """Re-append the stop marker reported in *finish_reason* if it is missing.
+
+        Args:
+            generated: Text returned by the server.
+            finish_reason: The ``meta_info["finish_reason"]`` dict of the response.
+            suffix: Extra marker appended after a matched stop *string*.
+
+        Returns:
+            *generated*, ending with the stop marker when the run stopped on one.
+        """
+        if finish_reason.get("type") != "stop":
+            return generated
+        matched = finish_reason.get("matched")
+        if isinstance(matched, str) and matched in STOP_STRINGS:
+            for tail in (matched, suffix):
+                if tail and not generated.endswith(tail):
+                    generated += tail
+        elif matched in O1Searcher._STOP_TOKEN_IDS:
+            token_text = O1Searcher._STOP_TOKEN_IDS[matched]
+            if not generated.endswith(token_text):
+                generated += token_text
+        return generated
+    @retry(4, 1, fallback="")
+    def _generate(self, prompt: str) -> str:
+        """Ask the thinker endpoint to continue *prompt*.
+
+        Returns the generated text with its stop marker restored, or ``""`` if
+        all retry attempts failed (so callers can keep concatenating strings).
+        """
+        prompt_tokens = tokenizer(prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
+        # Leave a 100-token margin and never request a non-positive budget.
+        max_tokens_left = max(1, self.cfg.thinker_max_tokens - len(prompt_tokens) - 100)
+        response = requests.post(
+            f"{self.model_url}/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": self.cfg.temperature,
+                    "top_p": self.cfg.top_p,
+                    "max_new_tokens": max_tokens_left,
+                    "repetition_penalty": self.cfg.rep_pen,
+                    "stop": STOP_STRINGS,
+                },
+            },
+            timeout=60,
+        )
+        response.raise_for_status()  # surfaces HTTP errors to the retry wrapper
+        resp = response.json()
+        return self._restore_stop(resp["text"], resp["meta_info"]["finish_reason"])
+    def _summarise_openai(self, query: str, doc: str) -> str:
+        """Summarise *doc* for *query* with the OpenAI chat model from the config."""
+        prompt = self.SUMMARY_PROMPT.format(search_query=query, document=doc)
+        resp = self.openai.chat.completions.create(
+            model=self.cfg.summariser_model,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=1024,
+            temperature=0.0,
+        )
+        text = resp.choices[0].message.content or ""
+        return text.split("[Exacted Content]:")[-1].strip()
+    def _generate_summary(self, prompt: str) -> str:
+        """Ask the summariser endpoint (`SUMMARY_URL`) to continue *prompt*.
+
+        Returns the generated text; when generation stopped on a stop string,
+        that string and the end-of-turn token are appended.
+        """
+        resp = requests.post(
+            f"{self.SUMMARY_URL}/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": self.cfg.temperature,
+                    "max_new_tokens": 8192,
+                    "stop": STOP_STRINGS,
+                },
+            },
+            timeout=60,
+        ).json()
+        return self._restore_stop(resp["text"], resp["meta_info"]["finish_reason"], suffix=EOS_TOKEN)
+    # --- public entry -------------------------------------------------------
+    def run(self, question: str):
+        """Answer *question* with interleaved reasoning and search.
+
+        Returns:
+            ``(prompt, queries)``: the full transcript, including the system and
+            user turns, and the list of distinct search queries issued.
+        """
+        prompt = (
+            f"<|im_start|>system\n{self.sys_prompt_multiqa}<|im_end|>\n"
+            f"<|im_start|>user\n{question}<|im_end|>\n"
+            f"<|im_start|>assistant\n{THINK_OPEN}"
+        )
+        queries: List[str] = []
+        for _ in range(self.cfg.max_turn):
+            chunk = self._generate(prompt)
+            if not chunk:
+                break  # generation failed or produced nothing
+            prompt += chunk
+            if ANSWER_CLOSE in chunk:
+                break
+            query = self._extract_query(chunk)
+            if not query or len(queries) >= self.cfg.max_search:
+                break
+            if query in queries:
+                # Repeated query: no new search; the next turn continues from here.
+                continue
+            queries.append(query)
+            doc = self._retrieve_doc(query)
+            prev_reasoning = self._extract_reasoning(prompt)
+            # Feed the summary back as a user turn and reopen the thinker's reasoning.
+            prompt += (
+                "\n<|im_start|>user"
+                + self._summarise(prev_reasoning, query, doc)
+                + EOS_TOKEN
+                + "\n<|im_start|>assistant"
+                + THINK_OPEN
+            )
+        else:
+            # Turn budget exhausted without an answer.
+            prompt += f"{ANSWER_OPEN}I don't know.{ANSWER_CLOSE}"
+        return prompt, queries
+    # ---------------------------------------------------------------------
+    # helpers --------------------------------------------------------------
+    def _extract_query(self, txt: str) -> Optional[str]:
+        """Return the last search query in *txt*, or ``None`` if a tag is missing."""
+        if BEGIN_SEARCH_QUERY not in txt or END_SEARCH_QUERY not in txt:
+            return None
+        frag = txt.split(BEGIN_SEARCH_QUERY)[-1].split(END_SEARCH_QUERY)[0]
+        # strip quotes / ellipsis / tabs, and anything after a stray "<|" tag
+        return re.sub(r"[\"'…\t]", " ", frag.split("<|")[0]).strip()
+    def _retrieve_doc(self, query: str) -> str:
+        """Return the text of the first search hit that can be fetched, else ``""``."""
+        if query not in self.search_cache:
+            self.search_cache[query] = serper_search(query, self.cfg.top_k, self.cfg.serper_api_key,
+                                                     gl=self.cfg.gl, hl=self.cfg.hl)
+        for hit in self.search_cache[query]:
+            url, sn = hit.get("link", ""), hit.get("snippet", "")
+            if not url:
+                continue
+            key = (url, sn)
+            if key not in self.page_cache:
+                self.page_cache[key] = fetch_page(url, self.cfg, sn)
+            if self.page_cache[key]:
+                return self.page_cache[key]
+        return ""
+    def _summarise(self, prev: str, query: str, doc: str) -> str:
+        """Summarise *doc* with the summariser model and wrap it in document tags."""
+        rid_prompt = self.get_webpage_to_reasonchain_instruction.format(prev_reasoning = prev, search_query = query, document = doc)
+        # Real newlines here: the previous "\\n" put a literal backslash-n into the prompt.
+        chat = f"<|im_start|>user\n{rid_prompt}\n<|im_end|>\n<|im_start|>assistant\n"
+        resp = self._generate_summary(chat)
+        return BEGIN_DOCUMENT_QUERY + self._extract_summary(resp)  + END_DOCUMENT_QUERY
+    def _extract_summary(self, prompt: str) -> str:
+        """Pull the useful part out of the summariser's raw output.
+
+        Preference order: the content of the last ``<information>`` tag, then
+        the text between ``**Final Information**`` and ``<|im_end|>`` (which may
+        span several lines), then the raw output unchanged.
+        """
+        if "<information>" in prompt:
+            return prompt.split("<information>")[-1].split("</information>")[0]
+        match = re.search(r"\*\*Final Information\*\*\s*\n(.+?)<\|im_end\|>", prompt, re.DOTALL)
+        if match:
+            return match.group(1).strip()
+        return prompt
+    def _extract_reasoning(self, prompt: str) -> str:
+        """Return the text after the last ``<think>`` up to the next ``</think>``."""
+        return prompt.split(THINK_OPEN)[-1].split(THINK_CLOSE)[0] if THINK_OPEN in prompt else ""
+# -----------------------------------------------------------------------------
+# CLI -------------------------------------------------------------------------
+# if __name__ == "__main__":
+#     # import argparse, json
+#     # parser = argparse.ArgumentParser()
+#     # parser.add_argument("question"); parser.add_argument("--dataset", required=True, choices=sorted(ALLOWED_DATASETS)); parser.add_argument("--model-url", required

inference/oss.py ADDED Viewed

	@@ -0,0 +1,195 @@

+# recall_oss_harmony.py
+# ReCall loop using the Harmony chat format renderer, calling vLLM's
+# OpenAI-compatible **/v1/completions** endpoint with a rendered prompt.
+import json
+import re
+import time
+from functools import wraps
+from typing import List, Optional
+import requests
+from openai_harmony import (
+    load_harmony_encoding,
+    HarmonyEncodingName,
+    Role,
+    Message,
+    Conversation,
+)
+def retry(max: int = 5, sleep: float = 1.0, fallback=None):
+    """Build a decorator that retries a callable up to *max* times.
+
+    Every failed attempt is printed. After the last failure *fallback* is
+    returned instead of raising; between attempts the wrapper sleeps for
+    *sleep* seconds when that value is truthy.
+    """
+    def decorator(fn):
+        @wraps(fn)
+        def wrapper(*args, **kwargs):
+            attempt = 0
+            while attempt < max:
+                attempt += 1
+                try:
+                    return fn(*args, **kwargs)
+                except Exception as e:
+                    print(f"[retry] attempt {attempt}/{max} failed: {e}")
+                    if attempt == max:
+                        print(f"[retry] giving up – returning {fallback!r}")
+                        return fallback
+                    if sleep:
+                        time.sleep(sleep)
+        return wrapper
+    return decorator
+class ReCallOSSHarmony:
+    """ReCall tool-calling loop that renders its prompt with the Harmony encoding.
+
+    Each turn renders the conversation, posts it to ``{base_url}/completions``,
+    extracts ``<tool_call>…</tool_call>`` blocks from the reply, runs them through
+    ``{executor_url}/execute`` and feeds the results back as a user message.
+    """
+    # Captures the body of each <tool_call> block; the lookahead stops a match
+    # from running across a closing tag.
+    TOOL_CALL_RE = re.compile(r"<tool_call>((?:(?!</tool_call>).)*)</tool_call>", re.DOTALL)
+    def __init__(
+        self,
+        executor_url: str,
+        base_url: str,
+        model_name: str,
+        api_key: Optional[str] = None,
+        request_timeout: int = 120,
+    ):
+        """Store endpoints and credentials and load the Harmony encoding.
+
+        Args:
+            executor_url: Base URL of the tool executor; ``/execute`` is appended.
+            base_url: Base URL of the completion API; ``/completions`` is appended.
+            model_name: Value sent as ``model`` in completion requests.
+            api_key: Optional bearer token for the completion API.
+            request_timeout: Timeout in seconds for both HTTP services.
+        """
+        self.executor_url = executor_url.rstrip("/")
+        self.base_url = base_url.rstrip("/")
+        self.model_name = model_name
+        self.api_key = api_key
+        self.timeout = request_timeout
+        self.enc = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+    # ---------------- HTTP ----------------
+    def _headers(self):
+        """Return request headers, adding a bearer token when an API key is set."""
+        h = {"Content-Type": "application/json"}
+        if self.api_key:
+            h["Authorization"] = f"Bearer {self.api_key}"
+        return h
+    @retry(max=5, sleep=1, fallback={"choices": [{"text": ""}]})
+    def _complete(self, prompt: str, temperature: float, max_tokens: int, stop: Optional[List[str]] = None):
+        """POST one completion request and return the decoded JSON body.
+
+        A non-200 status raises ``RuntimeError`` inside the retry wrapper; after
+        five failed attempts the wrapper returns a response with empty text.
+        """
+        payload = {
+            "model": self.model_name,
+            "prompt": prompt,
+            "max_tokens": max_tokens,
+            "temperature": temperature,
+        }
+        if stop:
+            payload["stop"] = stop
+        resp = requests.post(
+            f"{self.base_url}/completions",
+            headers=self._headers(),
+            json=payload,
+            timeout=self.timeout,
+        )
+        if resp.status_code != 200:
+            raise RuntimeError(f"completions HTTP {resp.status_code}: {resp.text}")
+        return resp.json()
+    # ------------- tool plumbing ----------
+    @staticmethod
+    def _validate_tool_tags(s: str) -> bool:
+        """Return True when opening and closing tool tags pair up in order.
+
+        Checks only that the counts match and that the i-th opening tag precedes
+        the i-th closing tag.
+        """
+        starts = [m.start() for m in re.finditer(r"<tool_call>", s)]
+        ends = [m.start() for m in re.finditer(r"</tool_call>", s)]
+        if len(starts) != len(ends):
+            return False
+        return all(st < en for st, en in zip(starts, ends))
+    def extract_tool_calls(self, text: str) -> List[str]:
+        """Return the stripped body of every tool call in *text*, or ``[]`` if tags are unbalanced."""
+        if not self._validate_tool_tags(text):
+            return []
+        return [m.group(1).strip() for m in self.TOOL_CALL_RE.finditer(text)]
+    @staticmethod
+    def _format_tool_call(call_json_str: str) -> str:
+        """Turn a JSON tool call into source text of the form ``name(k=repr(v), ...)``.
+
+        Returns a string starting with ``error:`` when the JSON cannot be parsed
+        or lacks a ``name``.
+        """
+        try:
+            spec = json.loads(call_json_str)
+            fname = spec["name"]
+            args = spec.get("arguments", {}) or {}
+            args_str = ", ".join(f"{k}={repr(v)}" for k, v in args.items())
+            return f"{fname}({args_str})"
+        except Exception as e:
+            return f"error: parse tool call failed: {e}"
+    def _exec_one_call(self, env: str, call_json_str: str) -> str:
+        """Run one tool call on the executor and return a printable result.
+
+        Failures never raise: they come back as strings starting with ``error:``.
+        Truthy ``result``, ``output`` and ``error`` fields of the executor reply
+        are joined in that order; if none is present the result is ``"ok"``.
+        """
+        call_src = self._format_tool_call(call_json_str)
+        if call_src.startswith("error:"):
+            return call_src
+        try:
+            response = requests.post(
+                f"{self.executor_url}/execute",
+                json={"env": env, "call": call_src},
+                timeout=self.timeout,
+            )
+            if response.status_code != 200:
+                return f"error: executor HTTP {response.status_code}"
+            payload = response.json()
+            out = []
+            if payload.get("result"):
+                out.append(f"result:\n{payload['result']}")
+            if payload.get("output"):
+                out.append(f"output:\n{payload['output']}")
+            if payload.get("error"):
+                out.append(f"error:\n{payload['error']}")
+            return "\n".join(out).strip() or "ok"
+        except requests.exceptions.Timeout:
+            return "error: execution timed out"
+        except Exception as e:
+            return f"error: executor exception: {e}"
+    def execute_tool_calls(self, env: str, tool_calls: List[str]) -> List[str]:
+        """Execute *tool_calls* one after another and return their results in order."""
+        return [self._exec_one_call(env, c) for c in tool_calls]
+    # ------------- harmony helpers --------
+    def _render(self, messages: List[Message]) -> str:
+        """Render *messages* as a completion prompt with the assistant speaking next.
+
+        NOTE(review): the openai_harmony docs describe
+        ``render_conversation_for_completion`` as returning token ids, so the
+        ``str`` annotation here may be inaccurate – confirm against the library.
+        """
+        convo = Conversation.from_messages(messages)
+        # Render for a completion (assistant is the next speaker)
+        return self.enc.render_conversation_for_completion(convo, Role.ASSISTANT)
+    # ------------- main run loop ----------
+    @retry(max=5, sleep=1, fallback=("", []))
+    def run(
+        self,
+        env: str,
+        func_schemas,
+        question: str,
+        system_prompt: str = "",
+        temperature: float = 0.2,
+        max_tokens: int = 2048,
+        max_turns: int = 16,
+        stop: Optional[List[str]] = None,
+    ):
+        """Run the tool-calling loop for *question*.
+
+        Args:
+            env: Environment string forwarded to the executor with every call.
+            func_schemas: Tool schemas, JSON-encoded into ``system_prompt`` when it
+                has a ``{func_schemas}`` placeholder.
+            question: The user question.
+            system_prompt: System message, optionally a format template.
+            temperature: Sampling temperature for each completion.
+            max_tokens: Token limit for each completion.
+            max_turns: Maximum number of completion requests.
+            stop: Optional stop strings for the completion API.
+
+        Returns:
+            ``(transcript, all_tool_calls)``: the concatenated assistant outputs
+            and the raw body of every tool call. The retry wrapper returns
+            ``("", [])`` if the loop raises five times.
+
+        NOTE(review): because the whole loop is retried, an exception late in a
+        run re-executes earlier tool calls – confirm the executor tolerates that.
+        """
+        # Build the initial harmony conversation.
+        # Paste your full system prompt into `system_prompt` before calling.
+        # The schemas are substituted into a {func_schemas} placeholder; if
+        # formatting fails (e.g. other braces in the prompt) it is used verbatim.
+        try:
+            sys_msg = system_prompt.format(func_schemas=json.dumps(func_schemas, ensure_ascii=False))
+        except Exception:
+            sys_msg = system_prompt
+        messages: List[Message] = [
+            Message.from_role_and_content(Role.SYSTEM, sys_msg),
+            Message.from_role_and_content(Role.USER, question),
+        ]
+        transcript_chunks: List[str] = []
+        all_tool_calls: List[str] = []
+        for _ in range(max_turns):
+            prompt = self._render(messages)
+            resp = self._complete(prompt=prompt, temperature=temperature, max_tokens=max_tokens, stop=stop)
+            assistant_text = resp["choices"][0]["text"]
+            transcript_chunks.append(assistant_text)
+            messages.append(Message.from_role_and_content(Role.ASSISTANT, assistant_text))
+            # An opening <answer> tag ends the loop.
+            if "<answer>" in assistant_text:
+                break
+            tool_calls = self.extract_tool_calls(assistant_text)
+            all_tool_calls.extend(tool_calls)
+            if not tool_calls:
+                # Neither answer nor tool call: ask the model again.
+                continue
+            results = self.execute_tool_calls(env, tool_calls)
+            # Echo each call next to its result so the model can match them up.
+            tool_resp_block = "".join(
+                f"<tool_response>{tc}\n{res}\n</tool_response>\n"
+                for tc, res in zip(tool_calls, results)
+            )
+            messages.append(Message.from_role_and_content(Role.USER, tool_resp_block))
+        transcript = "".join(transcript_chunks)
+        return transcript, all_tool_calls

inference/r1_searcher.py ADDED Viewed

	@@ -0,0 +1,344 @@

+#!/usr/bin/env python3
+# r1_searcher_inference.py
+"""
+Faithful re‑implementation of the **R1‑Searcher** loop described in
+“R1‑Searcher: Incentivizing the Search Capability in LLMs via Reinforcement Learning” (Song et al., 2025).
+Key properties ─────────────────────────────────────────────────────────
+• The policy LLM (“thinker”) reasons inside <think> … </think>.
+• When it needs external knowledge it emits exactly **one** single‑triple
+  query inside <|begin_of_query|> … <|end_of_query|>.
+• The wrapper searches *English Wikipedia only* (via Serper.dev).
+• It summarises the *first* retrieved article that contains the query terms
+  and injects that summary between
+      <|begin_of_documents|> … <|end_of_documents|>
+  before handing control back to the thinker.
+• The loop stops when the thinker outputs </answer> or when the configurable
+  round‑limit is reached.
+The class is API‑compatible with the user’s existing `ReCall` wrapper so the
+same benchmarking harness can swap between them with a single flag.
+"""
+from __future__ import annotations
+import os
+import time
+from dataclasses import dataclass
+from typing import List, Optional
+import requests
+from bs4 import BeautifulSoup
+import trafilatura
+import wikipedia
+from urllib.parse import unquote
+from openai import OpenAI
+# Shared OpenAI client. The key comes from the OPENAI_API_KEY environment
+# variable: credentials must never be committed to source control, and the key
+# that was previously embedded here should be treated as leaked and revoked.
+client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
+# Local directory the tokenizer is loaded from.
+# NOTE(review): machine-specific absolute path – confirm before running elsewhere.
+TOKENIZER_DIR = "/home/fractal_admin/shreyas/models/Qwen3-4B"
+# ───────────────────────── tokenizer ────────────────────────────────────────
+try:
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
+except Exception as e:
+    import sys
+    sys.exit(f"❌  Could not load Qwen3 tokenizer: {e}")
+# ─────────────────────────  BASIC UTILS  ──────────────────────────────
+def retry(max_attempts: int = 4, sleep: int = 1, fallback=None):
+    """Tiny retry decorator with fixed back‑off."""
+    def decorator(func):
+        def wrapper(*args, **kwargs):
+            for i in range(max_attempts):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as exc:
+                    if i == max_attempts - 1:
+                        #print(f"[retry] {func.__name__} failed – giving up: {exc}")
+                        return fallback
+                    #print(f"[retry] {func.__name__}: attempt {i+1}/{max_attempts} → {exc}")
+                    time.sleep(sleep)
+        return wrapper
+    return decorator
+# ──────────────────────────  CONFIG  ──────────────────────────────────
+@dataclass
+class R1SearchConfig:
+    # Serper.dev parameters
+    serper_api_key: str = "7bfe51ead1a1766b656c1355b292d1d29c15c114"
+    serper_url: str = "https://google.serper.dev/search"
+    gl: str = "us"
+    hl: str = "en"
+    # Policy model endpoint (vLLM)
+    thinker_temperature: float = 0.0
+    thinker_max_tokens: int = 40960
+    # Loop / misc
+    max_rounds: int = 16
+    summariser_model: str = "gpt-4o-mini"
+# ─────────────────────────  R1‑Searcher  ──────────────────────────────
+class R1Searcher:
+    SYSTEM_PROMPT = """
+    You are a helpful assistant.
+    Given a question, you should answer it by first thinking about the reasoning
+    process in the mind and then providing the final answer.
+    The output format of reasoning process and final answer are enclosed within
+    <think> </think> and <answer> </answer> tags, respectively, i.e.,
+    "<think> reasoning process here </think>
+    <answer> final answer here </answer>".
+    During the thinking process, **you can perform searching for uncertain
+    knowledge** if necessary with the format of
+    "<|begin_of_query|> keyword_1 keyword_2 ... <|end_of_query|>".
+    **A query must involve only a single triple**.
+    Then, the search system will provide you with the retrieval information with
+    the format of "<|begin_of_documents|> ...search results... <|end_of_documents|>".
+    """.strip()
+    SUMMARY_PROMPT = (
+        """## Task Description:\n"
+        "Given the search query and the content of the searched webpage, "
+        "extract information relevant to the query and write one summary paragraph."\n\n"
+        "## Guidelines:\n"
+        "(1) The extracted content should be relevant to the query.\n"
+        "(2) The form of the extracted content **must be a summary paragraph** rather than a direct answer.\n"
+        "(3) If the webpage content is unrelated to the query, output \"None\".\n\n"
+        "## Output Format:\n"
+        "[Exacted Content]: <summary‑paragraph‑or‑None>\n\n"
+        "## Inputs:\n"
+        "[Search Query]\n{search_query}\n\n"
+        "[Webpage Content]\n{document}\n\n"
+        "## Output:\n"""
+    )
+    # Tag constants
+    EOS_TOKEN = "<|im_end|>"
+    THINK_OPEN = "<think>"
+    ANSWER_CLOSE = "</answer>"
+    Q_OPEN, Q_CLOSE = "<|begin_of_query|>", "<|end_of_query|>"
+    DOC_OPEN, DOC_CLOSE = "<|begin_of_documents|>", "<|end_of_documents|>"
+    # Stop strings – must match *exact* token sequences vLLM will see
+    STOP_TOKENS = [
+        "<|im_end|>",
+        "<|endoftext|>",
+       "<|end_of_query|>",
+       " <|end_of_query|>",
+       "<|end_of_query|>\n",
+       "<|end_of_query|>\n\n",
+       " <|end_of_query|>\n",
+       " <|end_of_query|>\n\n",
+    ]
+    # STOP_TOKENS = []
+    def __init__(self, cfg: R1SearchConfig, model_url):
+        self.cfg = cfg
+        self.openai = client
+        self._wiki = wikipedia
+        self._wiki.set_lang("en")
+        # Patch wikipedia lib to use a session with proper UA
+        sess = requests.Session()
+        sess.headers.update({"User-Agent": "r1-searcher-bot/1.0"})
+        self._wiki._http = sess
+        self.model_url=model_url
+    # ── public entry ─────────────────────────────────────────────────
+    def run(self, question: str) -> tuple[str, List[str]]:
+        prompt = (
+            f"<|im_start|>system\n{self.SYSTEM_PROMPT}<|im_end|>\n"
+            f"<|im_start|>user\n{question}<|im_end|>\n"
+            f"<|im_start|>assistant\n{self.THINK_OPEN}"
+        )
+        queries: List[str] = []
+        for _ in range(self.cfg.max_rounds):
+            model_out = self._call_thinker(prompt)
+            prompt += model_out
+            if self.ANSWER_CLOSE in model_out:
+                break
+            query = self._extract_query(model_out)
+            if not query:
+                break
+            queries.append(query)
+            doc_block = self._retrieve_block(query)
+            prompt += "<|im_start|>user\n" + doc_block + self.EOS_TOKEN + "<|im_start|>assistant\n" + self.THINK_OPEN  # continue loop
+        else:  # exceeded round cap
+            prompt += "<answer>I don't know.</answer><|im_end|>"
+        return prompt, queries
+    # ── thinker call ────────────────────────────────────────────────
+    # @retry()
+    def _call_thinker(self, prompt: str) -> str:
+        prompt_tokens = tokenizer(prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
+        max_tokens_left = self.cfg.thinker_max_tokens - len(prompt_tokens) - 100
+        resp = requests.post(
+            f"{self.model_url}/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": self.cfg.thinker_temperature,
+                    "max_new_tokens": 2048,#max_tokens_left,
+                    "stop": self.STOP_TOKENS,
+                    "repetition_penalty": 1.05,
+                },
+            },
+            timeout=60,
+        ).json()
+        generated = resp["text"]                       # what you have now
+        matched   = resp["meta_info"]["finish_reason"].get("matched")
+        reason = resp["meta_info"]["finish_reason"].get("type")
+        #print("-"*100)
+        #print(resp)
+        #print(matched)
+        #print("-"*100)
+        # ⇢ append the tag back only if it was removed
+        if reason == "stop" and matched in self.STOP_TOKENS:
+            if not "<|end_of_query|>" in generated:
+                generated += matched + self.EOS_TOKEN
+        if reason == "stop" and matched == 151645:
+             if not generated.endswith("<|im_end|>"):
+                generated += "<|im_end|>"
+        if reason == "stop" and matched == 151643:
+             if not generated.endswith("<|endoftext|>"):
+                generated += "<|endoftext|>"
+        return generated
+    # ── query helpers ───────────────────────────────────────────────
+    @staticmethod
+    def _extract_query(text: str) -> Optional[str]:
+        if R1Searcher.Q_OPEN not in text or R1Searcher.Q_CLOSE not in text:
+            return None
+        fragment = text.split(R1Searcher.Q_OPEN)[-1].split(R1Searcher.Q_CLOSE)[0]
+        #print("*"*10)
+        #print(fragment)
+        fragment = fragment.split("<|")[0] #handle end_of_query slipping
+        return (
+            fragment.replace("\t", " ")
+            .replace("\"", "")
+            .replace("'", "")
+            .replace("…", "")
+            .strip()
+        ) or None
+    # ── retrieval  & summary  ───────────────────────────────────────
+    def _retrieve_block(self, query: str) -> str:
+        wiki_links = self._serper_wiki_links(query)
+        for url in wiki_links[:3]:
+            text = self._get_wiki_text(url)
+            if not text:
+                continue
+            summary = self._summarise(query, text[:35000])
+            if summary.lower() != "none":
+                return f"{self.DOC_OPEN}\n{summary}\n{self.DOC_CLOSE}\n\n"
+        return f"{self.DOC_OPEN}\nNone\n{self.DOC_CLOSE}\n\n"
+    # --- Serper ------------------------------------------------------
+    @retry()
+    def _serper_wiki_links(self, q: str) -> List[str]:
+        headers = {"X-API-KEY": self.cfg.serper_api_key, "Content-Type": "application/json"}
+        payload = {"q": f"{q} site:en.wikipedia.org", "num": 10, "gl": self.cfg.gl, "hl": self.cfg.hl}
+        r = requests.post(self.cfg.serper_url, json=payload, headers=headers, timeout=20)
+        r.raise_for_status()
+        links = [
+            item.get("link")
+            for item in r.json().get("organic", [])
+            if item.get("link", "").startswith("https://en.wikipedia.org")
+        ]
+        return links
+    def extract_main_text(self, html: str) -> str:
+        txt = trafilatura.extract(html, output_format="txt") or ""
+        if len(txt) >= 500:
+            return txt
+        from readability import Document
+        soup = BeautifulSoup(Document(html).summary(), "lxml")
+        txt  = soup.get_text(" ", strip=True)
+        if len(txt) >= 400:
+            return txt
+        for tag in soup(["script", "style", "noscript"]):
+            tag.decompose()
+        return re.sub(r"\s+", " ", soup.get_text(" ").strip())
+    # --- fetch article ----------------------------------------------
+    def _get_wiki_text(self, url: str) -> str | None:
+        try:
+            # 1. Download
+            r = requests.get(url, timeout=10)
+            r.raise_for_status()
+            # 2. Extract main text
+            txt = self.extract_main_text(r.text).strip()
+            if not txt:
+                return None
+            # 3. Prepend article slug if it isn’t already in the body
+            slug = unquote(url.rsplit("/", 1)[-1]).replace("_", " ")
+            if slug.lower() not in txt.lower():
+                txt = f"{slug}\n\n{txt}"
+            # 4. Return the final value
+            return "[Retrieved from Wikipedia] " + txt
+        except Exception as e:
+            #print("Failed to fetch Wikipedia page %s: %s", url, e)
+            return None
+    # --- call OpenAI to summarise -----------------------------------
+    @retry(fallback="None")
+    def _summarise(self, query: str, doc: str) -> str:
+        prompt = self.SUMMARY_PROMPT.format(search_query=query, document=doc)
+        resp = self.openai.chat.completions.create(
+            model=self.cfg.summariser_model,
+            messages=[{"role": "user", "content": prompt}],
+            max_tokens=1024,
+            temperature=0.0,
+        )
+        text = resp.choices[0].message.content
+        return text.split("[Exacted Content]:")[-1].strip()
+# ───────────────────────────  CLI  ────────────────────────────────────
+if __name__ == "__main__":
+    import argparse, json
+    ap = argparse.ArgumentParser()
+    ap.add_argument("question", type=str, help="Natural‑language question")
+    ap.add_argument("--serper-key", type=str, help="Override SERPER_API_KEY env")
+    args = ap.parse_args()
+    cfg = R1SearchConfig(serper_api_key=args.serper_key or os.getenv("SERPER_API_KEY", ""))
+    agent = R1Searcher(cfg, OpenAI())
+    final_prompt, issued_queries = agent.run(args.question)
+    answer = final_prompt.split("<answer>")[-1].split("</answer>")[0]
+    #print("\nANSWER:", answer)
+    #print("\nQUERIES:", json.dumps(issued_queries, indent=2))

inference/re_call.py ADDED Viewed

	@@ -0,0 +1,980 @@

+import re
+import json
+import requests
+import time
+from typing import List
+from functools import wraps
+from together import Together          # pip install together
+from datetime import datetime          # only needed for retries / logging
+#     return decorator
+def retry(max: int = 10, sleep: int = 1, fallback=None):
+    """
+    Retry `max` times and, if still failing, return `fallback`
+    instead of raising.  This keeps outer loops alive.
+    """
+    def decorator(func):
+        @wraps(func)
+        def wrapper(*args, **kwargs):
+            for i in range(max):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as e:
+                    print(f"[retry] attempt {i+1}/{max} failed: {e}")
+                    if i == max - 1:                 # last try exhausted
+                        print(f"[retry] giving up – returning {fallback!r}")
+                        return fallback              # ← swallow the error
+                    if sleep:
+                        time.sleep(sleep)
+        return wrapper
+    return decorator
+class ReCall():
+    # System prompt used by init_prompt().  It is a str.format template whose
+    # only placeholder is {func_schemas}; literal braces are therefore doubled.
+    # It asks for <think> / <tool_call> tags and a final answer inside \boxed{}.
+    sys_prompt_websailor = """
+    You are a Web Information Seeking Master. Your task is to thoroughly seek the internet for information and provide accurate answers to questions. No matter how complex the query, you will not give up until you find the corresponding information.
+    In this environment you have access to a set of tools you can use to assist with the user query.
+    You may perform multiple rounds of function calls. In each round, you can call one or more functions.
+    As you proceed, adhere to the following principles:
+    1. **Persistent Actions for Answers**: You will engage in many interactions, delving deeply into the topic to explore all possible aspects until a satisfactory answer is found.
+    2. **Repeated Verification**: Before presenting a Final Answer, you will **cross-check** and **validate the information** you've gathered to confirm its accuracy and reliability.
+    3. **Attention to Detail**: You will carefully analyze each information source to ensure that all data is current, relevant, and from credible origins.
+    Here are available functions in JSONSchema format: \n```json\n{func_schemas}\n```
+    In your response, you need to first think about the reasoning process in the mind and then conduct function calling to get the information or perform the actions if needed. \
+    The reasoning process and function calling are enclosed within <think> </think> and <tool_call> </tool_call> tags. \
+    The results of the function calls will be given back to you after execution, \
+    and you can continue to call functions until you get the final answer for the user's question. \
+    Finally, if you have got the answer, enclose it within \\boxed{{}} with latex format and do not continue to call functions, \
+    i.e., <think> Based on the response from the function call, I get the weather information. </think> The weather in Beijing on 2025-04-01 is \\[ \\boxed{{20C}} \\].
+    For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
+    <tool_call>
+    {{"name": <function-name>, "arguments": <args-json-object>}}
+    </tool_call>
+    For Multiple Choice Question always give the final answer as one of the options whichever fits the best.s
+    Always give your answer as option id. and answer.
+    Example:
+    What is the Captial of India ?
+     \\[ \\boxed{{A. India}} \\]
+    """
+    # Variant of the web-sailor prompt that names <tool_calls_begin> /
+    # <tool_calls_end> as the tool-call tags.  Also a str.format template with
+    # the single placeholder {func_schemas}.
+    # NOTE(review): no method visible in this part of the file reads this
+    # attribute – confirm it is still needed.
+    sys_prompt_websailor_deepseek = """
+    You are a Web Information Seeking Master. Your task is to thoroughly seek the internet for information and provide accurate answers to questions. No matter how complex the query, you will not give up until you find the corresponding information.
+    In this environment you have access to a set of tools you can use to assist with the user query.
+    You may perform multiple rounds of function calls. In each round, you can call one or more functions.
+    As you proceed, adhere to the following principles:
+    1. **Persistent Actions for Answers**: You will engage in many interactions, delving deeply into the topic to explore all possible aspects until a satisfactory answer is found.
+    2. **Repeated Verification**: Before presenting a Final Answer, you will **cross-check** and **validate the information** you've gathered to confirm its accuracy and reliability.
+    3. **Attention to Detail**: You will carefully analyze each information source to ensure that all data is current, relevant, and from credible origins.
+    Here are available functions in JSONSchema format: \n```json\n{func_schemas}\n```
+    In your response, you need to first think about the reasoning process in the mind and then conduct function calling to get the information or perform the actions if needed. \
+    The reasoning process and function calling are enclosed within <think> </think> and <tool_calls_begin> <tool_calls_end> tags. \
+    The results of the function calls will be given back to you after execution, \
+    and you can continue to call functions until you get the final answer for the user's question. \
+    Finally, if you have got the answer, enclose it within \\boxed{{}} with latex format and do not continue to call functions, \
+    i.e., <think> Based on the response from the function call, I get the weather information. </think> The weather in Beijing on 2025-04-01 is \\[ \\boxed{{20C}} \\].
+    """
+    # sys_prompt_websailor_deepseek = """
+    #         You are a Web Information Seeking Master. Seek the internet thoroughly and provide accurate answers. You may use tools multiple times.
+    #         Principles:
+    #         1) Persistent Actions for Answers: explore deeply until you find satisfactory information.
+    #         2) Repeated Verification: cross-check and validate before the final answer.
+    #         3) Attention to Detail: ensure sources are current, relevant, and credible.
+    #         You have the following tools (JSONSchema):
+    #         ```json
+    #         {func_schemas}
+    #         Follow this EXACT tool-call I/O protocol.
+    #         TO CALL ONE OR MORE TOOLS:
+    #         Respond only with this block (no extra text before/after):
+    #         <｜tool▁call▁begin｜>function<｜tool▁sep｜>{tool_name}{args_json}
+    #         <｜tool▁call▁end｜>
+    #         ... (repeat <｜tool▁call▁begin｜>…<｜tool▁call▁end｜> for multiple tools)
+    #         <｜tool▁calls▁end｜><｜end▁of▁sentence｜>
+    #         HOW TOOL RESULTS ARRIVE:
+    #         I will send tool outputs back embedded inside a single user message, each wrapped like:
+    #         <tool_response>{one_tool_call_you_made}
+    #         {tool_return_text_or_json}
+    #         </tool_response>
+    #         WHAT TO DO NEXT:
+    #         If you still need info, emit another tool-calls block (same exact format).
+    #         If you have the final answer, output:
+    #         <answer> …your final answer… </answer>
+    #         and DO NOT call any more tools.
+    #         Important:
+    #         Do not expose your internal reasoning; keep thoughts private.
+    #         When emitting a tool-calls block, do not include any explanations, only the block specified above.
+    #         Arguments must be valid JSON.
+    #         Stop tokens to respect: <｜end▁of▁sentence｜>
+    #         """
+    # Generic tool-calling prompt (str.format template, placeholder
+    # {func_schemas}).  It asks for <think> / <tool_call> tags and for the final
+    # answer to be wrapped in <answer> </answer>.
+    system_prompt = """In this environment you have access to a set of tools you can use to assist with the user query. \
+    You may perform multiple rounds of function calls. \
+    In each round, you can call one or more functions. \
+    Here are available functions in JSONSchema format: \n```json\n{func_schemas}\n```
+    In your response, you need to first think about the reasoning process in the mind and then conduct function calling to get the information or perform the actions if needed. \
+    The reasoning process and function calling are enclosed within <think> </think> and <tool_call> </tool_call> tags. \
+    The results of the function calls will be given back to you after execution, \
+    and you can continue to call functions until you get the final answer for the user's question. You are encouraged to utilize as many function calls as possible. \
+    Finally, if you have got the answer, wrap it in <answer> </answer> **and do not call any more functions**, \
+    e.g. <think> Based on the tool results … </think> <answer>20 °C</answer>.
+    For each function call, return a JSON object with function name and arguments within <tool_call></tool_call> XML tags:
+    <tool_call>
+    {{"name": <function-name-1>, "arguments": <args-json-object>}}
+    </tool_call>"""
+    # Prompt variant that tells the model only the most recent <tool_response>
+    # stays in the conversation (str.format template, placeholder {func_schemas}).
+    # NOTE(review): presumably paired with _strip_old_tool_responses() – confirm
+    # against the caller.
+    system_prompt_budget = """
+        You are an autonomous reasoning agent with access to external tools.
+        The conversation will retain only the *most-recent* <tool_response> block; older ones disappear.
+        As soon as you receive tool results, extract the *essential facts tables links etc* that might be needed for later and restate them inside your <think> section.
+         **Never copy large bodies of text** or raw JSON from tool output into your visible reply; summarise instead.
+        ◎ **Workflow**
+        1. In every round, start with <think> … </think> to lay out your short reasoning.
+        2. If you need external information or an action, emit one or more <tool_call> … </tool_call> blocks (JSON spec below).
+        3. When the environment returns <tool_response>, continue reasoning; you may call more tools.
+        4. Once you can answer the user, wrap the final result in <answer> … </answer> and STOP calling tools.
+        ◎ **Tool call format** (do **not** restate the schema or any explanations):
+        <tool_call>
+        {{"name": <function-name-1>, "arguments": <args-json-object>}}
+        </tool_call>
+        Here are available functions in JSONSchema format: \n```json\n{func_schemas}\n```
+    """
+    # Fixed prompt that hard-codes the pubmed_search schema plus a worked example.
+    # It contains single { } braces, so it is NOT a str.format template and must
+    # be used verbatim (calling .format() on it would fail).
+    # NOTE(review): the last example <tool_call> below has a doubled quote
+    # (""Batson …), i.e. invalid JSON – confirm whether that is intended.
+    system_prompt_forcing_tool_call = """
+    In this environment you have access to a set of tools you can use to assist with the user query.
+    You may perform multiple rounds of function calls upto ten. In each round, you can call upto three functions.
+    ──────────────────────── AVAILABLE TOOLS ────────────────────────
+    ```json
+    [
+    {
+        "type": "function",
+        "function": {
+        "name": "pubmed_search",
+        "description": "Search PubMed for Medical related queries.",
+        "parameters": {
+            "type": "object",
+            "properties": {
+            "query":  { "type": "string",  "description": "Query to search for." },
+            "top_n":  { "type": "integer", "description": "Number of hits", "default": 3 }
+            },
+            "required": ["query"]
+        }
+        }
+    }
+    ]
+    ```
+    ────────────────────────────── RULES ──────────────────────────────
+    1. You MUST issue one pubmed_search tool call for each answer choice. Each query must relate the clinical context to that option.
+    2. You MAY NOT skip any option or decide based only on internal reasoning. Evidence must be retrieved for all choices.
+    3. You MAY issue follow-up tool calls if your reasoning leads you to need more evidence.
+    4. You MUST wrap all reasoning in <think> </think> tags and all tool usage in <tool_call> </tool_call> tags. Number of <tool_call> and </tool_call> tokens in the entire trace MUST always match.
+    5. Do NOT casually emit the  <tool_call> </tool_call> during reasoning unless explicitly calling a tool in the proper format.
+    5. Your final answer must be enclosed a single letter corresponding to the correct option enclosed in the <answer> </answer> tags. Do not output anything else inside these tags.
+    6. DO NOT use any other confusing tags like <thiking> or </thinking>.
+    7. Each <think> </think> block MUST be followed by a <tool_call> </tool_call> or <answer> </answer> or else the program will break without an answer.
+    ───────────────────── DUMMY EXAMPLE INTERLEAVED SKELETON ─────────────────────
+    <think>
+    We are presented with a 54-year-old woman with invasive ductal carcinoma of the breast and osteolytic lesions in the thoracic spine. This strongly suggests metastatic spread. Our task is to determine the most likely anatomical route of metastasis to the spine.
+    Let’s examine the given options:
+    A. Hemiazygos vein
+    B. Posterior intercostal veins
+    C. Batson’s vertebral venous plexus
+    D. Internal mammary lymphatics
+    We'll evaluate each option in turn using available literature and known anatomical pathways.
+    **Option A: Hemiazygos vein**
+    We begin by evaluating whether the hemiazygos vein could be involved in metastatic spread from breast cancer to the spine.
+    </think>
+    <tool_call>
+    {"name": "pubmed_search", "arguments": {"query": "breast cancer metastasis hemiazygos vein", "top_n": 2}}
+    </tool_call>
+    <tool_response>
+    ...
+    </tool_response>
+    <think>
+    There is limited or no strong evidence suggesting the hemiazygos vein is a common or primary route for vertebral metastasis from breast cancer.
+    Lets explore **Option B: Posterior intercostal veins**  and  **Option C: Batson’s vertebral venous plexus** and **Option D:Internal mammary lymphatics**
+    </think>
+    <tool_call>
+    {"name": "pubmed_search", "arguments": {"query": "posterior intercostal veins breast cancer spinal metastasis", "top_n": 3}}
+    </tool_call>
+    <tool_call>
+    {"name": "pubmed_search", "arguments": {"query": "Batson vertebral venous plexus breast cancer metastasis", "top_n": 3}}
+    </tool_call>
+    <tool_call>
+    {"name": "pubmed_search", "arguments": {"query": "Internal mammary lymphatics breast cancer metastasis", "top_n": 3}}
+    </tool_call>
+    <tool_response>
+    ...
+    </tool_response>
+    <think>
+    While the posterior intercostal veins may be involved in venous drainage, there is insufficient evidence to support them as a primary route for metastasis to the vertebral column.
+    where as Batson’s vertebral venous plexus — a valveless venous network that connects the thoracic and abdominal veins directly to the spine. I to find more specific information about option C.
+    </think>
+    <tool_call>
+    {"name": "pubmed_search", "arguments": {"query": ""Batson vertebral venous plexus breast cancer metastasis in people over 50", "top_n": 1}}
+    </tool_call>
+     <think>
+    After evaluating all four options, the most plausible route for breast cancer metastasis to the thoracic spine is clearly via  Batson’s vertebral venous plexus:
+    </think>
+    <answer>C</answer>
+    """
+    # STOP_TOKENS =STOP_TOKENS = ["<|im_end|>", "<|endoftext|>"
+    def __init__(self, executor_url: str):
+        """Remember the base URL of the tool-execution service.
+
+        ``execute_tool_calls`` POSTs to ``executor_url + '/execute'``.
+        """
+        self.executor_url = executor_url
+    def init_prompt(self, func_schemas, question):
+        system_prompt = f"<|im_start|>system\n{self.sys_prompt_websailor.format(func_schemas=func_schemas)}<|im_end|>"
+        user_prompt = f"<|im_start|>user\n{question}<|im_end|>"
+        assistant_prefix = f"<|im_start|>assistant\n<think>"
+        return system_prompt + "\n" + user_prompt + "\n" + assistant_prefix
+    def _strip_old_tool_responses(self, prompt: str) -> str:
+        TOOL_RESPONSE_RE = re.compile(r"<tool_response>.*?</tool_response>\s*", re.DOTALL)
+        """Remove every existing <tool_response> … </tool_response> block."""
+        return TOOL_RESPONSE_RE.sub("", prompt)
+    def cat_assistant_response(self, curr_prompt, assistant_response):
+        return curr_prompt + assistant_response + "<|im_end|>"
+    def cat_tool_results(self, curr_prompt, tool_calls, results):
+        tool_response_str = ""
+        for tool_call, result in zip(tool_calls, results):
+            tool_response_str += f"<tool_response>{tool_call}\n{result}\n</tool_response>\n"
+        tool_response_str = f"<|im_start|>user\n{tool_response_str}<|im_end|>"
+        assistant_prefix = f"<|im_start|>assistant\n<think>"
+        return curr_prompt + "\n" + tool_response_str + "\n" + assistant_prefix
+    def format_tool_call(self, tool_call_str: str):
+        """Convert JSON function call description to Python executable code string."""
+        try:
+            call_json = json.loads(tool_call_str)
+            func_name = call_json['name']
+            arguments = call_json.get('arguments', {})
+            args_str = ', '.join(f"{k}={repr(v)}" for k, v in arguments.items())
+            return f"{func_name}({args_str})"
+        except Exception as e:
+            return f"Parse tool call failed: {e}"
+    def execute_tool_calls(self, env: str, tool_calls: List[str]) -> List[str]:
+        def exe_tool_call(env, call):
+            url = self.executor_url + '/execute'
+            call_str = self.format_tool_call(call)
+            # print(call_str)
+            if call_str.startswith("error: parse tool call failed"):
+                return call_str
+            try:
+                data = {
+                    'env': env,
+                    'call': call_str
+                }
+                response = requests.post(url, json=data, timeout=60)
+                if response.status_code != 200:
+                    return f"error: {response.status_code}"
+                response = response.json()
+                ret_str = ''
+                if response['result']:
+                    ret_str += f'result: \n{response["result"]}\n'
+                if response['output']:
+                    ret_str += f'output: \n{response["output"]}\n'
+                if response['error']:
+                    ret_str += f'error: \n{response["error"]}\n'
+                return ret_str.strip()
+            except requests.exceptions.Timeout:
+                return "error: execution timed out"
+            except Exception as e:
+                return str(e)
+        results = []
+        for tool_call in tool_calls:
+            result = exe_tool_call(env, tool_call)
+            results.append(result)
+        return results
+    def validate_tool_calls(self, output_str):
+        start_tags = re.findall(r'<tool_call>', output_str)
+        end_tags = re.findall(r'</tool_call>', output_str)
+        if len(start_tags) != len(end_tags):
+            return False
+        start_positions = [m.start() for m in re.finditer(r'<tool_call>', output_str)]
+        end_positions = [m.start() for m in re.finditer(r'</tool_call>', output_str)]
+        for start, end in zip(start_positions, end_positions):
+            if start >= end:
+                return False
+        return True
+    def extract_tool_calls(self, output_str):
+        if not self.validate_tool_calls(output_str):
+            return []
+        try:
+            pattern = r'<tool_call>((?:(?!</tool_call>).)*)</tool_call>'
+            matches = re.finditer(pattern, output_str, re.DOTALL)
+            return [match.group(1).strip() for match in matches]
+        except Exception as e:
+            return []
+    def extract_tool_calls_deepseek(self, output_str):
+        if not self.validate_tool_calls(output_str):
+            return []
+        try:
+            pattern = r'<tool_calls_begin>((?:(?!</tool_calls_end>).)*)<tool_calls_end>'
+            matches = re.finditer(pattern, output_str, re.DOTALL)
+            return [match.group(1).strip() for match in matches]
+        except Exception as e:
+            return []
+    @retry(max=5, sleep=1, fallback={"score": 0})
+    def run_ii_searcher(
+        self,
+        env: str,
+        func_schemas: str,
+        question: str,
+        tokenizer,
+        model_url="http://0.0.0.0:1214",
+        temperature: float = 0.0,
+        max_new_tokens: int = 40960,
+        ):
+        curr_prompt = self.init_prompt(func_schemas, question)
+        all_tool_calls= []
+        for _ in range(16):
+            prompt_tokens = tokenizer(curr_prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
+            max_tokens_left = max_new_tokens - len(prompt_tokens) - 100
+            # for oss model served via vllm
+            # response = requests.post(
+            #     f'{model_url}/v1/chat/completions',
+            #     json={
+            #         "text": curr_prompt,
+            #         # "reasoning": "medium"
+            #         },
+            # ).json()
+            # for sglang served models hf models
+            response = requests.post(
+                f'{model_url}/generate',
+                json={
+                    "text": curr_prompt,
+                    "sampling_params": {
+                        "temperature": temperature,
+                        "max_new_tokens": max_tokens_left,
+                        "repetition_penalty": 1.05
+                    },
+                }
+            ).json()
+            if "error" in response.keys():
+                print("resp",response)
+            curr_prompt = self.cat_assistant_response(curr_prompt, response['text'])
+            tool_calls: List[str] = self.extract_tool_calls(response['text'])
+            all_tool_calls += tool_calls
+            if len(tool_calls) == 0:
+                break
+            else:
+                results: List[str] = self.execute_tool_calls(env, tool_calls)
+                curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
+        return curr_prompt, all_tool_calls
+    # @retry(max=5, sleep=1, fallback={"score": 0})
+    # def run(
+    #     self,
+    #     env: str,
+    #     func_schemas: str,
+    #     question: str,
+    #     tokenizer,
+    #     model_url="http://0.0.0.0:1214",
+    #     temperature: float = 0.0,
+    #     max_new_tokens: int = 40960,
+    #     ):
+    #     curr_prompt = self.init_prompt(func_schemas, question)
+    #     all_tool_calls= []
+    #     for i in range(32):
+    #         prompt_tokens = tokenizer(curr_prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
+    #         max_tokens_left = max_new_tokens - len(prompt_tokens) - 100
+    #         # for oss model served via vllm
+    #         # response = requests.post(
+    #         #     f'{model_url}/v1/chat/completions',
+    #         #     json={
+    #         #         "text": curr_prompt,
+    #         #         # "reasoning": "medium"
+    #         #         },
+    #         # ).json()
+    #         # for sglang served models hf models
+    #         response = requests.post(
+    #             f'{model_url}/generate',
+    #             json={
+    #                 "text": curr_prompt,
+    #                 "sampling_params": {
+    #                     "temperature": temperature,
+    #                     "max_new_tokens": max_tokens_left,
+    #                     "repetition_penalty": 1.05
+    #                 },
+    #             }
+    #         ).json()
+    #         if "error" in response.keys():
+    #             print("resp",response)
+    #         curr_prompt = self.cat_assistant_response(curr_prompt, response['text'])
+    #         tool_calls: List[str] = self.extract_tool_calls(response['text'])
+    #         all_tool_calls += tool_calls
+    #         if len(tool_calls) == 0:
+    #             break
+    #         else:
+    #             # print(f"Step-{i+1}")
+    #             results: List[str] = self.execute_tool_calls(env, tool_calls)
+    #             curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
+    #     return curr_prompt, all_tool_calls
+    from typing import List, Dict, Any, Tuple
+    import requests
+    @retry(max=5, sleep=1, fallback={"score": 0})
+    def run(
+        self,
+        env: str,
+        func_schemas: str,
+        question: str,
+        tokenizer,
+        model_url: str = "http://0.0.0.0:1214",
+        temperature: float = 0.0,
+        max_new_tokens: int = 40960,
+    ) -> Tuple[str, List[str], List[Dict[str, str]]]:
+        """
+        Returns:
+            curr_prompt: the final prompt buffer (with assistant/tool traces you maintain internally)
+            all_tool_calls: flat list of all tool call strings extracted across steps
+            chat: a lightweight chat transcript list[{"role": "...", "content": "..."}]
+                • 'user' items = the original question + aggregated tool responses
+                • 'assistant' items = model responses (and a compact line-list of tool calls)
+        """
+        # Build runtime prompt and initialize accumulators
+        curr_prompt = self.init_prompt(func_schemas, question)
+        all_tool_calls: List[str] = []
+        chat: List[Dict[str, str]] = []
+        # Seed transcript with JUST the question (no system prompt)
+        chat.append({"role": "user", "content": question})
+        for i in range(32):
+            # Budget tokens for this step
+            prompt_tokens = tokenizer(curr_prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
+            max_tokens_left = max(1, max_new_tokens - len(prompt_tokens) - 100)
+            # ---- Model call (sglang/vLLM-style JSON) ----
+            # If you switch to /v1/chat/completions, adjust accordingly.
+            response = requests.post(
+                f"{model_url}/generate",
+                json={
+                    "text": curr_prompt,
+                    "sampling_params": {
+                        "temperature": temperature,
+                        "max_new_tokens": max_tokens_left,
+                        "repetition_penalty": 1.05,
+                    },
+                },
+                timeout=300,
+            ).json()
+            if isinstance(response, dict) and "error" in response:
+                # Log the error as assistant text for visibility and break
+                err_msg = f"[model_error] {response.get('error')}"
+                chat.append({"role": "assistant", "content": err_msg})
+                break
+            assistant_text = response.get("text", "")
+            # Append assistant's raw text to chat
+            chat.append({"role": "assistant", "content": assistant_text})
+            # Update your running prompt with assistant text
+            curr_prompt = self.cat_assistant_response(curr_prompt, assistant_text)
+            # Extract tool calls from the assistant text
+            tool_calls: List[str] = self.extract_tool_calls(assistant_text)
+            if tool_calls:
+                all_tool_calls.extend(tool_calls)
+                # Log tool calls as an assistant message (newline-joined)
+                chat.append({"role": "assistant", "content": "\n".join(tool_calls)})
+                # Execute tools and collect results
+                results: List[str] = self.execute_tool_calls(env, tool_calls)
+                # Feed tool results back into prompt
+                curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
+                # Aggregate tool responses into a single user message
+                tool_res_blocks = []
+                for idx, (call, res) in enumerate(zip(tool_calls, results), 1):
+                    tool_res_blocks.append(f"[Tool {idx}] Result:\n{res}")
+                chat.append({"role": "user", "content": "\n\n".join(tool_res_blocks)})
+            else:
+                # No tool calls → model produced a final answer; stop.
+                break
+        # Return the original outputs plus the chat-style transcript
+        return curr_prompt, all_tool_calls, chat
+    @retry(max=5, sleep=1, fallback={"score": 0})
+    def run_deepseek(
+        self,
+        env: str,
+        func_schemas: str,
+        question: str,
+        model_name: str,
+        temperature: float = 0.0,
+        top_p: float = 0.95,
+        max_tokens: int = 32768,
+    ):
+        # print("AA"* 100)
+        """
+        Chat-based ReCall loop for DeepSeek-R1 on Together.
+        """
+        sys_content = self.sys_prompt_websailor_deepseek.format(func_schemas=func_schemas)
+        # sys_content = self.init_prompt(func_schemas, question)
+        messages = [
+            {"role": "system", "content": sys_content},
+            {"role": "user",   "content": question},
+        ]
+        # client = Together(api_key="")
+        client = Together(api_key="bcc761f7a821a80c9c5166171ebb36756cd16d505cec226c3b2259b846364000")
+        all_tool_calls = []
+        for turn in range(32):  # up to 10 reasoning turns
+            resp = client.chat.completions.create(
+                model=model_name,
+                # model="Qwen/Qwen3-235B-A22B-fp8-tput",
+                messages=messages,
+                temperature=temperature,
+                top_p=top_p,
+                max_tokens=39000,
+                stop=["<｜end▁of▁sentence｜>", "<|im_end|>"]
+            )
+            # print(resp)
+            assistant_text = resp.choices[0].message.content
+            # print(assistant_text)
+            messages.append({"role": "assistant", "content": assistant_text})
+            # print(f"assistant_output: {assistant_text}")
+            # ⛑ Safe tool call extraction with diagnostic
+            # try:
+            # print("Extracting tool calls")
+            tool_calls = self.extract_tool_calls_deepseek(assistant_text)
+            print(tool_calls)
+            all_tool_calls += tool_calls
+            # except Exception as e:
+            #     print(f"Extraction failed with exception {e}")
+            #     err_msg = f"<tool_response>Tool call extraction failed on turn {turn+1}: {str(e)}</tool_response>"
+            #     messages.append({"role": "user", "content": err_msg})
+            #     continue  # continue to next turn instead of breaking
+            if "<answer>" in assistant_text:
+                break
+            if len(tool_calls) != 0:
+                results = self.execute_tool_calls(env, tool_calls)
+                tool_resp_block = "".join(
+                    f"<tool_response>{c}\n{r}\n</tool_response>\n"
+                    for c, r in zip(tool_calls, results)
+                )
+                messages.append({"role": "user", "content": tool_resp_block})
+                # print(f"Tool Response {tool_resp_block}")
+            else:
+                print("no answer or tool call")
+                break
+        trajectory = "\n".join(
+            f"<{m['role']}>\n{m['content']}" for m in messages
+            if m["role"] != "system"
+        )
+        return trajectory, all_tool_calls
+         # ────────────────────────────────────────────────────────────────
+    # HF-endpoint version of “retrieve → inject → tool loop”
+    # ────────────────────────────────────────────────────────────────
+    @retry(max=5, sleep=1, fallback=None)
+    def run_with_prompt_injection(
+        self,
+        env: str,
+        func_schemas: str,
+        question: str,
+        model_url: str = "http://0.0.0.0:1214",
+        temperature: float = 0.0,
+        max_new_tokens: int = 512,
+        top_n: int = 5,
+    ):
+        """
+        0) call pubmed_search(question, top_n) once via the sandbox
+        1) inject those snippets into the very first user message
+        2) continue with the normal multi-turn ReCall loop against *model_url*
+        """
+        # 0️⃣ do a single retrieval tool call
+        retrieve_call = json.dumps({
+            "name": "pubmed_search",
+            "arguments": {"query": question, "top_n": top_n}
+        })
+        retrieval_raw = self.execute_tool_calls(env, [retrieve_call])[0]
+        try:
+            snippets_block = retrieval_raw.split("result:", 1)[-1].strip()
+        except Exception:
+            snippets_block = ""
+        # 1️⃣ build initial prompt with injected snippets
+        user_msg = (
+            f"Question: {question}\n\n"
+            "Here are some relevant PubMed snippets:\n"
+            f"{snippets_block}"
+        ) if snippets_block else f"Question: {question}"
+        sys_prompt = self.init_prompt(func_schemas, question)
+        system_prompt = f"<|im_start|>system\n{sys_prompt}<|im_end|>"
+        user_prompt   = f"<|im_start|>user\n{user_msg}<|im_end|>"
+        assistant_pref= f"<|im_start|>assistant\n<think>"
+        curr_prompt   = system_prompt + "\n" + user_prompt + "\n" + assistant_pref
+        # 2️⃣ normal ReCall loop hitting the HF inference endpoint
+        for _ in range(10):
+            resp = requests.post(
+                f"{model_url}/generate",
+                json={
+                    "text": curr_prompt,
+                    "sampling_params": {
+                        "temperature": temperature,
+                        "max_new_tokens": max_new_tokens,
+                    }
+                },
+                timeout=120,
+            ).json()
+            if "error" in resp.keys():
+                print("resp",response)
+            assistant_txt = resp["text"]
+            curr_prompt = self.cat_assistant_response(curr_prompt, assistant_txt)
+            tool_calls = self.extract_tool_calls(assistant_txt)
+            if  len(tool_calls) != 0:
+                # break  # model produced an answer → done
+                results = self.execute_tool_calls(env, tool_calls)
+                curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
+            else:
+                continue
+        return curr_prompt
+    @retry(max=5, sleep=1, fallback={"score": 0})
+    def run_budget(
+        self,
+        env: str,
+        func_schemas: str,
+        question: str,
+        model_url: str = "http://0.0.0.0:1214",
+        temperature: float = 0.0,
+        max_new_tokens: int = 2048,
+    ) -> str:
+        """
+        Execute an agentic dialogue with external tools while *pruning* previous
+        <tool_response> blocks to prevent context-length explosion.
+        """
+        curr_prompt = self.init_prompt(func_schemas, question)
+        for _ in range(16):  # hard loop-limit
+            # ── 1. Call the model
+            rsp = requests.post(
+                f"{model_url}/generate",
+                json={
+                    "text": curr_prompt,
+                    "sampling_params": {
+                        "temperature": temperature,
+                        "max_new_tokens": max_new_tokens,
+                        "stop": ["<|im_end|>", "</think>", "</think>\n" "</think>\n\n"],
+                    },
+                },
+                timeout=120,
+            ).json()
+            generated = rsp["text"]                       # what you have now
+            matched   = rsp["meta_info"]["finish_reason"].get("matched")
+            # ⇢ append the tag back only if it was removed
+            if matched and not generated.endswith(matched):
+                generated += matched
+            # Fail fast on server error
+            if "error" in rsp:
+                raise RuntimeError(rsp["error"])
+            assistant_text: str = rsp["text"]
+            curr_prompt = self.cat_assistant_response(curr_prompt, assistant_text)
+            # ── 2. Check for final answer ────────────────────────────────────
+            if "<answer>" in assistant_text:
+                break
+            # ── 3. Extract & execute tool calls ──────────────────────────────
+            tool_calls: List[str] = self.extract_tool_calls(assistant_text)
+            if not tool_calls:        # continue reasoning without calling a tool
+                continue
+            results: List[str] = self.execute_tool_calls(env, tool_calls)
+            # ── 4. BEFORE appending new tool output, drop all old ones ───────
+            curr_prompt =self. _strip_old_tool_responses(curr_prompt)
+            # ── 5. Append *only* the fresh tool_response block ───────────────
+            curr_prompt = self.cat_tool_results(curr_prompt, tool_calls, results)
+        return curr_prompt
+    def _strip_old_tool_responses_msgs(self, messages: list[dict]) -> list[dict]:
+        """
+        Return a copy of `messages` with every *user* message that starts with
+        <tool_response> removed.  Keeps assistant turns untouched.
+        """
+        return [
+            m for m in messages
+            if not (m["role"] == "user" and m["content"].lstrip().startswith("<tool_response>"))
+        ]
+    # ────────── budget version ──────────
+    @retry(max=5, sleep=1, fallback={"score": 0})
+    def run_deepseek_budget(
+        self,
+        env: str,
+        func_schemas: str,
+        question: str,
+        api_key: str,
+        model_name: str,
+        temperature: float = 0.0,
+        top_p: float = 0.95,
+        max_tokens: int = 32768,
+        max_turns: int = 10,
+    ):
+        """
+        Chat-based ReCall loop for DeepSeek-R1 **with context-budget pruning**.
+        Keeps only the *latest* <tool_response> block to avoid prompt bloat.
+        """
+        sys_content = self.system_prompt_budget.format(func_schemas=func_schemas)
+        messages = [
+            {"role": "system", "content": sys_content},
+            {"role": "user",   "content": question},
+        ]
+        client = Together(api_key=api_key)
+        for turn in range(max_turns):
+            # ── 1. model call ───────────────────────────────────────────────
+            resp = client.chat.completions.create(
+                model=model_name,
+                messages=messages,
+                temperature=temperature,
+                top_p=top_p,
+                max_tokens=max_tokens,
+                stop=["</tool_call>", "<｜end▁of▁sentence｜>"],
+            )
+            assistant_text = resp.choices[0].message.content
+            messages.append({"role": "assistant", "content": assistant_text})
+            print(f"**assistant** \n {assistant_text}")
+            # ── 2. finished? ────────────────────────────────────────────────
+            if "<answer>" in assistant_text:
+                break
+            # ── 3. parse tool calls ────────────────────────────────────────
+            tool_calls = self.extract_tool_calls(assistant_text)
+            print(f"**tool_calls** \n {tool_calls}")
+            if not tool_calls:
+                continue  # keep reasoning without tools
+            # ── 4. execute tools ───────────────────────────────────────────
+            results = self.execute_tool_calls(env, tool_calls)
+            print(f"**tool_response** \n {results}")
+            # ── 5. prune & append fresh tool_response ──────────────────────
+            messages = self._strip_old_tool_responses_msgs(messages)
+            tool_resp_block = "".join(
+                f"<tool_response>{c}\n{r}\n</tool_response>\n"
+                for c, r in zip(tool_calls, results)
+            )
+            messages.append({"role": "user", "content": tool_resp_block})
+        # ── 6. flatten & return trajectory (sans system for readability) ───
+        trajectory = "\n".join(
+            f"<{m['role']}>\n{m['content']}" for m in messages if m["role"] != "system"
+        )
+        return trajectory
+    @retry(max=5, sleep=1, fallback=None)
+    def run_deepseek_with_prompt_injection(
+        self,
+        env: str,
+        func_schemas: str,
+        question: str,
+        api_key: str,
+        model_name: str,
+        temperature: float = 0.0,
+        top_p: float = 0.95,
+        max_tokens: int = 32768,
+    ):
+        """
+        1) Call pubmed_search(question, top_n=5) as a tool to get snippets.
+        2) Inject them into the first user message.
+        3) Proceed with the usual DeepSeek-R1 tool‐based rollout.
+        """
+        # ── Step 0: prepare the single‐tool call for retrieval ───────────────
+        retrieve_call = json.dumps({
+            "name": "pubmed_search",
+            "arguments": {
+                "query": question,
+                "top_n": 5
+            }
+        })
+        # Execute it once via your helper
+        # note: `env` must include whatever import / client‐setup
+        #          your sandbox needs to run pubmed_search(...)
+        raw_retrieval_results = self.execute_tool_calls(env, [retrieve_call])[0]
+        # print("AAAAA"*100)
+        try:
+            snippets = raw_retrieval_results[9:] #"remove result: str"
+            # print(snippets)
+        except:
+            snippets = ""
+            # print(f"[ReCall] Retriever call failed to parse JSON, got:\n{raw_retrieval_results!r}")
+        # ── Step 1: build the injected user prompt ────────────────────────────
+        if snippets:
+            user_content = (
+                f"Question: {question}\n\n"
+                "Here are some relevant PubMed snippets:\n"
+                f"{snippets}"
+            )
+        else:
+            user_content = f"Question: {question}"
+        # ── Step 2: start the chat history ────────────────────────────────────
+        sys_content = self.system_prompt_forcing_tool_call
+        messages = [
+            {"role": "system",  "content": sys_content},
+            {"role": "user",    "content": user_content},
+        ]
+        client = Together(api_key=api_key)
+        # ── Step 3: your normal ReCall tool‐calling loop ─────────────────────
+        for turn in range(10):
+            resp = client.chat.completions.create(
+                model       = model_name,
+                messages    = messages,
+                temperature = temperature,
+                top_p       = top_p,
+                max_tokens  = max_tokens,
+                stop        = ["</tool_call>", "<｜end▁of▁sentence｜>"]
+            )
+            assistant_text = resp.choices[0].message.content
+            messages.append({"role": "assistant", "content": assistant_text})
+            tool_calls = self.extract_tool_calls(assistant_text)
+            if not tool_calls:
+                break
+            # Execute all of the tool calls in one go
+            results = self.execute_tool_calls(env, tool_calls)
+            # and append them back in the required <tool_response> format
+            tool_resp_block = "".join(
+                f"<tool_response>{call}\n{out}\n</tool_response>\n"
+                for call, out in zip(tool_calls, results)
+            )
+            messages.append({"role": "user", "content": tool_resp_block})
+        # ── Step 4: flatten to a single trajectory ────────────────────────────
+        trajectory = "\n".join(
+            f"<{m['role']}>\n{m['content']}"
+            for m in messages
+            if m["role"] != "system"
+        )
+        return trajectory

inference/simpledeepsearch.py ADDED Viewed

	@@ -0,0 +1,417 @@

+#!/usr/bin/env python3
+"""o1_searcher_inference.py — Serper‑based Search‑o1 re‑implementation
+with *original* in‑house summarisation workflow, step‑replacement logic and
+bug‑fixes for duplicate queries / ValueError.
+"""
+from __future__ import annotations
+import os, re, json, time, string, pathlib
+from dataclasses import dataclass
+from typing import List, Dict, Optional, Tuple
+import requests, trafilatura
+# -----------------------------------------------------------------------------
+# Optional NLTK sentence tokenizer (fallback to regex) -------------------------
+try:
+    from nltk.tokenize import sent_tokenize  # type: ignore
+except Exception:  # ImportError *or* missing punkt data
+    def sent_tokenize(x: str):
+        return re.split(r"(?<=[.!?]) +", x)
+# -----------------------------------------------------------------------------
+# Special tags & constants -----------------------------------------------------
+BEGIN_SEARCH_QUERY  = "<|begin_search_query|>"
+END_SEARCH_QUERY    = "<|end_search_query|>"
+BEGIN_DOCUMENT_QUERY    = "<|begin_of_document|>"
+END_DOCUMENT_QUERY    = "<|end_of_document|>"
+THINK_OPEN, THINK_CLOSE = "<think>", "</think>"
+EOS_TOKEN  = "<|im_end|>"
+ANSWER_OPEN, ANSWER_CLOSE = "<answer>", "</answer>"
+STOP_STRINGS = [END_SEARCH_QUERY, ANSWER_CLOSE, EOS_TOKEN, "<|endoftext|>"]
+ALLOWED_DATASETS = {"musique", "frames", "simpleqa", "browsercomp"}
+# tokenizer =
+TOKENIZER_DIR = "/home/fractal_admin/shreyas/models/Qwen3-4B"
+# ───────────────────────── tokenizer ────────────────────────────────────────
+try:
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
+except Exception as e:
+    import sys
+    sys.exit(f"❌  Could not load Qwen3 tokenizer: {e}")
+# -----------------------------------------------------------------------------
+# Helper functions -------------------------------------------------------------
+def remove_punc(t: str) -> str:
+    return t.translate(str.maketrans("", "", string.punctuation))
+# legacy aliases for older checkpoints  ---------------------------------------
+_nopunc = remove_punc
+def f1(a: set, b: set) -> float:
+    inter = len(a & b)
+    return 0.0 if inter == 0 else 2 * inter / (len(a) + len(b))
+# legacy alias
+_f1 = f1
+def extract_snippet_ctx(text: str, snippet: str, win: int = 2500) -> str:
+    """Return *window*‑sized context around the sentence most similar to snippet."""
+    text = text[:50_000]
+    sn_set = set(remove_punc(snippet.lower()).split())
+    best, best_score = None, 0.20
+    for sent in sent_tokenize(text):
+        score = f1(sn_set, set(remove_punc(sent.lower()).split()))
+        if score > best_score:
+            best, best_score = sent, score
+    if best:
+        pos = text.find(best)
+        return text[max(0, pos - win): pos + len(best) + win]
+    return text[: 2 * win]
+# -----------------------------------------------------------------------------
+# Config dataclass -------------------------------------------------------------
+@dataclass
+class SDSCfg:
+    serper_api_key: str = "7bfe51ead1a1766b656c1355b292d1d29c15c114"
+    gl: str = "us"; hl: str = "en"
+    top_k: int = 10; max_doc_len: int = 3000
+    max_search: int = 10; max_turn: int = 15
+    use_jina: bool = True
+    jina_tpl: str = "https://r.jina.ai/http://{}"
+    # generation params
+    temperature: float = 0.7; top_p: float = 0.8; top_k_sampling: int = 20
+    rep_pen: float = 1.05; thinker_max_tokens: int = 40960
+# -----------------------------------------------------------------------------
+# Serper search + page fetch ---------------------------------------------------
+def serper_search(q: str, num: int, key: str, gl="us", hl="en") -> List[Dict]:
+    hdr = {"X-API-KEY": key, "Content-Type": "application/json"}
+    body = {"q": q, "num": num, "gl": gl, "hl": hl}
+    r = requests.post("https://google.serper.dev/search", json=body, headers=hdr, timeout=20)
+    r.raise_for_status(); return r.json().get("organic", [])
+def fetch_page(url: str, cfg: O1Cfg, snippet: str = "") -> str:
+    try:
+        txt = ""
+        if cfg.use_jina:
+            r = requests.get(cfg.jina_tpl.format(url), timeout=15)
+            if r.ok and len(r.text.strip()) > 100:
+                txt = r.text.strip()
+        if txt == "":
+            r = requests.get(url, timeout=15); r.raise_for_status()
+            txt = trafilatura.extract(r.text, output_format="txt") or ""
+        if snippet:
+            txt = extract_snippet_ctx(txt, snippet, cfg.max_doc_len)
+        return txt
+    except Exception:
+        return ""
+# -----------------------------------------------------------------------------
+# replace_recent_steps  --------------------------------------------------------
+def replace_recent_steps(origin: str, patch: str) -> str:
+    """Apply *patch* (containing numbered `Step N:` lines) to *origin*."""
+    step_re = re.compile(r"Step\s+(\d+):\s*")
+    def parse(block: str) -> Dict[int, str]:
+        cur, buf, out = None, [], {}
+        for line in block.splitlines():
+            m = step_re.match(line)
+            if m:
+                if cur is not None:
+                    out[cur] = "\n".join(buf).strip()
+                cur, buf = int(m.group(1)), [line[m.end():].strip()]
+            elif cur is not None:
+                buf.append(line)
+        if cur is not None:
+            out[cur] = "\n".join(buf).strip()
+        return out
+    base = parse(origin); mod = parse(patch)
+    for k, v in mod.items():
+        if "DELETE THIS STEP" in v:
+            base.pop(k, None)
+        else:
+            base[k] = v
+    return "\n\n".join(base[k] for k in sorted(base))
+# -----------------------------------------------------------------------------
+# Prompts ----------------------------------------------------------------------
+# from prompts import get_webpage_to_reasonchain_instruction  # keep original helper
+# -----------------------------------------------------------------------------
+# Main agent -------------------------------------------------------------------
+class SDSearcher:
+    # STOP_TOKENS = [
+    #     "<|im_end|>",
+    #     "<|endoftext|>",
+    #    "<|end_of_query|>",
+    #    " <|end_of_query|>",
+    #    "<|end_of_query|>\n",
+    #    "<|end_of_query|>\n\n",
+    #    " <|end_of_query|>\n",
+    #    " <|end_of_query|>\n\n",
+    # ]
+    get_webpage_to_reasonchain_instruction = """**Task Instruction:**
+    You are tasked with reading and analyzing web pages based on the following inputs: **Previous Reasoning Steps**, **Current Search Query**, and **Searched Web Pages**. Your objective is to extract relevant and helpful information for **Current Search Query** from the **Searched Web Pages** and seamlessly integrate this information into the **Previous Reasoning Steps** to continue reasoning for the original question.
+    **Guidelines:**
+    1. **Analyze the Searched Web Pages:**
+    - Carefully review the content of each searched web page.
+    - Identify factual information that is relevant to the **Current Search Query** and can aid in the reasoning process for the original question.
+    2. **Extract Relevant Information:**
+    - Select the information from the Searched Web Pages that directly contributes to advancing the **Previous Reasoning Steps**.
+    - Ensure that the extracted information is accurate and relevant.
+    3. **Output Format:**
+    - **If the web pages provide helpful information for current search query:** Present the information beginning with **Final Information** as shown below.
+    **Final Information**
+    [Helpful information]
+    - **If the web pages do not provide any helpful information for current search query:** Output the following text.
+    **Final Information**
+    No helpful information found.
+    **Inputs:**
+    - **Previous Reasoning Steps:**
+    {prev_reasoning}
+    - **Current Search Query:**
+    {search_query}
+    - **Searched Web Pages:**
+    {document}
+    Now you should analyze each web page and find helpful information based on the current search query {search_query} and previous reasoning steps.
+    Return the Helpful information in the <information></information> tags
+    """
+    sys_prompt_multiqa = (
+        "You are a reasoning assistant with the ability to perform web searches to help "
+        "you answer the user's question accurately. You have special tools:\n\n"
+        "- To perform a search: write <|begin_search_query|> your query here <|end_search_query|>.\n"
+        "Then, the system will search and analyze relevant web pages, then provide you with helpful information in the format <|begin_search_result|> ...search results... <|end_search_result|>.\n\n"
+        f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to 16.\n\n"
+        "Once you have all the information you need, continue your reasoning.\n\n"
+        "Example:\n"
+        "Question: \"Alice David is the voice of Lara Croft in a video game developed by which company?\"\n"
+        "Assistant thinking steps:\n"
+        "- I need to find out who voices Lara Croft in the video game.\n"
+        "- Then, I need to determine which company developed that video game.\n\n"
+        "Assistant:\n"
+        "<|begin_search_query|>Alice David Lara Croft voice<|end_search_query|>\n\n"
+        "(System returns processed information from relevant web pages)\n\n"
+        "Assistant thinks: The search results indicate that Alice David is the voice of Lara Croft in a specific video game. Now, I need to find out which company developed that game.\n\n"
+        "Assistant:\n"
+        "<|begin_search_query|>video game developed by Alice David Lara Croft<|end_search_query|>\n\n"
+        "(System returns processed information from relevant web pages)\n\n"
+        "Assistant continues reasoning with the new information...\n\n"
+        "Remember:\n"
+        "- Use <|begin_search_query|> to request a web search and end with <|end_search_query|>.\n"
+        "- When done searching, continue your reasoning.\n\n",
+        "Finally, if you have got the answer, enclose it within \\boxed{{}} with latex format and do not continue to call functions"
+    )
+    def __init__(self, cfg: O1Cfg, thinker_url: str):
+        if not cfg.serper_api_key:
+            raise ValueError("SERPER_API_KEY required")
+        self.cfg, self.model_url = cfg, thinker_url.rstrip("/")
+        self.search_cache: Dict[str, List[Dict]] = {}
+        self.page_cache: Dict[Tuple[str, str], str] = {}
+    # --- low‑level generation call ------------------------------------------
+    def _generate(self, prompt: str) -> str:
+        prompt_tokens = tokenizer(prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
+        max_tokens_left = self.cfg.thinker_max_tokens - len(prompt_tokens) - 100
+        resp = requests.post(
+            f"{self.model_url}/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": self.cfg.temperature,
+                    "top_p": self.cfg.top_p,
+                    "max_new_tokens": max_tokens_left,
+                    "repetition_penalty": self.cfg.rep_pen,
+                    "stop": STOP_STRINGS,
+                },
+            },
+            timeout=60,
+        ).json()
+        generated = resp["text"]                       # what you have now
+        matched   = resp["meta_info"]["finish_reason"].get("matched")
+        reason = resp["meta_info"]["finish_reason"].get("type")
+        # ⇢ append the tag back only if it was removed
+        if reason == "stop" and matched in STOP_STRINGS:
+            if not "<|end_of_query|>" in generated:
+                generated += matched
+        if reason == "stop" and matched == 151645:
+             if not generated.endswith("<|im_end|>"):
+                generated += "<|im_end|>"
+        if reason == "stop" and matched == 151643:
+             if not generated.endswith("<|endoftext|>"):
+                generated += "<|endoftext|>"
+        return generated
+    def _generate_summary(self, prompt: str) -> str:
+        summary_url = "http://0.0.0.0:1243"
+        prompt_tokens = tokenizer(prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
+        max_tokens_left = self.cfg.thinker_max_tokens - len(prompt_tokens) - 100
+        resp = requests.post(
+            f"{summary_url}/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": self.cfg.temperature,
+                    "max_new_tokens": 8192,#max_tokens_left,
+                    "stop": STOP_STRINGS,
+                },
+            },
+            timeout=60,
+        ).json()
+        generated = resp["text"]                       # what you have now
+        matched   = resp["meta_info"]["finish_reason"].get("matched")
+        reason = resp["meta_info"]["finish_reason"].get("type")
+        # ##print("-"*100)
+        # ##print(resp)
+        # ##print(matched)
+        # ##print("-"*100)
+        # ⇢ append the tag back only if it was removed
+        if reason == "stop" and matched in STOP_STRINGS:
+            if not "<|end_of_query|>" in generated:
+                generated += matched + EOS_TOKEN
+        if reason == "stop" and matched == 151645:
+             if not generated.endswith("<|im_end|>"):
+                generated += "<|im_end|>"
+        if reason == "stop" and matched == 151643:
+             if not generated.endswith("<|endoftext|>"):
+                generated += "<|endoftext|>"
+        return generated
+    # --- public entry -------------------------------------------------------
+    def run(self, question: str):
+        prompt = (
+            f"<|im_start|>system\n{self.sys_prompt_multiqa}<|im_end|>\n"
+            f"<|im_start|>user\n{question}<|im_end|>\n"
+            f"<|im_start|>assistant\n{THINK_OPEN}"
+        )
+        full_trace = prompt  # <-- Track full trace
+        queries: List[str] = []
+        seen_queries: set[str] = set()
+        for i in range(self.cfg.max_turn):
+            chunk = self._generate(prompt)
+            prompt += chunk
+            if ANSWER_CLOSE in chunk:
+                break
+            ##print(f"step-{i}")
+            ##print(chunk)
+            query = self._extract_query(chunk)
+            ##print(query)
+            if not query or len(queries) >= self.cfg.max_search:
+                break
+            if query in seen_queries:
+                continue
+            queries.append(query)
+            seen_queries.add(query)
+            doc = self._retrieve_doc(query)
+            prev_reasoning = self._extract_reasoning(prompt)
+            summary = "\n<|im_start|>user" + self._summarise(prev_reasoning, query, doc) + EOS_TOKEN + "\n<|im_start|>assistant" + THINK_OPEN
+            ##print("summary")
+            ##print(summary)
+            prompt += summary   # <-- Log summary to trace
+            # new_reasoning = replace_recent_steps(prev_reasoning, summary)
+            # if prev_reasoning:
+            #     prompt = prompt.rsplit(prev_reasoning, 1)[0] + new_reasoning + THINK_CLOSE + THINK_OPEN
+            # else:
+                # prompt += new_reasoning + THINK_CLOSE + THINK_OPEN
+            # full_trace +=  + THINK_CLOSE + THINK_OPEN + "\n"  # <-- Log reasoning to trace
+        else:
+            final = f"{ANSWER_OPEN}I don't know.{ANSWER_CLOSE}"
+            prompt += final
+            # full_trace += final
+        return prompt, queries
+    # ---------------------------------------------------------------------
+    # helpers --------------------------------------------------------------
+    def _extract_query(self, txt: str) -> Optional[str]:
+        if BEGIN_SEARCH_QUERY not in txt or END_SEARCH_QUERY not in txt:
+            return None
+        frag = txt.split(BEGIN_SEARCH_QUERY)[-1].split(END_SEARCH_QUERY)[0]
+        # strip quotes / ellipsis / tabs
+        return re.sub(r"[\"'…\t]", " ", frag.split("<|")[0]).strip()
+    def _retrieve_doc(self, query: str) -> str:
+        if query not in self.search_cache:
+            self.search_cache[query] = serper_search(query, self.cfg.top_k, self.cfg.serper_api_key,
+                                                     gl=self.cfg.gl, hl=self.cfg.hl)
+        for hit in self.search_cache[query]:
+            # ##print("hit")
+            # ##print(hit)
+            url, sn = hit.get("link", ""), hit.get("snippet", "")
+            if not url:
+                continue
+            key = (url, sn)
+            if key not in self.page_cache:
+                self.page_cache[key] = fetch_page(url, self.cfg, sn)
+            if self.page_cache[key]:
+                return self.page_cache[key]
+        return ""
+    def _summarise(self, prev: str, query: str, doc: str) -> str:
+        rid_prompt = self.get_webpage_to_reasonchain_instruction.format(prev_reasoning = prev, search_query = query, document = doc)
+        chat = f"<|im_start|>user\\n{rid_prompt}\\n<|im_end|>\\n<|im_start|>assistant\\n"
+        resp = self._generate_summary(chat)
+        # ##print("summarization out \n", resp)
+        return BEGIN_DOCUMENT_QUERY + self._extract_summary(resp)  + END_DOCUMENT_QUERY
+        # ##print("summary")
+        # ##print(resp)
+        # match = re.search(r"Final Information\*\*\s*\n(.+?)<\|im_end\|>", resp)
+        # if match:
+        #     final_info = match.group(1).strip()
+        #     ##print(final_info)
+        # return final_info
+    def _extract_summary(self, prompt: str) -> str:
+        if "<information>" in prompt:
+            summary = prompt.split("<information>")[-1].split("</information>")[0] if THINK_OPEN in prompt else ""
+            return summary
+        else:
+            match = re.search(r"\*\*Final Information\*\*\s*\n(.+?)<\|im_end\|>", prompt)
+            if match:
+                final_info = match.group(1).strip()
+                return final_info
+        return prompt
+    def _extract_reasoning(self, prompt: str) -> str:
+        return prompt.split(THINK_OPEN)[-1].split(THINK_CLOSE)[0] if THINK_OPEN in prompt else ""
+# -----------------------------------------------------------------------------
+# CLI -------------------------------------------------------------------------
+# if __name__ == "__main__":
+#     # import argparse, json
+#     # parser = argparse.ArgumentParser()
+#     # parser.add_argument("question"); parser.add_argument("--dataset", required=True, choices=sorted(ALLOWED_DATASETS)); parser.add_argument("--model-url", required

inference/zerosearch.py ADDED Viewed

	@@ -0,0 +1,249 @@

+# zero_search_inference.py
+"""End‑to‑end inference loop that emulates the ZeroSearch prompting style.
+The policy model ("thinker") must:
+    • reason inside <think> … </think>
+    • place a query inside <search> … </search> whenever it needs external knowledge
+    • return the final short answer inside <answer> … </answer>
+The wrapper intercepts each <search> request, fulfils it with either:
+    (a) a **simulated search engine** (another small LLM fine‑tuned as ZeroSearch
+        retriever) ‑‑ default; or
+    (b) a real search backend (e.g. Serper.dev, Bing) if `engine="real"`.
+It then injects results between <information> … </information> and hands control
+back to the policy model.  The loop repeats until </answer> is produced or a
+maximum number of retrieval rounds is reached.
+The goal is to mirror the ergonomics of the user’s existing `ReCall` class so
+that outer orchestration code can drop this in with minimal friction.
+"""
+from __future__ import annotations
+import json
+import os
+import re
+import time
+from dataclasses import dataclass
+from typing import List, Optional
+import requests
+from openai import OpenAI
+__all__ = ["ZeroSearchInference", "ZeroSearchConfig"]
+TOKENIZER_DIR = "/home/fractal_admin/shreyas/models/Qwen3-4B"
+# ───────────────────────── tokenizer ────────────────────────────────────────
+try:
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
+except Exception as e:
+    import sys
+    sys.exit(f"❌  Could not load Qwen3 tokenizer: {e}")
+# ---------------------------------------------------------------------------
+# Utility: retry decorator ---------------------------------------------------
+# ---------------------------------------------------------------------------
+def retry(max_attempts: int = 4, sleep: int = 1, fallback=None):
+    def decorator(func):
+        def wrapper(*args, **kwargs):
+            for i in range(max_attempts):
+                try:
+                    return func(*args, **kwargs)
+                except Exception as exc:
+                    #print(f"[retry] {func.__name__}: attempt {i + 1}/{max_attempts} failed – {exc}")
+                    if i == max_attempts - 1:
+                        return fallback
+                    time.sleep(sleep)
+        return wrapper
+    return decorator
+# ---------------------------------------------------------------------------
+# Configuration dataclass ----------------------------------------------------
+# ---------------------------------------------------------------------------
+@dataclass
+class ZeroSearchConfig:
+    # thinker LLM endpoint
+    thinker_url: str = "http://0.0.0.0:1214"
+    thinker_temperature: float = 0.7
+    thinker_max_tokens: int = 40960
+    # retrieval engine mode: "sim" or "real"
+    engine: str = "real"  # simulated search (LLM) by default
+    # simulated search model (only used if engine == "sim")
+    retriever_model: str = "gpt-4o-mini"
+    retriever_top_k: int = 5
+    # real search backend (engine == "real")
+    serper_api_key: Optional[str] = "7bfe51ead1a1766b656c1355b292d1d29c15c114"
+    serper_url: str = "https://google.serper.dev/search"
+    serper_top_k: int = 5
+    # Loop limits
+    max_rounds: int = 16
+# ---------------------------------------------------------------------------
+# Main wrapper ---------------------------------------------------------------
+# ---------------------------------------------------------------------------
+class ZeroSearchInference:
+    """Drive a think / search / answer loop against a thinker endpoint.
+
+    The class-level constants below are the tags looked for in generated text
+    and the stop strings sent with every generation request.
+    """
+    # Tags delimiting a search request emitted by the model.
+    SEARCH_OPEN = "<search>"
+    SEARCH_CLOSE = "</search>"
+    # Tags wrapped around retrieved documents before they are appended to the prompt.
+    INFO_OPEN = "<information>"
+    INFO_CLOSE = "</information>"
+    # Presence of this tag in a generated chunk ends the loop.
+    ANSWER_CLOSE = "</answer>"
+    THINK_OPEN = "<think>"
+    THINK_CLOSE = "</think>"
+    # Stop strings for generation: end-of-turn markers plus several whitespace
+    # variants of the closing search tag.
+    STOP_TOKENS = ["<|im_end|>", "<|endoftext|>", "</search>", " </search>", "</search>\n", " </search>\n", "</search>\n\n", " </search>\n\n"]#, "</think>", "</think>\n", " </think>\n", "</think>\n\n", " </think>\n\n"]
+    def __init__(self, cfg: ZeroSearchConfig) -> None:
+        """Store ``cfg`` on the instance; nothing else is set up here."""
+        self.cfg = cfg
+    # ------------------------------------------------------------------
+    # Public driver -----------------------------------------------------
+    # ------------------------------------------------------------------
+    def run(self, user_question: str) -> str:
+        tool_calls = []
+        prompt = self._build_initial_prompt(user_question)
+        for round_idx in range(self.cfg.max_rounds):
+            generated = self._call_thinker(prompt)
+            #print("-"*100)
+            #print(f"Round: {round_idx}")
+            #print(generated)
+            prompt += generated
+            if self.ANSWER_CLOSE in generated:
+                #print(f"[ZeroSearch] Done in {round_idx + 1} rounds")
+                break
+            query = self._extract_query(generated)
+            if not query:
+                #print("[ZeroSearch] Model failed to emit <search>; aborting")
+                break
+            tool_calls.append(query)
+            info_block = self._retrieve_and_format(query)
+            #print(f"retrived docs: \n{info_block}")
+            #print("-"*100)
+            prompt += info_block + self.THINK_OPEN  # next turn
+        else:  # exceeded rounds
+            prompt += "<answer>I don't know.</answer><|im_end|>"
+        return prompt, tool_calls
+    # ------------------------------------------------------------------
+    # Prompt construction helpers --------------------------------------
+    # ------------------------------------------------------------------
+    def _build_initial_prompt(self, question: str) -> str:
+        """Build the opening transcript: one user turn plus an opened assistant turn.
+
+        The user turn holds the task instructions followed by ``question``; the
+        assistant turn is started with ``THINK_OPEN`` so generation begins
+        inside a thinking section.
+        """
+        # The trailing backslashes join the source lines of this literal, so the
+        # instructions form one line; the indentation of each continued line
+        # stays in the string as a run of spaces.
+        user_msg =  f"""Answer the given question. \
+                    You must conduct reasoning inside <think> and </think> first every time you get new information. \
+                    After reasoning, if you find you lack some knowledge, you can call a search engine by <search> query </search> and it will return the top searched results between <information> and </information>. \
+                    You can search as many times as your want. \
+                    If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>. Question: {question}\n"""
+        return f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{self.THINK_OPEN}"
+    # ------------------------------------------------------------------
+    # Thinker model call ------------------------------------------------
+    # ------------------------------------------------------------------
+    @retry(fallback="")
+    def _call_thinker(self, prompt: str) -> str:
+        prompt_tokens = tokenizer(prompt, return_tensors=None, add_special_tokens=False)["input_ids"]
+        max_tokens_left = self.cfg.thinker_max_tokens - len(prompt_tokens) - 100
+        resp = requests.post(
+            f"{self.cfg.thinker_url}/generate",
+            json={
+                "text": prompt,
+                "sampling_params": {
+                    "temperature": self.cfg.thinker_temperature,
+                    "max_new_tokens": max_tokens_left,
+                    "stop": self.STOP_TOKENS,
+                },
+            },
+            timeout=60,
+        ).json()
+        generated = resp["text"]                       # what you have now
+        matched   = resp["meta_info"]["finish_reason"].get("matched")
+        reason = resp["meta_info"]["finish_reason"].get("type")
+        # ⇢ append the tag back only if it was removed
+        if reason == "stop" and matched in self.STOP_TOKENS:
+            if not generated.endswith(matched):
+                generated += matched
+        if reason == "stop" and matched == 151645:
+             if not generated.endswith("<|im_end|>"):
+                generated += "<|im_end|>"
+        return generated
+    # ------------------------------------------------------------------
+    # Query extraction --------------------------------------------------
+    # ------------------------------------------------------------------
+    def _extract_query(self, gen_text: str) -> Optional[str]:
+        if self.SEARCH_OPEN not in gen_text or self.SEARCH_CLOSE not in gen_text:
+            return None
+        query = gen_text.split(self.SEARCH_OPEN)[-1].split(self.SEARCH_CLOSE)[0].strip()
+        return query or None
+    # ------------------------------------------------------------------
+    # Retrieval path ----------------------------------------------------
+    # ------------------------------------------------------------------
+    def _retrieve_and_format(self, query: str) -> str:
+        if self.cfg.engine == "real":
+            docs = self._real_search(query)
+            #print("DOCS")
+            #print(docs)
+        else:
+            docs = self._simulated_search(query)
+        return f"{self.INFO_OPEN}\n{docs}\n{self.INFO_CLOSE}\n\n"
+    # --- simulated search with LLM ------------------------------------
+    @retry(fallback="No information available")
+    def _simulated_search(self, query: str) -> str:
+        messages = [
+            {
+                "role": "user",
+                "content": (
+                    "You are a search engine. Return up to "
+                    f"{self.cfg.retriever_top_k} short documents (titles + snippets) "
+                    "most relevant to the query, each on a new line.\n\n"
+                    f"Query: {query}"
+                ),
+            }
+        ]
+        resp = self.openai.chat.completions.create(
+            model=self.cfg.retriever_model,
+            messages=messages,
+            max_tokens=256,
+        )
+        return resp.choices[0].message.content.strip()
+    # --- real web search via Serper ----------------------------------
+    @retry(fallback="No information available")
+    def _real_search(self, query: str) -> str:
+        if not self.cfg.serper_api_key:
+            raise ValueError("serper_api_key must be set for real search mode")
+        headers = {"X-API-KEY": self.cfg.serper_api_key, "Content-Type": "application/json"}
+        payload = {"q": query, "num": self.cfg.serper_top_k}
+        resp = requests.post(self.cfg.serper_url, json=payload, headers=headers, timeout=20)
+        resp.raise_for_status()
+        data = resp.json().get("organic", [])[: self.cfg.serper_top_k]
+        lines = []
+        for i, item in enumerate(data, 1):
+            snippet = f"Title: {item['title']}, \nSnippet{item['snippet']}"
+            lines.append(f"Doc {i}: {snippet}")
+        return "\n".join(lines) or "No information available"