KaiChen1998 committed
Commit 5f870ca · 1 Parent(s): 6f2ad11

initial commit

app.py ADDED
@@ -0,0 +1,311 @@
+ import os
+ import argparse
+ import traceback
+ import logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S')
+
+ import spaces
+ import gradio as gr
+ from conversation_public import default_conversation
+
+ auth_token = os.environ.get("TOKEN_FROM_SECRET")
+
+ ##########################################
+ # LLM part
+ ##########################################
+ from transformers import AutoProcessor, AutoTokenizer, TextIteratorStreamer
+ from vllm import LLM, SamplingParams
+ from qwen_vl_utils import process_vision_info
+ from threading import Thread
+
+ # === Prompts ===
+ SYSTEM_PROMPT_LLM = "You are a helpful assistant."
+ SYSTEM_PROMPT_CAP = "You are given an image and a relevant question. Based on the query, please describe the image in detail. Do not try to answer the question."
+
+ CAPTION_PROMPT = "Question: {}\nPlease describe the image. DO NOT try to answer the question!"
+ LLM_PROMPT = """In the following text, you will receive a detailed caption of an image and a relevant question. In addition, you will be provided with a tentative model response. Your goal is to answer the question using this information.\n\n### The detailed caption of the provided image: {}\n\n### Note that the caption might contain incorrect solutions, do not be misguided by them.\n\n### A problem to be solved: {}\n\n### A tentative model response: {}\n\n### Note that the above tentative response might be inaccurate (due to calculation errors, incorrect logic/reasoning and so on), under such a case, please ignore it and give your own solutions. However, if you do not have enough evidence to show it is wrong, please output the tentative response."""
+
+ # === Initialize Models ===
+ MLLM_MODEL_PATH = "KaiChen1998/RACRO-7B-CRO-GRPO"
+ LLM_MODEL_PATH = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
+
+ processor = AutoProcessor.from_pretrained(MLLM_MODEL_PATH)
+ tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_PATH)
+
+ mllm = LLM(model=MLLM_MODEL_PATH, tensor_parallel_size=1, gpu_memory_utilization=0.8,
+            device='cuda:0', dtype="bfloat16", limit_mm_per_prompt={"image": 1})
+
+ llm = LLM(model=LLM_MODEL_PATH, tensor_parallel_size=1, gpu_memory_utilization=0.8,
+           device='cuda:0', dtype="bfloat16")
+
+ mllm_sampling = SamplingParams(temperature=0, max_tokens=8192)
+ llm_sampling = SamplingParams(temperature=0.6, top_p=0.95, max_tokens=8192)
+
+ # === Build Prompts ===
+ def build_messages(image_path, question):
+     cap_msgs = [
+         {"role": "system", "content": SYSTEM_PROMPT_CAP},
+         {"role": "user", "content": [{"type": "image", "image": image_path}, {"type": "text", "text": CAPTION_PROMPT.format(question)}]}
+     ]
+     qa_msgs = [
+         {"role": "user", "content": [{"type": "image", "image": image_path}, {"type": "text", "text": question + " Please think step by step. The final answer MUST BE put in \\boxed{}."}]}
+     ]
+     return cap_msgs, qa_msgs
+
+ # === Run Captioning and QA ===
+ def run_mllm_tentative(image_tensor, cap_prompt, qa_prompt):
+     qa_output = mllm.generate([{"multi_modal_data": {"image": image_tensor}, "prompt": qa_prompt[0]}], sampling_params=mllm_sampling)
+     return qa_output[0].outputs[0].text
+
+ def run_mllm_caption(image_tensor, cap_prompt, qa_prompt):
+     cap_output = mllm.generate([{"multi_modal_data": {"image": image_tensor}, "prompt": cap_prompt[0]}], sampling_params=mllm_sampling)
+     return cap_output[0].outputs[0].text
+
+ # === Final Reasoning Step ===
+ def run_llm_reasoning(caption, question, answer):
+     messages = [
+         {"role": "system", "content": SYSTEM_PROMPT_LLM},
+         {"role": "user", "content": LLM_PROMPT.format(caption, question, answer)}
+     ]
+     prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+     output = llm.generate([{"prompt": prompt}], sampling_params=llm_sampling)
+     return output[0].outputs[0].text
+
+ ##########################################
+ # Gradio part
+ ##########################################
+ no_change_btn = gr.Button()
+ enable_btn = gr.Button(interactive=True)
+ disable_btn = gr.Button(interactive=False)
+ server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
+ server_oom_msg = "**OUT OF GPU MEMORY DETECTED. PLEASE DECREASE THE MAX OUTPUT TOKENS AND REGENERATE.**"
+
+ def load_demo_refresh_model_list():
+     logging.info(f"load_demo.")
+     state = default_conversation.copy()
+     return state
+
+ def regenerate(state, image_process_mode):
+     logging.info(f"regenerate.")
+     state.messages[-1][-1] = None
+     prev_human_msg = state.messages[-2]
+     if type(prev_human_msg[1]) in (tuple, list):
+         prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode, *prev_human_msg[1][3:])
+     state.skip_next = False
+     return (state, state.to_gradio_chatbot_public(), "", None) + (disable_btn,) * 2
+
+ def clear_history():
+     logging.info(f"clear_history.")
+     state = default_conversation.copy()
+     return (state, state.to_gradio_chatbot_public(), "", None) + (disable_btn,) * 2
+
+ ############
+ # Show prompt in the chatbot
+ # Input: [state, textbox, imagebox, image_process_mode]
+ # Return: [state, chatbot, textbox, imagebox] + btn_list
+ ############
+ def add_text(state, text, image, image_process_mode):
+     # Input legality checking
+     logging.info(f"add_text. len: {len(text)}")
+     if len(text) <= 0 or image is None:
+         state.skip_next = True
+         return (state, state.to_gradio_chatbot_public(), "", None) + (no_change_btn,) * 2
+
+     # Deal with image inputs
+     if image is not None:
+         text = (text, image, image_process_mode, None)
+
+     # Single round only
+     state = default_conversation.copy()
+     state.append_message(state.roles[0], text)
+     state.skip_next = False
+     logging.info(str(state.messages))
+     return (state, state.to_gradio_chatbot_public(), "", None) + (disable_btn,) * 2
+
+ ############
+ # Get response
+ # Input: [state]
+ # Return: [state, chatbot] + btn_list
+ ############
+ @spaces.GPU
+ def http_bot(state):
+     logging.info(f"http_bot.")
+
+     if state.skip_next:
+         yield (state, state.to_gradio_chatbot_public()) + (no_change_btn,) * 2
+         return
+
+     # Retrieve prompt
+     prompt = state.messages[-1][1][0]  # add_text stores a (text, image, image_process_mode, audio) tuple
+     all_images = state.get_images(return_pil=True)[0]
+     pload = {"prompt": prompt, "images": f'List of {len(state.get_images())} images: {all_images}'}
+     logging.info(f"==== request ====\n{pload}")
+
+     # Construct prompt
+     cap_msgs, qa_msgs = build_messages(all_images, prompt)
+     cap_prompt = processor.apply_chat_template([cap_msgs], tokenize=False, add_generation_prompt=True)
+     qa_prompt = processor.apply_chat_template([qa_msgs], tokenize=False, add_generation_prompt=True)
+
+     image_tensor, _ = process_vision_info(cap_msgs)
+     tentative_answer = run_mllm_tentative(image_tensor, cap_prompt, qa_prompt)
+     state.append_message(state.roles[1], "# Tentative Response\n\n" + tentative_answer)
+     logging.info("# Tentative Response\n\n" + tentative_answer)
+     yield (state, state.to_gradio_chatbot_public()) + (disable_btn,) * 2
+
+     caption_text = run_mllm_caption(image_tensor, cap_prompt, qa_prompt)
+     state.append_message(state.roles[1], "# Caption\n\n" + caption_text)
+     logging.info("# Caption\n\n" + caption_text)
+     yield (state, state.to_gradio_chatbot_public()) + (disable_btn,) * 2
+
+     final_answer = run_llm_reasoning(caption_text, prompt, tentative_answer)
+     state.append_message(state.roles[1], "# Final Response\n\n" + final_answer)
+     logging.info("# Final Response\n\n" + final_answer)
+     yield (state, state.to_gradio_chatbot_public()) + (enable_btn,) * 2
+
+ ############
+ # Layout Markdown
+ ############
+ title_markdown = ("""
+ <div style="display: flex; align-items: center; padding: 20px; border-radius: 10px; background-color: #f0f0f0;">
+   <div>
+     <h1 style="margin: 0;">RACRO: Perceptual Decoupling for Scalable Multi-modal Reasoning via Reward-Optimized Captioning</h1>
+     <h2 style="margin: 10px 0;">📃 <a href="https://www.arxiv.org/abs/2506.04559" style="font-weight: 400;">Paper</a> | 💻 <a href="https://github.com/gyhdog99/RACRO2" style="font-weight: 400;">Code</a> | 🤗 <a href="https://huggingface.co/collections/KaiChen1998/racro-6848ec8c65b3a0bf33d0fbdb" style="font-weight: 400;">HuggingFace</a></h2>
+     <p style="margin: 20px 0;">
+       <strong>1. RACRO is designed for multi-modal reasoning, and thus, image inputs are <mark>ALWAYS</mark> necessary!</strong><br/>
+       <strong>2. Models are deployed with vLLM, which unfortunately, still does not support streaming outputs for MLLMs.</strong>
+     </p>
+   </div>
+ </div>
+ """)
+
+ learn_more_markdown = ("""
+ ## Citation
+ <pre><code>@article{gou2025perceptual,
+   author  = {Gou, Yunhao and Chen, Kai and Liu, Zhili and Hong, Lanqing and Jin, Xin and Li, Zhenguo and Kwok, James T. and Zhang, Yu},
+   title   = {Perceptual Decoupling for Scalable Multi-modal Reasoning via Reward-Optimized Captioning},
+   journal = {arXiv preprint arXiv:2506.04559},
+   year    = {2025},
+ }</code></pre>
+ """)
+
+ block_css = """
+ #buttons button {
+     min-width: min(120px,100%);
+ }
+ .message-row img {
+     margin: 0px !important;
+ }
+ .avatar-container img {
+     padding: 0px !important;
+ }
+ """
+
+ ############
+ # Layout Demo
+ ############
+ def build_demo(embed_mode):
+     textbox = gr.Textbox(label="Text", show_label=False, placeholder="Enter text and then click 💬 Chat to talk with me ^v^", container=False)
+     with gr.Blocks(title="RACRO", theme=gr.themes.Default(), css=block_css) as demo:
+         state = gr.State()
+         if not embed_mode:
+             gr.HTML(title_markdown)
+
+         ##############
+         # Chatbot
+         ##############
+         with gr.Row(equal_height=True):
+             with gr.Column(scale=1):
+                 imagebox = gr.Image(type="pil", label="Image")
+                 image_process_mode = gr.Radio(
+                     ["Crop", "Resize", "Pad", "Default"],
+                     value="Default",
+                     label="Preprocess for non-square image", visible=False)
+
+                 gr.Examples(examples=[
+                     ["./examples/demo_example.png", "When the canister is momentarily stopped by the spring, by what distance $d$ is the spring compressed?"],
+                 ], inputs=[imagebox, textbox], label='Examples')
+
+             with gr.Column(scale=8):
+                 chatbot = gr.Chatbot(
+                     elem_id="chatbot",
+                     label="RACRO Chatbot",
+                     layout="bubble",
+                     avatar_images=["examples/user_avator.png", "examples/icon_256.png"]
+                 )
+                 textbox.render()
+                 with gr.Row(elem_id="buttons") as button_row:
+                     submit_btn = gr.Button(value="💬 Chat", variant="primary")
+                     # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
+                     regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
+                     clear_btn = gr.Button(value="🗑️ Clear", interactive=False)
+
+         if not embed_mode:
+             gr.Markdown(learn_more_markdown)
+
+         # Register listeners
+         btn_list = [regenerate_btn, clear_btn]
+         regenerate_btn.click(
+             regenerate,
+             [state, image_process_mode],
+             [state, chatbot, textbox, imagebox] + btn_list
+         ).then(
+             http_bot,
+             [state],
+             [state, chatbot] + btn_list,
+         )
+
+         clear_btn.click(
+             clear_history,
+             None,
+             [state, chatbot, textbox, imagebox] + btn_list,
+             queue=False
+         )
+
+         # Pressing Enter in the textbox submits, same as clicking 💬 Chat
+         textbox.submit(
+             add_text,
+             [state, textbox, imagebox, image_process_mode],
+             [state, chatbot, textbox, imagebox] + btn_list,
+             queue=False
+         ).then(
+             http_bot,
+             [state],
+             [state, chatbot] + btn_list,
+         )
+
+         submit_btn.click(
+             add_text,
+             [state, textbox, imagebox, image_process_mode],
+             [state, chatbot, textbox, imagebox] + btn_list
+         ).then(
+             http_bot,
+             [state],
+             [state, chatbot] + btn_list,
+         )
+
+         ##############
+         # Demo loading
+         ##############
+         demo.load(
+             load_demo_refresh_model_list,
+             None,
+             [state],
+             queue=False
+         )
+     return demo
+
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--share", action="store_true")
+ parser.add_argument("--embed", action="store_true")
+ args = parser.parse_args()
+
+ demo = build_demo(args.embed)
+ demo.queue(
+     max_size=10,
+     api_open=False
+ ).launch(
+     favicon_path="./examples/icon_256.png",
+     allowed_paths=["/"],
+     share=args.share
+ )
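
For reference, the three-stage flow that `http_bot` runs above (tentative answer → query-conditioned caption → text-only reasoning) can also be exercised without the Gradio UI. The sketch below is illustrative only: it assumes the models, `processor`, and helper functions defined in app.py are already loaded in the current process, and the image path is a placeholder taken from the Examples widget.

    # Minimal sketch of the RACRO pipeline using the helpers defined in app.py (illustrative only).
    from PIL import Image

    question = "When the canister is momentarily stopped by the spring, by what distance $d$ is the spring compressed?"
    image = Image.open("./examples/demo_example.png")  # placeholder path

    # 1) Build the caption / QA chat messages and render them with the MLLM chat template.
    cap_msgs, qa_msgs = build_messages(image, question)
    cap_prompt = processor.apply_chat_template([cap_msgs], tokenize=False, add_generation_prompt=True)
    qa_prompt = processor.apply_chat_template([qa_msgs], tokenize=False, add_generation_prompt=True)
    image_tensor, _ = process_vision_info(cap_msgs)

    # 2) MLLM: tentative answer and query-conditioned caption.
    tentative_answer = run_mllm_tentative(image_tensor, cap_prompt, qa_prompt)
    caption_text = run_mllm_caption(image_tensor, cap_prompt, qa_prompt)

    # 3) Reasoning LLM: combine caption, question, and tentative answer.
    final_answer = run_llm_reasoning(caption_text, question, tentative_answer)
    print(final_answer)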
conversation_public.py ADDED
@@ -0,0 +1,517 @@
+ import dataclasses
+ from enum import auto, Enum
+ from typing import List, Tuple
+ import base64
+ from io import BytesIO
+ from PIL import Image
+
+ import base64
+ tts_format = "Please synthesize the speech corresponding to the following text.\n"
+
+ class SeparatorStyle(Enum):
+     """Different separator style."""
+     SINGLE = auto()
+     TWO = auto()
+     MPT = auto()
+     PLAIN = auto()
+     LLAMA_2 = auto()
+     GLM4 = auto()
+
+
+ @dataclasses.dataclass
+ class Conversation:
+     """A class that keeps all conversation history."""
+     system: str
+     roles: List[str]
+     messages: List[List[str]]
+     offset: int
+     sep_style: SeparatorStyle = SeparatorStyle.SINGLE
+     sep: str = "###"
+     sep2: str = None
+     version: str = "Unknown"
+
+     skip_next: bool = False
+
+     def get_prompt(self):
+         messages = self.messages
+         if len(messages) > 0 and type(messages[0][1]) is tuple and messages[0][1][1] is not None:
+             messages = self.messages.copy()
+             init_role, init_msg = messages[0].copy()
+             init_msg = init_msg[0].replace("<image>", "").strip()
+             if 'mmtag' in self.version:
+                 messages[0] = (init_role, init_msg)
+                 messages.insert(0, (self.roles[0], "<Image><image></Image>"))
+                 messages.insert(1, (self.roles[1], "Received."))
+             else:
+                 messages[0] = (init_role, "<image>\n" + init_msg)
+
+         if self.sep_style == SeparatorStyle.SINGLE:
+             ret = self.system + self.sep
+             for role, message in messages:
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message[:3]
+                     ret += role + ": " + message + self.sep
+                 else:
+                     ret += role + ":"
+         elif self.sep_style == SeparatorStyle.TWO:
+             seps = [self.sep, self.sep2]
+             ret = self.system + seps[0]
+             for i, (role, message) in enumerate(messages):
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message[:3]
+                     ret += role + ": " + message + seps[i % 2]
+                 else:
+                     ret += role + ":"
+         elif self.sep_style == SeparatorStyle.MPT:
+             ret = self.system + self.sep
+             for role, message in messages:
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message[:3]
+                     ret += role + message + self.sep
+                 else:
+                     ret += role
+         elif self.sep_style == SeparatorStyle.LLAMA_2:
+             wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
+             wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
+             ret = ""
+
+             for i, (role, message) in enumerate(messages):
+                 if i == 0:
+                     assert message, "first message should not be none"
+                     assert role == self.roles[0], "first message should come from user"
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message[:3]
+                     if i == 0: message = wrap_sys(self.system) + message
+                     if i % 2 == 0:
+                         message = wrap_inst(message)
+                         ret += self.sep + message
+                     else:
+                         ret += " " + message + " " + self.sep2
+                 else:
+                     ret += ""
+             ret = ret.lstrip(self.sep)
+         elif self.sep_style == SeparatorStyle.PLAIN:
+             seps = [self.sep, self.sep2]
+             ret = self.system
+             for i, (role, message) in enumerate(messages):
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message[:3]
+                     ret += message + seps[i % 2]
+                 else:
+                     ret += ""
+         elif self.sep_style == SeparatorStyle.GLM4:
+             role = ("<|user|>", "<|assistant|>")
+             ret = self.system + role[0]
+             for i, (role, message) in enumerate(messages):
+                 if message:
+                     if type(message) is tuple:
+                         message, _, _ = message[:3]
+                     ret += self.sep + message + role[(i+1) % 2]
+                 else:
+                     ret += ""
+         else:
+             raise ValueError(f"Invalid style: {self.sep_style}")
+
+         return ret
+
+     def append_message(self, role, message):
+         if isinstance(self.messages, tuple):
+             self.messages += ([role, message],)
+         else:
+             self.messages.append([role, message])
+
+     def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
+         if image_process_mode == "Pad":
+             def expand2square(pil_img, background_color=(122, 116, 104)):
+                 width, height = pil_img.size
+                 if width == height:
+                     return pil_img
+                 elif width > height:
+                     result = Image.new(pil_img.mode, (width, width), background_color)
+                     result.paste(pil_img, (0, (width - height) // 2))
+                     return result
+                 else:
+                     result = Image.new(pil_img.mode, (height, height), background_color)
+                     result.paste(pil_img, ((height - width) // 2, 0))
+                     return result
+             image = expand2square(image)
+         elif image_process_mode in ["Default", "Crop"]:
+             pass
+         elif image_process_mode == "Resize":
+             image = image.resize((336, 336))
+         else:
+             raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
+         if max(image.size) > max_len:
+             max_hw, min_hw = max(image.size), min(image.size)
+             aspect_ratio = max_hw / min_hw
+             shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
+             longest_edge = int(shortest_edge * aspect_ratio)
+             W, H = image.size
+             if H > W:
+                 H, W = longest_edge, shortest_edge
+             else:
+                 H, W = shortest_edge, longest_edge
+             image = image.resize((W, H))
+         if return_pil:
+             return image
+         else:
+             buffered = BytesIO()
+             image.save(buffered, format=image_format)
+             img_b64_str = base64.b64encode(buffered.getvalue()).decode()
+             return img_b64_str
+
+     def get_images(self, return_pil=False):
+         images = []
+         for i, (role, msg) in enumerate(self.messages[self.offset:]):
+             if i % 2 == 0:
+                 if type(msg) is tuple and msg[1] is not None:
+                     msg, image, image_process_mode = msg[:3]
+                     image = self.process_image(image, image_process_mode, return_pil=return_pil)
+                     images.append(image)
+         return images
+
+     def to_gradio_chatbot(self):
+         ret = []
+         for i, (role, msg) in enumerate(self.messages[self.offset:]):
+             if i % 2 == 0:
+                 if type(msg) is tuple:
+                     msg, image, image_process_mode = msg
+                     img_b64_str = self.process_image(
+                         image, "Default", return_pil=False,
+                         image_format='JPEG')
+                     img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
+                     msg = img_str + msg.replace('<image>', '').strip()
+                     ret.append([msg, None])
+                 else:
+                     ret.append([msg, None])
+             else:
+                 ret[-1][-1] = msg
+         return ret
+
+     def to_gradio_chatbot_public(self):
+         ret = []
+         for i, (role, msg) in enumerate(self.messages[self.offset:]):
+             if i % 2 == 0:
+                 if type(msg) is tuple:
+                     msg, image, image_process_mode, audio_input = msg
+                     ret_msg = ""
+                     if image is not None:
+                         img_b64_str = self.process_image(
+                             image, "Default", return_pil=False,
+                             image_format='JPEG')
+                         img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
+                         ret_msg += img_str
+                     if audio_input is not None:
+                         audio_b64_str = base64.b64encode(open(audio_input, "rb").read()).decode("utf-8")
+                         audio_str = f'<audio src="data:audio/wav;base64,{audio_b64_str}" controls ></audio>'
+                         ret_msg += audio_str
+                     else:
+                         ret_msg += msg.replace('<image>', '').replace(tts_format, '').strip()
+                     ret.append([ret_msg, None])
+                 else:
+                     ret.append([msg, None])
+             else:
+                 if type(msg) is tuple:
+                     audio_b64_str = base64.b64encode(open(msg[1], "rb").read()).decode("utf-8")
+                     msg = f'<audio src="data:audio/wav;base64,{audio_b64_str}" controls autoplay></audio>'
+                 ret[-1][-1] = msg
+         return ret
+
+     def copy(self):
+         return Conversation(
+             system=self.system,
+             roles=self.roles,
+             messages=[[x, y] for x, y in self.messages],
+             offset=self.offset,
+             sep_style=self.sep_style,
+             sep=self.sep,
+             sep2=self.sep2,
+             version=self.version)
+
+     def dict(self):
+         if len(self.get_images()) > 0:
+             return {
+                 "system": self.system,
+                 "roles": self.roles,
+                 "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
+                 "offset": self.offset,
+                 "sep": self.sep,
+                 "sep2": self.sep2,
+             }
+         return {
+             "system": self.system,
+             "roles": self.roles,
+             "messages": self.messages,
+             "offset": self.offset,
+             "sep": self.sep,
+             "sep2": self.sep2,
+         }
+
+
+ conv_vicuna_v0 = Conversation(
+     system="A chat between a curious human and an artificial intelligence assistant. "
+            "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+     roles=("Human", "Assistant"),
+     messages=(
+         ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
+         ("Assistant",
+             "Renewable energy sources are those that can be replenished naturally in a relatively "
+             "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+             "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+             "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+             "renewable and non-renewable energy sources:\n"
+             "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+             "energy sources are finite and will eventually run out.\n"
+             "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+             "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+             "and other negative effects.\n"
+             "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+             "have lower operational costs than non-renewable sources.\n"
+             "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+             "locations than non-renewable sources.\n"
+             "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+             "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+             "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+             "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
+     ),
+     offset=2,
+     sep_style=SeparatorStyle.SINGLE,
+     sep="###",
+ )
+
+ conv_vicuna_v1 = Conversation(
+     system="A chat between a curious user and an artificial intelligence assistant. "
+            "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+     roles=("USER", "ASSISTANT"),
+     version="v1",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.TWO,
+     sep=" ",
+     sep2="</s>",
+ )
+
+ conv_llama_2 = Conversation(
+     system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+ If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
+     roles=("USER", "ASSISTANT"),
+     version="llama_v2",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.LLAMA_2,
+     sep="<s>",
+     sep2="</s>",
+ )
+
+ conv_llava_llama_2 = Conversation(
+     system="You are a helpful language and vision assistant. "
+            "You are able to understand the visual content that the user provides, "
+            "and assist the user with a variety of tasks using natural language.",
+     roles=("USER", "ASSISTANT"),
+     version="llama_v2",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.LLAMA_2,
+     sep="<s>",
+     sep2="</s>",
+ )
+
+ conv_mpt = Conversation(
+     system="""<|im_start|>system
+ A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
+     roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+     version="mpt",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.MPT,
+     sep="<|im_end|>",
+ )
+
+ conv_llava_plain = Conversation(
+     system="",
+     roles=("", ""),
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.PLAIN,
+     sep="\n",
+ )
+
+ conv_llava_v0 = Conversation(
+     system="A chat between a curious human and an artificial intelligence assistant. "
+            "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+     roles=("Human", "Assistant"),
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.SINGLE,
+     sep="###",
+ )
+
+ conv_llava_v0_mmtag = Conversation(
+     system="A chat between a curious user and an artificial intelligence assistant. "
+            "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+            "The visual content will be provided with the following format: <Image>visual content</Image>.",
+     roles=("Human", "Assistant"),
+     messages=(
+     ),
+     offset=0,
+     sep_style=SeparatorStyle.SINGLE,
+     sep="###",
+     version="v0_mmtag",
+ )
+
+ conv_llava_v1 = Conversation(
+     system="A chat between a curious human and an artificial intelligence assistant. "
+            "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+     roles=("USER", "ASSISTANT"),
+     version="v1",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.TWO,
+     sep=" ",
+     sep2="</s>",
+ )
+
+ conv_llava_v1_mmtag = Conversation(
+     system="A chat between a curious user and an artificial intelligence assistant. "
+            "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+            "The visual content will be provided with the following format: <Image>visual content</Image>.",
+     roles=("USER", "ASSISTANT"),
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.TWO,
+     sep=" ",
+     sep2="</s>",
+     version="v1_mmtag",
+ )
+
+ conv_mistral_instruct = Conversation(
+     system="",
+     roles=("USER", "ASSISTANT"),
+     version="llama_v2",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.LLAMA_2,
+     sep="",
+     sep2="</s>",
+ )
+
+ conv_chatml_direct = Conversation(
+     system="""<|im_start|>system
+ Answer the questions.""",
+     roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+     version="mpt",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.MPT,
+     sep="<|im_end|>",
+ )
+
+ conv_llama3 = Conversation(
+     system="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.""",
+     roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
+     version="llama3",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.MPT,
+     sep="<|eot_id|>",
+ )
+
+ conv_llama3_demo = Conversation(
+     system="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. Your name is emova, and you are purely developed by the emova Team.""",
+     roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
+     version="llama3_demo",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.MPT,
+     sep="<|eot_id|>",
+ )
+
+ conv_llama3_without_system = Conversation(
+     system="",
+     roles=("<|start_header_id|>user<|end_header_id|>\n\n", "<|start_header_id|>assistant<|end_header_id|>\n\n"),
+     version="llama3_without_system",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.MPT,
+     sep="<|eot_id|>",
+ )
+
+ conv_llama3_without_systemV2 = Conversation(
+     system="",
+     roles=("user:", "assistant:"),
+     version="llama3_without_systemv2",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.MPT,
+     sep="\n\n",
+ )
+
+ conv_qwen2 = Conversation(
+     system='<|im_start|>system\nYou are a helpful assistant.',
+     roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+     version="qwen2",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.MPT,
+     sep="<|im_end|>\n",
+ )
+
+ conv_qwen2_demo = Conversation(
+     system='<|im_start|>system\nYou are a helpful assistant. Your name is emova, and you are purely developed by the emova Team.',
+     roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+     version="qwen2_demo",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.MPT,
+     sep="<|im_end|>\n",
+ )
+
+ conv_glm4 = Conversation(
+     system='[gMASK]<sop>',
+     roles=("<|user|>", "<|assistant|>"),
+     version="glm4",
+     messages=(),
+     offset=0,
+     sep_style=SeparatorStyle.GLM4,
+     sep="\n",
+ )
+
+
+ default_conversation = conv_vicuna_v1
+ conv_templates = {
+     "default": conv_vicuna_v0,
+     "v0": conv_vicuna_v0,
+     "v1": conv_vicuna_v1,
+     "vicuna_v1": conv_vicuna_v1,
+     "llama_2": conv_llama_2,
+     "mistral_instruct": conv_mistral_instruct,
+     "chatml_direct": conv_chatml_direct,
+     "mistral_direct": conv_chatml_direct,
+
+     "plain": conv_llava_plain,
+     "v0_plain": conv_llava_plain,
+     "llava_v0": conv_llava_v0,
+     "v0_mmtag": conv_llava_v0_mmtag,
+     "llava_v1": conv_llava_v1,
+     "v1_mmtag": conv_llava_v1_mmtag,
+     "llava_llama_2": conv_llava_llama_2,
+     "llama3": conv_llama3,
+     "llama3_demo": conv_llama3_demo,
+     "llama3_without_system": conv_llama3_without_system,
+     "conv_llama3_without_systemV2": conv_llama3_without_systemV2,
+
+     "mpt": conv_mpt,
+     "qwen2": conv_qwen2,
+     "qwen2_demo": conv_qwen2_demo,
+     "glm4": conv_glm4,
+ }
+
+
+ if __name__ == "__main__":
+     print(default_conversation.get_prompt())
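
As used by app.py above, the demo keeps a single-round `Conversation` and renders it with `to_gradio_chatbot_public`. A minimal illustration of that flow is below; the values are placeholders, and the 4-tuple mirrors what `add_text` stores for a user turn.

    # Illustrative use of the Conversation state, mirroring app.py (placeholder values).
    from PIL import Image

    pil_image = Image.new("RGB", (336, 336))  # placeholder image
    state = default_conversation.copy()       # conv_vicuna_v1 template
    state.append_message(state.roles[0], ("What is shown here?", pil_image, "Default", None))  # (text, image, process_mode, audio)
    state.append_message(state.roles[1], "# Tentative Response\n\n...")
    pairs = state.to_gradio_chatbot_public()  # [[user_html, assistant_text], ...] pairs for gr.Chatbot
    question = state.messages[0][1][0]        # recover the raw question text from the stored tuple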
examples/icon_256.png ADDED
examples/image-text/demo_example.jpg ADDED
examples/user_avator.png ADDED
gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
requirements.txt ADDED
@@ -0,0 +1,30 @@
+ # requirements.txt records the full set of dependencies for development
+ torch==2.6.0
+ accelerate
+ codetiming
+ datasets
+ dill
+ # flash-attn
+ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
+ hydra-core
+ liger-kernel
+ numpy
+ pandas
+ datasets
+ peft
+ pyarrow>=15.0.0
+ pybind11
+ pylatexenc
+ pylint==3.3.6
+ qwen_vl_utils
+ ray[default]
+ tensordict<=0.6.2
+ torchdata
+ transformers
+ vllm==0.8.2
+ wandb
+ word2number
+ math_verify
+ mathruler
+ tensorboard
+ transformers==4.51.0