{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/dscilab_dungvo/workspace/bin/envs/lmdeploy/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "import datasets, huggingface_hub\n", "disk_path ='/dscilab_dungvo/workspace/BA-PRE_THESIS/dataset_pretraining/SYNTH-PEDES/annotation_english_vietnamese_processed'\n", "dataset = datasets.load_from_disk(disk_path)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "CUDA_VISIBLE_DEVICES=0 python inference.py --model_path \"deepseek-ai/deepseek-vl2-small\" --chunk_size 512\n", "CUDA_VISIBLE_DEVICES=0,1,2 python inference.py --model_path \"deepseek-ai/deepseek-vl2\"" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "# Base64\n", "import requests\n", "from PIL import Image\n", "from io import BytesIO\n", "import base64\n", "from openai import OpenAI\n", "from langchain_community.llms import VLLMOpenAI\n", "from langchain_openai import ChatOpenAI\n", "from langchain_core.messages import HumanMessage, SystemMessage\n", "from langchain_core.prompts.chat import (\n", " ChatPromptTemplate,\n", " HumanMessagePromptTemplate,\n", " SystemMessagePromptTemplate,\n", ")\n", "\n", "\n", "PORT = 19400\n", "client = OpenAI(api_key=\"YOUR_API_KEY\", base_url=f\"http://0.0.0.0:{PORT}/v1\")\n", "model_name = client.models.list().data[0].id\n", "\n", "inference_server_url = f\"http://0.0.0.0:{PORT}/v1\"\n", "\n", "llm = ChatOpenAI(\n", " model=model_name,\n", " openai_api_key=\"EMPTY\",\n", " openai_api_base=inference_server_url,\n", " max_tokens=2000,\n", " # temperature=0.1,\n", " # top_p=0.8,\n", " temperature=0.05,\n", " top_p=0.9,\n", ")\n", "\n", "def make_message(pil_image):\n", "\n", " # INSERT THIS ...\n", " buffered = BytesIO()\n", " pil_image.save(buffered, format=\"JPEG\")\n", " img_str = base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n", " img_str = str(img_str)\n", " message = HumanMessage(\n", " content=[\n", " {\"type\": \"text\", \"text\": \"Describe the image\"},\n", " {\"type\": \"image_url\", \"image_url\": {\"url\": 'data:image/jpeg;base64,' + img_str}},\n", " ],\n", " )\n", " return message\n", "# response = llm.invoke([message], temperature=0.1, top_p=0.9)\n", "# response\n", "def get_answer(chain, message):\n", " response = chain.invoke([message], temperature=0.1, top_p=0.9)\n", " return response.content\n" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'OpenGVLab/InternVL2_5-8B-AWQ'" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model_name" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "example_image = dataset[1000]['image']\n", "message = make_message(example_image)\n", "response = get_answer(message)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'The image shows a person from behind walking on a tiled floor. The person is wearing a dark shirt and dark pants. The lighting is dim, and there is a bright screen or display in the background. 
The person appears to be holding something in their right hand.'" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "response" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[SystemMessage(content='You are a helpful assistant who helps the user caption an image of a person taken from a surveillance camera. Please provide the caption in detail.'),\n", " HumanMessage(content='Describe the image')]" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "init_prompt = ChatPromptTemplate(\n", " [\n", " (\n", " \"system\",\n", " \"You are a helpful assistant who is helping the user write a clear prompt for guiding a Multimodal Large Language Model (MLLM) to describe the image.\",\n", " ),\n", " (\n", " \"user\",\n", " \"\"\"I want the MLLM to provide a detailed, fine-grained description of an image of a person, taken from a surveillance camera. The model must cover these aspects:\n", " - The gender, pose, appearance, and age of the person in the image.\n", " - The region of the head, face, and items such as hats, glasses, helmets, etc.\n", " - Characteristics of the upper body, such as a red shirt, blue and white jacket, etc.\n", " - Characteristics of the lower body, such as black jeans, white skirt, etc.\n", " - Characteristics of accessories the person is holding, such as a phone, bag, etc.\n", " - Characteristics of the footwear, such as shoes, sandals, etc.\n", " - The location of the person and objects in the image, such as in the park, on the street, in the house, etc.\n", " - The transportation in the image, such as a car, bike, bus, etc.\n", " - The time of day or lighting conditions.\n", " - The weather conditions, such as sunny, rainy, etc.\n", " - Any notable actions or activities the person is engaged in.\n", " \n", " For the objects that occur in the image or on the person, please provide a detailed description of the object, such as the color, shape, size, and any other relevant details.\n", " Please generate three example templates to help the model describe the image in detail. For example:\n", " EX1: \"The [gender] [age] person is wearing a [color] [type of clothing] and holding a [object] in the [location]. [He/She] is standing next to a [object] and [object]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\n", " EX2: \"The [gender] [age] person is [action] while wearing a [color] [type of clothing]. [He/She] is holding a [object] and is located in the [location]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\n", " EX3: \"In the [location], the [gender] [age] person is seen wearing a [color] [type of clothing] and holding a [object]. [He/She] is next to a [object] and [object]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\n", " \"\"\"\n", " ),\n", " (\n", " \"user\",\n", " [\n", " {\n", " \"type\": \"image_url\",\n", " \"image_url\": {\"url\": \"data:image/jpeg;base64,{image_data}\"},\n", " }\n", " ],\n", " )\n", " ]\n", ")\n", "\n", "\n", "\n", "\n", "extract_prompt = ChatPromptTemplate(\n", " [\n", " (\n", " \"system\",\n", " \"You are a helpful assistant who helps the user caption an image of a person taken from a surveillance camera. 
Please provide the caption in detail.\",\n", " ),\n", " (\n", " \"user\",\n", " \"{guide}\",\n", " ),\n", " ]\n", ")\n", "\n", "extract_prompt.format_messages(guide=\"Describe the image\")\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Compose the prompt template with the chat model using the LCEL pipe operator\n", "chain = init_prompt | llm\n", "\n", "response = chain.invoke({\"image_data\": get_str_img(example_image)})" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[SystemMessage(content='You are a helpful assistant who is helping the user write a clear prompt for guiding a Multimodal Large Language Model (MLLM) to describe the image.'),\n", " HumanMessage(content='I want the MLLM to provide a detailed, fine-grained description of an image of a person, taken from a surveillance camera. The model must cover these aspects:\\n - The gender, pose, appearance, and age of the person in the image.\\n - The region of the head, face, and items such as hats, glasses, helmets, etc.\\n - Characteristics of the upper body, such as a red shirt, blue and white jacket, etc.\\n - Characteristics of the lower body, such as black jeans, white skirt, etc.\\n - Characteristics of accessories the person is holding, such as a phone, bag, etc.\\n - Characteristics of the footwear, such as shoes, sandals, etc.\\n - The location of the person and objects in the image, such as in the park, on the street, in the house, etc.\\n - The transportation in the image, such as a car, bike, bus, etc.\\n - The time of day or lighting conditions.\\n - The weather conditions, such as sunny, rainy, etc.\\n - Any notable actions or activities the person is engaged in.\\n \\n For the objects that occur in the image or on the person, please provide a detailed description of the object, such as the color, shape, size, and any other relevant details.\\n Please generate three example templates to help the model describe the image in detail. For example:\\n EX1: \"The [gender] [age] person is wearing a [color] [type of clothing] and holding a [object] in the [location]. [He/She] is standing next to a [object] and [object]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\\n EX2: \"The [gender] [age] person is [action] while wearing a [color] [type of clothing]. [He/She] is holding a [object] and is located in the [location]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\\n EX3: \"In the [location], the [gender] [age] person is seen wearing a [color] [type of clothing] and holding a [object]. [He/She] is next to a [object] and [object]. The [upper body clothing] is [color] and [lower body clothing] is [color]. 
[He/She] is wearing [accessories] and [shoes].\"\\n '),\n", " HumanMessage(content=[{'type': 'image_url', 'image_url': {'url': 'data:image/jpeg;base64,/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAgGBgcGBQgHBwcJCQgKDBQNDAsLDBkSEw8UHRofHh0aHBwgJC4nICIsIxwcKDcpLDAxNDQ0Hyc5PTgyPC4zNDL/2wBDAQkJCQwLDBgNDRgyIRwhMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjIyMjL/wAARCADwAFgDASIAAhEBAxEB/8QAHwAAAQUBAQEBAQEAAAAAAAAAAAECAwQFBgcICQoL/8QAtRAAAgEDAwIEAwUFBAQAAAF9AQIDAAQRBRIhMUEGE1FhByJxFDKBkaEII0KxwRVS0fAkM2JyggkKFhcYGRolJicoKSo0NTY3ODk6Q0RFRkdISUpTVFVWV1hZWmNkZWZnaGlqc3R1dnd4eXqDhIWGh4iJipKTlJWWl5iZmqKjpKWmp6ipqrKztLW2t7i5usLDxMXGx8jJytLT1NXW19jZ2uHi4+Tl5ufo6erx8vP09fb3+Pn6/8QAHwEAAwEBAQEBAQEBAQAAAAAAAAECAwQFBgcICQoL/8QAtREAAgECBAQDBAcFBAQAAQJ3AAECAxEEBSExBhJBUQdhcRMiMoEIFEKRobHBCSMzUvAVYnLRChYkNOEl8RcYGRomJygpKjU2Nzg5OkNERUZHSElKU1RVVldYWVpjZGVmZ2hpanN0dXZ3eHl6goOEhYaHiImKkpOUlZaXmJmaoqOkpaanqKmqsrO0tba3uLm6wsPExcbHyMnK0tPU1dbX2Nna4uPk5ebn6Onq8vP09fb3+Pn6/9oADAMBAAIRAxEAPwDxIRnNPKGri2zE8jFSfZ8dawvoIzTGaXyzV9ocVCVIPIpILFKQlBjvUJdj1JNLLnzGz60ytkgHpM6EYP51o20wmBDgBuxrMq9psZknIAyqjLUpLQqKLFwpMQB7Gn2Qy2T2oKSGXyn/AJ16DeeA9L0zwiNRfWE+3GBZVgGCDk9K5KtaMLJ9TqhSurnK3lzFdtGRDsZEC5B60VSjy0645HrRVWQ9tCyY8N0pjJ0q9LAUySMH3qnJj1q2cEJ3IWQVEyKQae3XrTCpPc800aXRh3IxO3FRGr2oR4IcHrwRVEiuiJIsal3Cjua6XT9BmktYpGdI0fksW5C+uKwbTHnLkgEnAJ6V7P8ADrwZpfiS9kttXXz1t7RGXypSMMWOeRWVRttJG9NK12efTaJcxTFYWSZB0bcBmp3tdQlCLIkSIg6B8/jX0MfhF4R24FncKPa4b/GkHwh8IgcWlx/4ENUuinqy1XtsfPkNi4bkxg+xor32X4M+EpRjyrxP924NFV7PsT7RM8T8RG9+1XDXob7T5hEgbqDn2rmmLE8muq14/vZFbczFiSTySa5i6KwqzcgnsRUR1PCoYn2yuiJZoYZBI+SB2FQXerPcWqW6RhEj6HuaqOxYnJqI9K6Ixsd8bkZBPWkwKcaSrSLuJiuh8N+LNV8L3X2nS5/LkPUHlWHcEVz9AosmNSaPofwz8drecxwa9ZeWWIH2iA5H1KmvYbO7gv7SK7tpFkglUMjr0Ir4dimKEV9B/AzxZFcaXcaBcSYmt8zQgn7yHrj6cfnSaLWp7FNKsUTOTwoyaK47V9Zhae5tFyrPwzeoxRXO6h6FLBSlG7PBfELzGVnEbHLcYFczqpKukR6hcn6mu+mv7e0t5ZplVwikqCO9ea3dybid5GGNxJxmpoq+qPAo4eNJWiisxxTc0E03NdZ0oWkNANGaZQhFJ0oJppNAC5Oa2PDuv3fh7WbXUbRsSQPuI/vDuKxRT8kdDSZcXZn0tFqkGspBcxWwU3OJd2/PXnp2oriPhPqi3SjS5dzS2+ZFYH+Dj+RP60VyuLue9GceVWOH8Q3F26KCCsIOScVzrMSa9CimguU3owY453Y6VyWuwWyajItsqqMfNt6ZopNRVkeJKhJPVGMWycUlW4NOkntLm6U4S3ALcHucVUNdKdzJqwUUg6UpOKoBDSUUUDEpRz0ppBFX9KskutRtYpnKRSyojEdQCQM/rUt2GkbPgbX5fDvi2xvwQYlcJMB3jbhv5/pRX0toHwr8J6CEeHTlupl5Et0fMIPsDwKKhs2VVpWPmiKCUwSSpzHGQrMDWZMx3NXremaRYReGr25lt48MzOqEbsAZ7dq8puADK+Om49frXJTkdbxCqx2Og0WC3b4e64zxDzndcSdxg5riFzxXqGh2duPhjrVzPESY1YxsGxhu3868vXj610wdzhqQaeo7tSHpS0h6VsjG1hKKKKZQh5rS0mY299bTg5MciuB64OazsVasgfOXPTNZVNjWnue+P8bNVGRDoNuRnAZ5zz+Q4orxy4Sa4l/0YMqY+7uJx+dFY3ZTjG56lJOU8NzQJ90KUwfqRXkc64nkx616A2h37SyGctNEzFjG8+ee2OK5SwsVu/EMFkwwJLhYyPb0rlU9Ga0oWsdp4pjGk/CWG2C7WuWjUj1Od39K8dP3jkYOelev/F8lNE0yKP7izHC+mFxXkABPXrXVhneNxYrR2A0UdKK6zjENFFFACir9upx8vWqFaVodqk46rxWVQ0hubmiqGndZHUDbkE8UVmWc6iPMn3kJx70VjdG1z22SxnTcxAwOTXkTyNF4hZlOCLgEfmK9t1gyRaTdsh+cQvj67TXz/NM0kpkz8+ST9c152HbmmejOmoNHe/EqWeTwtYvI+4NOvGP9mvLB6mvWvFenXN54B01nuBuVldgV7EfWvOv7GduRPHj1r0MK1y2OHF6yMrHNIa1v7Gx1uox+FJ/ZEQ+9djH0rsucVjJFLWodNtuguz+Qpy6NFjIu8/RalsLGQc9a7rw34PfW9IjvheLCGJXaV9DisGLRUZtu6ST02ivSfA9rJBoLQlHVVmfbvGCRxXNXnZaHTQhdmbH8ObdOH1JyCeQFortvKOTRXE6kjs9mi7cXcUsbxupKMCpA9DXA614JtJY5JNM3pKQSI3bIY+ldS8jYpEckisaa5XobylzLUjug6+EpLORVWSOzZWyM4IBrwVriYtzK/wCdfQzBXd0cZVsgj1FfP2pIItRuIlxhJXUY/wB4134d7nDiI9SuZXJ5Zj9TSFieppKCK6zjsGaesjjgOwHsaZRkjmkwSPV/hXKqaXf7yxZphz9F/wDr13rXMQzwx+grifhfZyTeH55ET5fOOWz3GP8A61d6tlIB1FcVRNs9KilyoqG4XsjUVbNkcctRWXIzaxnSQEA0xY8YNXH6GosVmkCehHjBFeC69bva69fQyfeWZs/nmvfQMsBXg/ie4+1eJdQm27d07cZzjHH9K68PucuI2MuugNhDL4GhvVX9/FdOrn/ZI4/Wue71NHbTSW81wiZihx5jZ6ZOB+tdhyX0IaKKVSA2WGQBSRKPb/hF
bsvhOZyRhrliB+Arv2jwK5b4Y2n2TwPZ5BzMzSHI9Tjj8q69sYrCW56VNe6imykUVI45NFSaGA8oAqEy5HXFQtKo71F5y461y2JjItNKY43kHJVSePpXz5M264kPqx/nXuF/Oh0y6BPBifI/A14WfvH6110DlxDA1raXIn9j6zCx5eGNlHuJB/jWTV7TuY9QX/p1J/JlrpOdMpUZOCB/FxSVPaQNdX1vboMvJKqAfU0CW59NeGrVrXwzpkDH547dR6dulaRYbR1xjg+/pUUOIrWFDlAqKj+qsBjIpWlO5txGQfmAP/j1YTR6NPYazcUVXlnVVO7r/P3oqLGhxhfIppYAVWLECk3n1rBHIpMr6+C/hzUNjYYQsePavIcc169qAM2mXUQ6vEw/SvIcYJHpXVS2Mql2wHFWrG4+zSysTw8Ekf8A30pFVqK2MkgrW8L2aah4o021dmVXnXJHtzWR3Fb3g60a98WadGjlGEobcO2Kb2KjufR/QsJcMV+ViOBjsaikVDGDkFl5Ge4pnmlU3Mh8xD5cik/eHY/kaGyF8pSPl5ic9x6VizvhsUrkBwpHRuUPofSiknYYZmGEYcj+6fWikWcSZc03zKrF6YZDXOcKLhYSBkJ+8Cv5ivJrqMwXcsRGCjkc/WvTN7dQeRXC+Jrcwaw8hOVmG8H+db0hS2MinpE8iSMo+WNdzH05xTK2La3hTwvcXbN++kkEYGOw5roMzG716F8JrVJtdvZTjfFANh9CT1rz016N8LrJ1e/1Vc7oAqqoH3h1NTLYuG566ylpVn9fkk/lSv8AKvk7gGGTGfWmG4TdC4OYboY3ehxVeeUspjwfNhO4c8kVkdqWglzKHG88AjEi+/rRVG7vUYLOobZJ8rADoaKm4XOI4703IVgaUjmmMKyOUkfiU8Vj+ItNi1C3Q+ZtmXhBjO72rZlZSykegrF1slo1wehrSDsxSVzjRbEZB7GmyM6x+VuPlg7tvbNX5MkkBCfpVZreSRvulfrW6kRylTGRXq3wnISwuhIF2TSlc+4HSvL/ACJAcYyfQV7T4G0pbLwuqfKZ2fznKnoccCiT0HBanSWgDJNpkn3ozuiJ7jtTXclUuwDviJDqO9F/JstrfUYVYyQHLgfxL3FR3Vwsdwk68wXA+cdgexrM676Fe5McUhA5imGR/smiq0+3zXtJfutyh9P/AK9FQK5yo4pDjFQPdIuT1qlLes/AXA9qzOd7k9zdeWVAHese7kMpbPep7hHdVIJzmnxWO/Bc4FWkMz7KFvOzUt6Q8y7h90YrSZYIDhetY935skzbFyatMkiih/06OTblEO5h7CvSvDmqy72nTKwykoQxzivOtMt72a62sQqdwRya7vSImj0OWLH72M8j3FNscdzp4byWO6e1bJjlBIyenrVO3kOLjTZm3GPJQ+x5GPpVe4u2msYryPh0IOAO3Qim38oja21CLkZCPkfwn/Cg15hQ0k9v8x/fxHAPvRTLl1inE6fdbqR6UVFxn//Z'}}])]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_str_img(pil_image):\n", " buffered = BytesIO()\n", " pil_image.save(buffered, format=\"JPEG\")\n", " img_str = base64.b64encode(buffered.getvalue()).decode(\"utf-8\")\n", " img_str = str(img_str)\n", " return img_str\n", "\n", "\n", "\n", "init_prompt.format_messages(\n", " image_data=get_str_img(example_image)\n", ")" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "init_chain = init_prompt | llm\n", "response = init_chain.invoke(input={\"image_data\": get_str_img(example_image)})" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "AIMessage(content='To guide a Multimodal Large Language Model (MLLM) to provide a detailed description of the image, you can use the following templates. These templates are designed to cover all the aspects you mentioned, ensuring a comprehensive and clear description:\\n\\n### Template 1:\\n\"The [gender] [age] person is seen from behind, wearing a [color] [type of clothing] and [type of pants]. [He/She] is walking on a [surface] and appears to be in a [location]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\\n\\n### Template 2:\\n\"The [gender] [age] person is walking while wearing a [color] [type of clothing] and [type of pants]. [He/She] is holding a [object] and is located in the [location]. The [upper body clothing] is [color] and [lower body clothing] is [color]. [He/She] is wearing [accessories] and [shoes].\"\\n\\n### Template 3:\\n\"In the [location], the [gender] [age] person is seen from behind, wearing a [color] [type of clothing] and [type of pants]. [He/She] is walking on a [surface] and appears to be in a [location]. The [upper body clothing] is [color] and [lower body clothing] is [color]. 
[He/She] is wearing [accessories] and [shoes].\"\\n\\nThese templates provide a structured format for the MLLM to describe the image, ensuring that all relevant details are covered.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 350, 'prompt_tokens': 1649, 'total_tokens': 1999, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'OpenGVLab/InternVL2_5-8B-AWQ', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-eb1871d9-302e-45a6-a6c5-5b2f425f7c3b-0', usage_metadata={'input_tokens': 1649, 'output_tokens': 350, 'total_tokens': 1999})" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "response" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" } }, "nbformat": 4, "nbformat_minor": 4 }