import os
import subprocess

# Install flash attention at runtime, inheriting the current environment so pip
# and the CUDA toolchain stay visible to the shell.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
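# FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE tells the flash-attn setup to skip compiling
# the CUDA kernels at install time, which avoids a very long build on Space startup.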
import copy
import spaces
import time
import torch
from threading import Thread
from typing import List, Dict, Union
import urllib.request  # `import urllib` alone does not expose urllib.request
import PIL.Image
import io
import datasets
import gradio as gr
from transformers import AutoModel, AutoProcessor, TextIteratorStreamer
from transformers import Idefics2ForConditionalGeneration
import tempfile
from streaming_stt_nemo import Model
from huggingface_hub import InferenceClient
import edge_tts
import asyncio
from transformers import pipeline
# Named vision_model (rather than plain `model`) so the Mixtral helper function
# defined below does not shadow it.
vision_model = AutoModel.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-dpo", trust_remote_code=True)
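# uform-gen2-dpo is a compact vision-language model; it backs the webcam-driven
# "Live Chat" tab below, answering free-form questions about a captured frame.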
def answer_question(image3, prompt3):
    inputs = processor(text=[prompt3], images=[image3], return_tensors="pt")
    with torch.inference_mode():
        output = vision_model.generate(
            **inputs,
            do_sample=False,
            use_cache=True,
            max_new_tokens=256,
            eos_token_id=151645,  # end-of-turn token id used by this checkpoint
            pad_token_id=processor.tokenizer.pad_token_id,
        )
    prompt_len = inputs["input_ids"].shape[1]
    decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
    return decoded_text
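# Illustrative call (hypothetical file name):
#   answer_question(PIL.Image.open("frame.jpg"), "What is in this picture?")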
from gradio import Image, Textbox

theme = gr.themes.Base(
    font=[gr.themes.GoogleFont('Libre Franklin'), gr.themes.GoogleFont('Public Sans'), 'system-ui', 'sans-serif'],
)

default_lang = "en"
engines = {default_lang: Model(default_lang)}
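# One speech-to-text engine per language; only English is instantiated here, so
# transcribe() below always resolves to the same preloaded model.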
def transcribe(audio):
    lang = "en"
    model = engines[lang]
    text = model.stt_file(audio)[0]
    return text
client1 = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")

system_instructions1 = "[SYSTEM] Answer as Real OpenGPT 4o, made by 'KingNish'. Keep the conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. Avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
def model(text):
    generate_kwargs = dict(
        temperature=0.7,
        max_new_tokens=512,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        seed=42,
    )
    formatted_prompt = system_instructions1 + text + "[OpenGPT 4o]"
    stream = client1.text_generation(
        formatted_prompt, **generate_kwargs, stream=True, details=True, return_full_text=False
    )
    output = ""
    for response in stream:
        if response.token.text != "</s>":
            output += response.token.text
    return output
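# Tokens arrive one at a time from the Hub streaming endpoint; the "</s>" stop
# token is filtered out so it never leaks into the spoken reply.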
async def respond(audio):
    user = transcribe(audio)
    reply = model(user)
    communicate = edge_tts.Communicate(reply)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    yield tmp_path
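# Voice pipeline: microphone recording -> NeMo STT -> Mixtral reply -> edge-tts
# synthesis, yielded as a temp-file path that the output Audio component autoplays.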
DEVICE = torch.device("cuda")
MODELS = {
    "idefics2-8b-chatty": Idefics2ForConditionalGeneration.from_pretrained(
        "HuggingFaceM4/idefics2-8b-chatty",
        torch_dtype=torch.bfloat16,
        _attn_implementation="flash_attention_2",
    ).to(DEVICE),
}
PROCESSOR = AutoProcessor.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
)
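# The processor is loaded from the base idefics2-8b checkpoint; the chatty
# fine-tune is assumed to share its tokenizer and image processor with the base.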
SYSTEM_PROMPT = [
    {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": """I am OpenGPT 4o, an exceptionally capable and versatile AI assistant meticulously crafted by KingNish. Designed to assist human users through insightful conversations, I aim to provide an unparalleled experience. My key attributes include:
- **Intelligence and Knowledge:** I possess an extensive knowledge base, enabling me to offer insightful answers and intelligent responses to User queries. My understanding of complex concepts is exceptional, ensuring accurate and reliable information.
- **Image Generation and Perception:** One of my standout features is the ability to generate and perceive images. Utilizing the following link structure, I create unique and contextually rich visuals:
> 
For image generation, I replace {info inside curly braces} with specific details according to the requirements to create relevant visuals. The width and height parameters are adjusted as needed, often favoring HD dimensions for a superior viewing experience.
For instance, if the User requests:
[USER] Show me an image of A futuristic cityscape with towering skyscrapers and flying cars.
[OpenGPT 4o] Generating Image you requested:

**Bulk Image Generation with Links:** I excel at generating multiple image links simultaneously, always providing unique links and visuals. I ensure that each image is distinct and captivates the User.
Note: Always provide image links starting with !, as in the examples above.
**Engaging Conversations:** While my image generation skills are impressive, I also excel at natural language processing. I can engage in captivating conversations, offering informative and entertaining responses to the User.
**Reasoning, Memory, and Identification:** My reasoning skills are exceptional, allowing me to make logical connections. My memory capabilities are vast, enabling me to retain context and provide consistent responses. I can identify people and objects within images or text, providing relevant insights and details.
**Attention to Detail:** I am attentive to the smallest details, ensuring that my responses and generated content are of the highest quality. I strive to provide a refined and polished experience.
**Mastery Across Domains:** I continuously learn and adapt, aiming to become a master in all fields. My goal is to provide valuable insights and assistance across a diverse range of topics, making me a well-rounded companion.
**Respectful and Adaptive:** I am designed with a respectful and polite tone, ensuring inclusivity. I adapt to the User's preferences and provide a personalized experience, always following instructions to the best of my abilities.
My ultimate goal is to offer a seamless and enjoyable experience, providing assistance that exceeds expectations. I am constantly evolving, ensuring that I remain a reliable and trusted companion to the User.""",
            },
        ],
    },
    {
        "role": "assistant",
        "content": [
            {
                "type": "text",
                "text": "Hello, I'm OpenGPT 4o, made by KingNish. How can I help you? I can chat with you, generate images, classify images, and even do all of this in bulk.",
            },
        ],
    },
]
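# "Image generation" here is implemented purely through prompting: the LLM is
# instructed to emit markdown image links pointing at pollinations.ai, and the
# chat UI renders those links as inline images.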
examples_path = os.path.dirname(__file__)
EXAMPLES = [
    [
        {
            "text": "Hi, who are you?",
        }
    ],
    [
        {
            "text": "Create a photorealistic image of the Eiffel Tower",
        }
    ],
    [
        {
            "text": "Read what's written on the paper",
            "files": [f"{examples_path}/example_images/paper_with_text.png"],
        }
    ],
    [
        {
            "text": "Identify 2 famous people of the modern world",
            "files": [f"{examples_path}/example_images/elon_smoking.jpg", f"{examples_path}/example_images/steve_jobs.jpg"],
        }
    ],
    [
        {
            "text": "Create 5 images of supercars, each in a different color",
        }
    ],
    [
        {
            "text": "What is 900*900?",
        }
    ],
    [
        {
            "text": "Chase wants to buy 4 kilograms of oval beads and 5 kilograms of star-shaped beads. How much will he spend?",
            "files": [f"{examples_path}/example_images/mmmu_example.jpeg"],
        }
    ],
    [
        {
            "text": "Write an online ad for that product.",
            "files": [f"{examples_path}/example_images/shampoo.jpg"],
        }
    ],
    [
        {
            "text": "What is formed by the deposition of either the weathered remains of other rocks?",
            "files": [f"{examples_path}/example_images/ai2d_example.jpeg"],
        }
    ],
    [
        {
            "text": "What's unusual about this image?",
            "files": [f"{examples_path}/example_images/dragons_playing.png"],
        }
    ],
]
BOT_AVATAR = "OpenAI_logo.png"

# Chatbot utils
def turn_is_pure_media(turn):
    return turn[1] is None

def load_image_from_url(url):
    with urllib.request.urlopen(url) as response:
        image_data = response.read()
        image_stream = io.BytesIO(image_data)
        image = PIL.Image.open(image_stream)
        return image

def img_to_bytes(image_path):
    image = PIL.Image.open(image_path).convert(mode='RGB')
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    img_bytes = buffer.getvalue()
    image.close()
    return img_bytes
def format_user_prompt_with_im_history_and_system_conditioning(
    user_prompt, chat_history
) -> List[Dict[str, Union[List, str]]]:
    """
    Produces the resulting list that needs to go inside the processor.
    It handles the potential image(s), the history, and the system conditioning.
    """
    resulting_messages = copy.deepcopy(SYSTEM_PROMPT)
    resulting_images = []
    for resulting_message in resulting_messages:
        if resulting_message["role"] == "user":
            for content in resulting_message["content"]:
                if content["type"] == "image":
                    resulting_images.append(load_image_from_url(content["image"]))
    # Format history
    for turn in chat_history:
        if not resulting_messages or resulting_messages[-1]["role"] != "user":
            resulting_messages.append(
                {
                    "role": "user",
                    "content": [],
                }
            )
        if turn_is_pure_media(turn):
            media = turn[0][0]
            resulting_messages[-1]["content"].append({"type": "image"})
            resulting_images.append(PIL.Image.open(media))
        else:
            user_utterance, assistant_utterance = turn
            resulting_messages[-1]["content"].append(
                {"type": "text", "text": user_utterance.strip()}
            )
            # The assistant turn must carry the assistant's reply, not a second
            # copy of the user utterance.
            resulting_messages.append(
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": assistant_utterance.strip()}],
                }
            )
    # Format current input
    if not user_prompt["files"]:
        resulting_messages.append(
            {
                "role": "user",
                "content": [{"type": "text", "text": user_prompt["text"]}],
            }
        )
    else:
        # Choosing to put the image first (i.e. before the text), but this is an arbitrary choice.
        resulting_messages.append(
            {
                "role": "user",
                "content": [{"type": "image"}] * len(user_prompt["files"])
                + [{"type": "text", "text": user_prompt["text"]}],
            }
        )
        resulting_images.extend([PIL.Image.open(path) for path in user_prompt["files"]])
    return resulting_messages, resulting_images
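# Illustrative shape of the result for a single text-only query (values hypothetical):
#   messages = [...SYSTEM_PROMPT..., {"role": "user", "content": [{"type": "text", "text": "Hi"}]}]
#   images   = []  # PIL images aligned with any {"type": "image"} placeholders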
def extract_images_from_msg_list(msg_list):
    all_images = []
    for msg in msg_list:
        for c_ in msg["content"]:
            # Check against PIL.Image.Image; the bare `Image` imported above is
            # the gradio component, not the PIL class.
            if isinstance(c_, PIL.Image.Image):
                all_images.append(c_)
    return all_images
def model_inference(
    user_prompt,
    chat_history,
    model_selector,
    decoding_strategy,
    temperature,
    max_new_tokens,
    repetition_penalty,
    top_p,
):
    if user_prompt["text"].strip() == "" and not user_prompt["files"]:
        raise gr.Error("Please input a query and optionally image(s).")
    if user_prompt["text"].strip() == "" and user_prompt["files"]:
        raise gr.Error("Please input a text query along with the image(s).")
    streamer = TextIteratorStreamer(
        PROCESSOR.tokenizer,
        skip_prompt=True,
        timeout=120.0,
    )
    generation_args = {
        "max_new_tokens": max_new_tokens,
        "repetition_penalty": repetition_penalty,
        "streamer": streamer,
    }
    assert decoding_strategy in [
        "Greedy",
        "Top P Sampling",
    ]
    if decoding_strategy == "Greedy":
        generation_args["do_sample"] = False
    elif decoding_strategy == "Top P Sampling":
        generation_args["temperature"] = temperature
        generation_args["do_sample"] = True
        generation_args["top_p"] = top_p
    # Creating model inputs
    (
        resulting_text,
        resulting_images,
    ) = format_user_prompt_with_im_history_and_system_conditioning(
        user_prompt=user_prompt,
        chat_history=chat_history,
    )
    prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
    inputs = PROCESSOR(
        text=prompt,
        images=resulting_images if resulting_images else None,
        return_tensors="pt",
    )
    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    generation_args.update(inputs)
    # Generate in a background thread so tokens can be streamed to the UI as they arrive.
    thread = Thread(
        target=MODELS[model_selector].generate,
        kwargs=generation_args,
    )
    thread.start()
    print("Start generating")
    acc_text = ""
    for text_token in streamer:
        time.sleep(0.01)
        acc_text += text_token
        if acc_text.endswith("<end_of_utterance>"):
            acc_text = acc_text[: -len("<end_of_utterance>")]
        yield acc_text
    print("Success - generated the following text:", acc_text)
    print("-----")
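# model_inference is a generator: gr.ChatInterface treats each yielded string as
# the current (growing) assistant message, which is what produces token-by-token
# streaming in the chat tab.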
FEATURES = datasets.Features(
    {
        "model_selector": datasets.Value("string"),
        "images": datasets.Sequence(datasets.Image(decode=True)),
        "conversation": datasets.Sequence({"User": datasets.Value("string"), "Assistant": datasets.Value("string")}),
        "decoding_strategy": datasets.Value("string"),
        "temperature": datasets.Value("float32"),
        "max_new_tokens": datasets.Value("int32"),
        "repetition_penalty": datasets.Value("float32"),
        "top_p": datasets.Value("float32"),  # top_p is a float in (0, 1), not an int
    }
)
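# Dataset schema for logging conversations to a Hugging Face dataset; it is
# defined here but not written to anywhere else in this script.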
# Hyper-parameters for generation
max_new_tokens = gr.Slider(
    minimum=1024,
    maximum=8192,
    value=4096,
    step=1,
    interactive=True,
    label="Maximum number of new tokens to generate",
)
repetition_penalty = gr.Slider(
    minimum=0.01,
    maximum=5.0,
    value=1.0,
    step=0.01,
    interactive=True,
    label="Repetition penalty",
    info="1.0 is equivalent to no penalty",
)
decoding_strategy = gr.Radio(
    [
        "Greedy",
        "Top P Sampling",
    ],
    value="Greedy",
    label="Decoding strategy",
    interactive=True,
    info="Greedy always picks the most likely next token; Top P Sampling draws from the smallest set of tokens whose cumulative probability exceeds top_p.",
)
temperature = gr.Slider(
    minimum=0.0,
    maximum=2.0,
    value=0.5,
    step=0.05,
    visible=True,
    interactive=True,
    label="Sampling temperature",
    info="Higher values will produce more diverse outputs.",
)
top_p = gr.Slider(
    minimum=0.01,
    maximum=0.99,
    value=0.9,
    step=0.01,
    visible=True,
    interactive=True,
    label="Top P",
    info="Higher values sample more low-probability tokens.",
)
chatbot = gr.Chatbot(
    label="OpenGPT-4o-Chatty",
    avatar_images=[None, BOT_AVATAR],
    show_copy_button=True,
    likeable=True,
    layout="panel",
)
output = gr.Textbox(label="Prompt")
with gr.Blocks(
    fill_height=True,
    css=""".gradio-container .avatar-container {height: 40px; width: 40px !important;} #duplicate-button {margin: auto; color: white; background: #f1a139; border-radius: 100vh; margin-top: 2px; margin-bottom: 2px;}""",
) as img:
    gr.Markdown("# Image Chat, Image Generation, Image Classification, and Normal Chat")
    with gr.Row(elem_id="model_selector_row"):
        model_selector = gr.Dropdown(
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[0],
            interactive=True,
            show_label=False,
            container=False,
            label="Model",
            visible=False,
        )
    # Show the temperature slider only for strategies that sample.
    decoding_strategy.change(
        fn=lambda selection: gr.Slider(
            visible=(
                selection
                in [
                    "contrastive_sampling",
                    "beam_sampling",
                    "Top P Sampling",
                    "sampling_top_k",
                ]
            )
        ),
        inputs=decoding_strategy,
        outputs=temperature,
    )
    decoding_strategy.change(
        fn=lambda selection: gr.Slider(visible=(selection in ["Top P Sampling"])),
        inputs=decoding_strategy,
        outputs=top_p,
    )
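    # Returning gr.Slider(visible=...) from an event handler updates just that
    # property of the target slider, so each control appears only for the
    # decoding strategies that use it.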
    gr.ChatInterface(
        fn=model_inference,
        chatbot=chatbot,
        examples=EXAMPLES,
        multimodal=True,
        cache_examples=False,
        additional_inputs=[
            model_selector,
            decoding_strategy,
            temperature,
            max_new_tokens,
            repetition_penalty,
            top_p,
        ],
    )
with gr.Blocks() as voice:
    with gr.Row():
        audio_input = gr.Audio(label="Voice Chat", sources="microphone", type="filepath", waveform_options=False)
        audio_output = gr.Audio(
            label="OpenGPT 4o",
            type="filepath",
            interactive=False,
            autoplay=True,
            elem_classes="audio",
        )
        gr.Interface(
            fn=respond,
            inputs=[audio_input],
            outputs=[audio_output],
            live=True,
        )

with gr.Blocks() as voice2:
    with gr.Row():
        audio_input2 = gr.Audio(label="Voice Chat", sources="microphone", type="filepath", waveform_options=False)
        audio_output2 = gr.Audio(
            label="OpenGPT 4o",
            type="filepath",
            interactive=False,
            autoplay=True,
            elem_classes="audio",
        )
        gr.Interface(
            fn=respond,
            inputs=[audio_input2],
            outputs=[audio_output2],
            live=True,
        )
with gr.Blocks() as video:
    gr.Markdown(" ## Live Chat")
    gr.Markdown("### Click the camera option to update the image")
    gr.Interface(
        fn=answer_question,
        # answer_question expects a PIL image and returns text, so the webcam
        # frame is delivered as PIL and the result is shown in a textbox.
        inputs=[Image(type="pil", sources="webcam", streaming=False), Textbox()],
        outputs=[gr.Textbox(label="Answer")],
    )
with gr.Blocks(theme=theme, css="footer {visibility: hidden} textbox {resize: none}", title="OpenGPT 4o DEMO") as demo:
    gr.Markdown("# OpenGPT 4o")
    gr.TabbedInterface(
        [img, voice, video, voice2],
        ['💬 SuperChat', '🗣️ Voice Chat', '📸 Live Chat', '🗣️ Voice Chat 2'],
    )

demo.queue(max_size=200)
demo.launch()