import json
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Dict, Iterator, List

import gradio as gr
import requests
from huggingface_hub import hf_hub_download

from themes.research_monochrome import theme

today_date = datetime.today().strftime("%B %-d, %Y")  # noqa: DTZ002

SYS_PROMPT = f"""Today's Date: {today_date}. You are Gemma3, developed by Google. You are a helpful AI assistant."""
TITLE = "Gemma3 1b instruct IQ4_XS from a local GGUF server using the BPP library."
DESCRIPTION = """

Gemma3 1b instruct is an open-weight LLM supporting a 128K context window. This demo uses only a 2K context.

The BPP library implements matrix multiplication with far fewer multiplications.

It will run much faster if you duplicate this Space for your own use.

Original Space by TobDeBers, GGUF by Bartowski. We are not sure it is the ideal quantization, but we noticed that IQ4_XS runs faster.

Running on CPU, please be patient!

""" LLAMA_CPP_SERVER = "http://127.0.0.1:8081" MAX_NEW_TOKENS = 512 TEMPERATURE = 1 TOP_P = 0.95 TOP_K = 64 REPETITION_PENALTY = 1.05 MIN_P = 0.0 # download GGUF into local directory gguf_path = hf_hub_download( repo_id="bartowski/google_gemma-3-1b-it-GGUF", filename="google_gemma-3-1b-it-IQ4_XS.gguf", local_dir="." ) # start llama-server subprocess.run(["chmod", "+x", "llama-server"]) command = ["./llama-server", "-m", "google_gemma-3-1b-it-IQ4_XS.gguf", "-ngl", "0", "--temp", "0.0", "-c", "2048", "-t", "8", "--port", "8081"] process = subprocess.Popen(command) print(f"Llama-server process started with PID {process.pid}") def generate( message: str, chat_history: List[Dict], temperature: float = TEMPERATURE, repetition_penalty: float = REPETITION_PENALTY, top_p: float = TOP_P, top_k: float = TOP_K, min_p: float = MIN_P, max_new_tokens: int = MAX_NEW_TOKENS, ) -> Iterator[str]: """Generate function for chat demo using Llama.cpp server.""" # Build messages conversation = [] conversation.append({"role": "system", "content": SYS_PROMPT}) conversation += chat_history conversation.append({"role": "user", "content": message}) # Prepare the prompt for the Llama.cpp server prompt = "" for item in conversation: if item["role"] == "system": prompt += f"<|system|>\n{item['content']}\n<|file_separator|>\n" elif item["role"] == "user": prompt += f"<|user|>\n{item['content']}\n<|file_separator|>\n" elif item["role"] == "assistant": prompt += f"<|model|>\n{item['content']}\n<|file_separator|>\n" prompt += "<|model|>\n" # Add the beginning token for the assistant # Construct the request payload payload = { "prompt": prompt, "stream": True, # Enable streaming "max_tokens": max_new_tokens, "temperature": temperature, "repeat_penalty": repetition_penalty, "top_p": top_p, "top_k": top_k, "min_p": min_p, "stop": ["<|file_separator|>"], # stops after it sees this } try: # Make the request to the Llama.cpp server with requests.post(f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=60) as response: response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx) # Stream the response from the server outputs = [] for line in response.iter_lines(): if line: # Decode the line decoded_line = line.decode('utf-8') # Remove 'data: ' prefix if present if decoded_line.startswith("data: "): decoded_line = decoded_line[6:] # Handle potential JSON decoding errors try: json_data = json.loads(decoded_line) text = json_data.get("content", "") # Extract content field. crucial. if text: outputs.append(text) yield "".join(outputs) except json.JSONDecodeError: print(f"JSONDecodeError: {decoded_line}") # Handle the error, potentially skipping the line or logging it. 
def generate(
    message: str,
    chat_history: List[Dict],
    temperature: float = TEMPERATURE,
    repetition_penalty: float = REPETITION_PENALTY,
    top_p: float = TOP_P,
    top_k: float = TOP_K,
    min_p: float = MIN_P,
    max_new_tokens: int = MAX_NEW_TOKENS,
) -> Iterator[str]:
    """Generate function for the chat demo using the llama.cpp server."""
    # Build the conversation: system prompt, prior turns, then the new user message.
    conversation = [{"role": "system", "content": SYS_PROMPT}]
    conversation += chat_history
    conversation.append({"role": "user", "content": message})

    # Flatten the conversation into the plain-text prompt sent to the llama.cpp server.
    prompt = ""
    for item in conversation:
        if item["role"] == "system":
            prompt += f"<|system|>\n{item['content']}\n<|file_separator|>\n"
        elif item["role"] == "user":
            prompt += f"<|user|>\n{item['content']}\n<|file_separator|>\n"
        elif item["role"] == "assistant":
            prompt += f"<|model|>\n{item['content']}\n<|file_separator|>\n"
    prompt += "<|model|>\n"  # Opening tag for the assistant's reply.

    # Construct the request payload.
    payload = {
        "prompt": prompt,
        "stream": True,  # Enable streaming.
        "n_predict": max_new_tokens,  # Native llama.cpp /completion field for the token cap.
        "max_tokens": max_new_tokens,  # Kept as well for builds that alias the OpenAI-style name.
        "temperature": temperature,
        "repeat_penalty": repetition_penalty,
        "top_p": top_p,
        "top_k": top_k,
        "min_p": min_p,
        "stop": ["<|file_separator|>"],  # Stop generation when this separator appears.
    }

    try:
        # Stream the completion from the llama.cpp server.
        with requests.post(f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=60) as response:
            response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx).

            outputs = []
            for line in response.iter_lines():
                if not line:
                    continue
                decoded_line = line.decode("utf-8")
                # Server-sent events are prefixed with "data: ".
                if decoded_line.startswith("data: "):
                    decoded_line = decoded_line[6:]
                try:
                    json_data = json.loads(decoded_line)
                    text = json_data.get("content", "")  # Extract the generated text chunk.
                    if text:
                        outputs.append(text)
                        yield "".join(outputs)
                except json.JSONDecodeError:
                    print(f"JSONDecodeError: {decoded_line}")  # Skip malformed lines.
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        yield f"Error: {e}"  # Surface the error to the user.
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        yield f"Error: {e}"
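
# For illustration: a single user turn ("Hi") with no history produces a prompt of the
# form below (these tags come from the hand-rolled template above, not from Gemma's
# official chat template):
#
#   <|system|>
#   Today's Date: ... You are Gemma3, developed by Google. You are a helpful AI assistant.
#   <|file_separator|>
#   <|user|>
#   Hi
#   <|file_separator|>
#   <|model|>
#
# Generation then stops as soon as the model emits <|file_separator|>.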

css_file_path = Path(__file__).parent / "app.css"

# Advanced settings (displayed in an Accordion).
temperature_slider = gr.Slider(
    minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"]
)
top_p_slider = gr.Slider(
    minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"]
)
top_k_slider = gr.Slider(
    minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"]
)
min_p_slider = gr.Slider(
    minimum=0, maximum=1.0, value=MIN_P, step=0.01, label="Min P", elem_classes=["gr_accordion_element"]
)
repetition_penalty_slider = gr.Slider(
    minimum=0,
    maximum=2.0,
    value=REPETITION_PENALTY,
    step=0.05,
    label="Repetition Penalty",
    elem_classes=["gr_accordion_element"],
)
max_new_tokens_slider = gr.Slider(
    minimum=1,
    maximum=2000,
    value=MAX_NEW_TOKENS,
    step=1,
    label="Max New Tokens",
    elem_classes=["gr_accordion_element"],
)
chat_interface_accordion = gr.Accordion(label="Advanced Settings", open=False)

with gr.Blocks(fill_height=True, css_paths=css_file_path, theme=theme, title=TITLE) as demo:
    gr.HTML(f"{TITLE}", elem_classes=["gr_title"])
    gr.HTML(DESCRIPTION)
    chat_interface = gr.ChatInterface(
        fn=generate,
        examples=[
            ["Explain the concept of quantum computing to someone with no background in physics or computer science."],
            ["What is OpenShift?"],
            ["What's the importance of low latency inference?"],
            ["Help me boost productivity habits."],
        ],
        example_labels=[
            "Explain quantum computing",
            "What is OpenShift?",
            "Importance of low latency inference",
            "Boosting productivity habits",
        ],
        cache_examples=False,
        type="messages",
        additional_inputs=[
            temperature_slider,
            repetition_penalty_slider,
            top_p_slider,
            top_k_slider,
            min_p_slider,
            max_new_tokens_slider,
        ],
        additional_inputs_accordion=chat_interface_accordion,
    )

if __name__ == "__main__":
    demo.queue().launch()