import json
import subprocess
from collections.abc import Iterator
from datetime import datetime
from pathlib import Path
from typing import Dict, List

import gradio as gr
import requests
from huggingface_hub import hf_hub_download

from themes.research_monochrome import theme

today_date = datetime.today().strftime("%B %-d, %Y")  # note: %-d (no zero padding) is platform-dependent (works on Linux)

SYS_PROMPT = f"""Today's Date: {today_date}.
You are Gemma3, developed by Google. You are a helpful AI assistant."""
TITLE = "Gemma3 1B Instruct IQ4_XS from a local GGUF server using the BPP library."

DESCRIPTION = """
<p>Gemma3 1B Instruct is an open-weight LLM supporting a 128K context window. This demo uses only a 2K context.
</p>
<p>The BPP library implements matrix multiplication with far fewer multiplications.
</p>
<p><b><u>It will run much faster if you duplicate this Space for your own use.</u></b>
</p>
<p>Original Space by TobDeBers, GGUF by Bartowski. We are not sure it is the ideal choice, but we noticed that IQ4_XS runs faster.
</p>
<p><b>Running on CPU, please be patient!</b>
</p>
"""

# llama.cpp server endpoint and default sampling parameters.
LLAMA_CPP_SERVER = "http://127.0.0.1:8081"
MAX_NEW_TOKENS = 512
TEMPERATURE = 1.0
TOP_P = 0.95
TOP_K = 64
REPETITION_PENALTY = 1.05
MIN_P = 0.0

# Download the quantized GGUF weights from the Hugging Face Hub into the working directory.
gguf_path = hf_hub_download(
    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
    filename="google_gemma-3-1b-it-IQ4_XS.gguf",
    local_dir=".",
)

# Make the bundled llama-server binary executable and launch it: CPU only
# (-ngl 0), 2K context, 8 threads, listening on port 8081.
subprocess.run(["chmod", "+x", "llama-server"], check=True)
command = [
    "./llama-server", "-m", gguf_path, "-ngl", "0", "--temp", "0.0",
    "-c", "2048", "-t", "8", "--port", "8081",
]
process = subprocess.Popen(command)
print(f"Llama-server process started with PID {process.pid}")


def generate(
    message: str,
    chat_history: List[Dict],
    temperature: float = TEMPERATURE,
    repetition_penalty: float = REPETITION_PENALTY,
    top_p: float = TOP_P,
    top_k: int = TOP_K,
    min_p: float = MIN_P,
    max_new_tokens: int = MAX_NEW_TOKENS,
) -> Iterator[str]:
    """Generate function for the chat demo, streaming from the local llama.cpp server."""
    # Assemble the message list: system prompt, prior turns, then the new user message.
    conversation = []
    conversation.append({"role": "system", "content": SYS_PROMPT})
    conversation += chat_history
    conversation.append({"role": "user", "content": message})

    # Build the prompt in Gemma 3's chat format: <start_of_turn>{role} ... <end_of_turn>.
    # Gemma has no dedicated system role, so the system prompt is folded into the
    # first user turn.
    prompt = ""
    system_text = ""
    for item in conversation:
        if item["role"] == "system":
            system_text = item["content"]
        elif item["role"] == "user":
            user_text = item["content"]
            if system_text:
                user_text = f"{system_text}\n\n{user_text}"
                system_text = ""
            prompt += f"<start_of_turn>user\n{user_text}<end_of_turn>\n"
        elif item["role"] == "assistant":
            prompt += f"<start_of_turn>model\n{item['content']}<end_of_turn>\n"
    prompt += "<start_of_turn>model\n"

    # Request body for llama.cpp's native /completion endpoint.
    payload = {
        "prompt": prompt,
        "stream": True,
        "n_predict": max_new_tokens,  # native llama.cpp name for the max-tokens limit
        "temperature": temperature,
        "repeat_penalty": repetition_penalty,
        "top_p": top_p,
        "top_k": top_k,
        "min_p": min_p,
        "stop": ["<end_of_turn>"],
    }

    try:
        with requests.post(f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=60) as response:
            response.raise_for_status()

            # The server streams server-sent events: each non-empty line is
            # `data: {...}` with the newly generated text in the "content" field.
            outputs = []
            for line in response.iter_lines():
                if line:
                    decoded_line = line.decode("utf-8")
                    if decoded_line.startswith("data: "):
                        decoded_line = decoded_line[6:]
                    try:
                        json_data = json.loads(decoded_line)
                        text = json_data.get("content", "")
                        if text:
                            outputs.append(text)
                            yield "".join(outputs)
                    except json.JSONDecodeError:
                        print(f"JSONDecodeError: {decoded_line}")
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        yield f"Error: {e}"
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        yield f"Error: {e}"


css_file_path = Path(__file__).parent / "app.css"
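
# Defensive fallback (an assumption, not part of the original design): skip the
# custom CSS if app.css is not shipped next to this file, rather than letting
# gr.Blocks fail at startup; css_paths accepts None.
if not css_file_path.exists():
    css_file_path = None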

temperature_slider = gr.Slider(
    minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"]
)
top_p_slider = gr.Slider(
    minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"]
)
top_k_slider = gr.Slider(
    minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"]
)
min_p_slider = gr.Slider(
    minimum=0, maximum=1.0, value=MIN_P, step=0.01, label="Min P", elem_classes=["gr_accordion_element"]
)
repetition_penalty_slider = gr.Slider(
    minimum=0,
    maximum=2.0,
    value=REPETITION_PENALTY,
    step=0.05,
    label="Repetition Penalty",
    elem_classes=["gr_accordion_element"],
)
max_new_tokens_slider = gr.Slider(
    minimum=1,
    maximum=2000,
    value=MAX_NEW_TOKENS,
    step=1,
    label="Max New Tokens",
    elem_classes=["gr_accordion_element"],
)
chat_interface_accordion = gr.Accordion(label="Advanced Settings", open=False)

with gr.Blocks(fill_height=True, css_paths=css_file_path, theme=theme, title=TITLE) as demo:
    gr.HTML(f"<h2>{TITLE}</h2>", elem_classes=["gr_title"])
    gr.HTML(DESCRIPTION)
    chat_interface = gr.ChatInterface(
        fn=generate,
        examples=[
            ["Explain the concept of quantum computing to someone with no background in physics or computer science."],
            ["What is OpenShift?"],
            ["What's the importance of low latency inference?"],
            ["Help me boost productivity habits."],
        ],
        example_labels=[
            "Explain quantum computing",
            "What is OpenShift?",
            "Importance of low latency inference",
            "Boosting productivity habits",
        ],
        cache_examples=False,
        type="messages",
        additional_inputs=[
            temperature_slider,
            repetition_penalty_slider,
            top_p_slider,
            top_k_slider,
            min_p_slider,
            max_new_tokens_slider,
        ],
        additional_inputs_accordion=chat_interface_accordion,
    )

if __name__ == "__main__":
    demo.queue().launch()