import json
import subprocess
from collections.abc import Iterator
from datetime import datetime
from pathlib import Path
from typing import Dict, List

import gradio as gr
import requests
from huggingface_hub import hf_hub_download

from themes.research_monochrome import theme

today_date = datetime.today().strftime("%B %-d, %Y")  # note: %-d (no zero padding) is platform-dependent (works on Linux)

SYS_PROMPT = f"""Today's Date: {today_date}.
You are Gemma3, developed by Google. You are a helpful AI assistant."""
TITLE = "Gemma3 1B Instruct IQ4_XS from a local GGUF server using the BPP library."

DESCRIPTION = """
<p>Gemma3 1B Instruct is an open-weight LLM supporting a 128K context window. This demo uses only a 2K context.
</p>
<p>The BPP library implements matrix multiplication with far fewer multiplications.
</p>
<p><b><u>It will run much faster if you duplicate this Space for your own use.</u></b>
</p>
<p>Original Space by TobDeBers, GGUF by Bartowski. We are not sure it is the ideal choice, but we noticed that IQ4_XS runs faster.
</p>
<p><b>Running on CPU, please be patient!</b>
</p>
"""

# llama.cpp server endpoint and default sampling parameters.
LLAMA_CPP_SERVER = "http://127.0.0.1:8081"
MAX_NEW_TOKENS = 512
TEMPERATURE = 1.0
TOP_P = 0.95
TOP_K = 64
REPETITION_PENALTY = 1.05
MIN_P = 0.0

# Download the quantized GGUF weights from the Hugging Face Hub into the working directory.
gguf_path = hf_hub_download(
    repo_id="bartowski/google_gemma-3-1b-it-GGUF",
    filename="google_gemma-3-1b-it-IQ4_XS.gguf",
    local_dir=".",
)

# Make the bundled llama-server binary executable and launch it: CPU only
# (-ngl 0), 2K context, 8 threads, listening on port 8081.
subprocess.run(["chmod", "+x", "llama-server"], check=True)
command = [
    "./llama-server", "-m", gguf_path, "-ngl", "0", "--temp", "0.0",
    "-c", "2048", "-t", "8", "--port", "8081",
]
process = subprocess.Popen(command)
print(f"Llama-server process started with PID {process.pid}")


def generate(
    message: str,
    chat_history: List[Dict],
    temperature: float = TEMPERATURE,
    repetition_penalty: float = REPETITION_PENALTY,
    top_p: float = TOP_P,
    top_k: int = TOP_K,
    min_p: float = MIN_P,
    max_new_tokens: int = MAX_NEW_TOKENS,
) -> Iterator[str]:
    """Generate function for the chat demo, streaming from the local llama.cpp server."""
    # Assemble the message list: system prompt, prior turns, then the new user message.
    conversation = []
    conversation.append({"role": "system", "content": SYS_PROMPT})
    conversation += chat_history
    conversation.append({"role": "user", "content": message})

    # Build the prompt in Gemma 3's chat format: <start_of_turn>{role} ... <end_of_turn>.
    # Gemma has no dedicated system role, so the system prompt is folded into the
    # first user turn.
    prompt = ""
    system_text = ""
    for item in conversation:
        if item["role"] == "system":
            system_text = item["content"]
        elif item["role"] == "user":
            user_text = item["content"]
            if system_text:
                user_text = f"{system_text}\n\n{user_text}"
                system_text = ""
            prompt += f"<start_of_turn>user\n{user_text}<end_of_turn>\n"
        elif item["role"] == "assistant":
            prompt += f"<start_of_turn>model\n{item['content']}<end_of_turn>\n"
    prompt += "<start_of_turn>model\n"

    # Request body for llama.cpp's native /completion endpoint.
    payload = {
        "prompt": prompt,
        "stream": True,
        "n_predict": max_new_tokens,  # native llama.cpp name for the max-tokens limit
        "temperature": temperature,
        "repeat_penalty": repetition_penalty,
        "top_p": top_p,
        "top_k": top_k,
        "min_p": min_p,
        "stop": ["<end_of_turn>"],
    }

    try:
        with requests.post(f"{LLAMA_CPP_SERVER}/completion", json=payload, stream=True, timeout=60) as response:
            response.raise_for_status()

            # The server streams server-sent events: each non-empty line is
            # `data: {...}` with the newly generated text in the "content" field.
            outputs = []
            for line in response.iter_lines():
                if line:
                    decoded_line = line.decode("utf-8")
                    if decoded_line.startswith("data: "):
                        decoded_line = decoded_line[6:]
                    try:
                        json_data = json.loads(decoded_line)
                        text = json_data.get("content", "")
                        if text:
                            outputs.append(text)
                            yield "".join(outputs)
                    except json.JSONDecodeError:
                        print(f"JSONDecodeError: {decoded_line}")
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        yield f"Error: {e}"
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        yield f"Error: {e}"


css_file_path = Path(__file__).parent / "app.css"
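
# Defensive fallback (an assumption, not part of the original design): skip the
# custom CSS if app.css is not shipped next to this file, rather than letting
# gr.Blocks fail at startup; css_paths accepts None.
if not css_file_path.exists():
    css_file_path = None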

temperature_slider = gr.Slider(
    minimum=0, maximum=1.0, value=TEMPERATURE, step=0.1, label="Temperature", elem_classes=["gr_accordion_element"]
)
top_p_slider = gr.Slider(
    minimum=0, maximum=1.0, value=TOP_P, step=0.05, label="Top P", elem_classes=["gr_accordion_element"]
)
top_k_slider = gr.Slider(
    minimum=0, maximum=100, value=TOP_K, step=1, label="Top K", elem_classes=["gr_accordion_element"]
)
min_p_slider = gr.Slider(
    minimum=0, maximum=1.0, value=MIN_P, step=0.01, label="Min P", elem_classes=["gr_accordion_element"]
)
repetition_penalty_slider = gr.Slider(
    minimum=0,
    maximum=2.0,
    value=REPETITION_PENALTY,
    step=0.05,
    label="Repetition Penalty",
    elem_classes=["gr_accordion_element"],
)
max_new_tokens_slider = gr.Slider(
    minimum=1,
    maximum=2000,
    value=MAX_NEW_TOKENS,
    step=1,
    label="Max New Tokens",
    elem_classes=["gr_accordion_element"],
)
chat_interface_accordion = gr.Accordion(label="Advanced Settings", open=False)

with gr.Blocks(fill_height=True, css_paths=css_file_path, theme=theme, title=TITLE) as demo:
    gr.HTML(f"<h2>{TITLE}</h2>", elem_classes=["gr_title"])
    gr.HTML(DESCRIPTION)
    chat_interface = gr.ChatInterface(
        fn=generate,
        examples=[
            ["Explain the concept of quantum computing to someone with no background in physics or computer science."],
            ["What is OpenShift?"],
            ["What's the importance of low latency inference?"],
            ["Help me boost productivity habits."],
        ],
        example_labels=[
            "Explain quantum computing",
            "What is OpenShift?",
            "Importance of low latency inference",
            "Boosting productivity habits",
        ],
        cache_examples=False,
        type="messages",
        additional_inputs=[
            temperature_slider,
            repetition_penalty_slider,
            top_p_slider,
            top_k_slider,
            min_p_slider,
            max_new_tokens_slider,
        ],
        additional_inputs_accordion=chat_interface_accordion,
    )

if __name__ == "__main__":
    demo.queue().launch()