import os
import time
import torch
import gradio as gr
from huggingface_hub import hf_hub_download
import threading
import queue
import multiprocessing
# First check if GPU is available for maximum speed
has_gpu = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
print(f"GPU available: {has_gpu} - {gpu_name}")
# Download model files
def get_model_path(repo_id, filename):
    print(f"Obtaining {filename}...")
    return hf_hub_download(repo_id=repo_id, filename=filename)

base_model_path = get_model_path(
    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
    "articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
    "johnpaulbin/articulate-V1-Q8_0-GGUF",
    "articulate-V1-q8_0.gguf"
)
# Set up optimized environment variables for llama-cpp-python
os.environ["LLAMA_CUBLAS"] = "1" if has_gpu else "0"
os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL
# For CPU: request AVX/AVX2 instruction sets if supported by the hardware
os.environ["LLAMA_AVX"] = "1"
os.environ["LLAMA_AVX2"] = "1"
os.environ["LLAMA_F16"] = "1" # Use FP16 where available
# Determine the most optimized backend
# The package always installs under the `llama_cpp` module name; GPU support is decided
# by how the wheel was built, so a single import covers both cases.
from llama_cpp import Llama

LlamaClass = Llama
if has_gpu:
    n_gpu_layers = -1  # Offload all layers to the GPU
    print("Using llama-cpp-python with GPU acceleration")
else:
    n_gpu_layers = 0  # Keep everything on the CPU
    print("Using CPU-only llama-cpp-python")
# Cache for translations
translation_cache = {}
MAX_CACHE_SIZE = 1000
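# Cache keys are "<direction>:<text>", e.g. translation_cache["English to Spanish:Hello"] = "Hola".
# Python dicts preserve insertion order, so popping next(iter(...)) later evicts the oldest entry.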
# Pre-compute common translations
COMMON_PHRASES = {
    "English to Spanish": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Spanish to English": [
        "Hola", "Gracias", "Buenos días", "¿Cómo estás?", "¿Cómo te llamas?",
        "No entiendo", "Por favor", "Lo siento", "Sí", "No", "Dónde está"
    ],
    "English to Korean": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Korean to English": [
        "안녕하세요", "감사합니다", "좋은 아침입니다", "어떻게 지내세요?", "이름이 뭐예요?",
        "이해가 안 돼요", "제발", "죄송합니다", "네", "아니요", "어디에 있어요"
    ]
}
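# COMMON_PHRASES is only consumed by ModelWorker._prewarm_model() below; the first few entries
# of each list are translated at startup so their results land in translation_cache before the
# first user request arrives.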
# Background worker for model loading and inference
class ModelWorker:
    def __init__(self):
        self.model = None
        self.request_queue = queue.Queue()
        self.response_queue = queue.Queue()
        self.worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
        self.worker_thread.start()

    def _worker_loop(self):
        # Initialize model in the worker thread
        print("Initializing model in background thread...")
        # CPU optimization settings
        cpu_count = multiprocessing.cpu_count()
        optimal_threads = max(4, cpu_count - 2)  # Leave two cores free
        # Initialize with the most optimized settings
        start_time = time.time()
        self.model = LlamaClass(
            model_path=base_model_path,
            lora_path=adapter_path,
            n_ctx=512,                  # Context window; 512 tokens covers short translations
            n_threads=optimal_threads,  # Optimized thread count
            n_batch=1024,               # Large batch for prompt processing
            use_mmap=True,              # Efficient memory mapping
            n_gpu_layers=n_gpu_layers,  # GPU acceleration if available
            seed=42,                    # Consistent results
            verbose=False,              # Reduce overhead
            main_gpu=0,                 # Primary GPU
            tensor_split=None,          # Auto-distribute across GPUs if multiple
            rope_freq_base=10000,       # Default RoPE attention parameters
            rope_freq_scale=1.0,
        )
        print(f"Model loaded in {time.time() - start_time:.2f} seconds")
        # Pre-warm the model with common phrases
        self._prewarm_model()
        # Process requests
        while True:
            request = self.request_queue.get()
            if request is None:  # Shutdown signal
                break
            direction, text, callback_id = request
            try:
                result = self._process_translation(direction, text)
                self.response_queue.put((callback_id, result))
            except Exception as e:
                print(f"Error in worker thread: {e}")
                self.response_queue.put((callback_id, f"Error: {str(e)}"))

    def _prewarm_model(self):
        """Pre-compute common translations to warm up the model"""
        print("Pre-warming model with common phrases...")
        start = time.time()
        for direction, phrases in COMMON_PHRASES.items():
            for phrase in phrases[:3]:  # Just do a few to warm up
                self._process_translation(direction, phrase)
        print(f"Model pre-warming completed in {time.time() - start:.2f} seconds")

    def _process_translation(self, direction, text):
        # Skip empty inputs
        if not text or not text.strip():
            return ""
        # Check cache first for faster response
        cache_key = f"{direction}:{text}"
        if cache_key in translation_cache:
            return translation_cache[cache_key]
        # Start timing for performance tracking
        start_time = time.time()
        # Map language directions
        lang_map = {
            "English to Spanish": ("ENGLISH", "SPANISH"),
            "Spanish to English": ("SPANISH", "ENGLISH"),
            "Korean to English": ("KOREAN", "ENGLISH"),
            "English to Korean": ("ENGLISH", "KOREAN")
        }
        if direction not in lang_map:
            return "Invalid direction"
        source_lang, target_lang = lang_map[direction]
        # Compact prompt format: [SOURCE]text[TARGET]
        prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
        # Rough output budget: use the input word count as a proxy for token count
        input_tokens = len(text.split())
        max_tokens = min(200, max(50, int(input_tokens * 1.5)))
        # Generate translation with optimized settings
        response = self.model.create_completion(
            prompt,
            max_tokens=max_tokens,
            temperature=0.0,     # Deterministic decoding
            top_k=1,             # Only consider the most likely token
            top_p=1.0,           # No nucleus sampling
            repeat_penalty=1.0,  # No repeat penalty
            stream=False         # Get the complete response at once
        )
        translation = response['choices'][0]['text'].strip()
        # Cache result, evicting the oldest entry once the cache is full
        if len(translation_cache) >= MAX_CACHE_SIZE:
            translation_cache.pop(next(iter(translation_cache)))
        translation_cache[cache_key] = translation
        # Log performance
        inference_time = time.time() - start_time
        tokens_per_second = (input_tokens + len(translation.split())) / inference_time
        print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
        return translation

    def request_translation(self, direction, text, callback_id):
        """Queue a translation request"""
        self.request_queue.put((direction, text, callback_id))

# Create worker instance
worker = ModelWorker()
# Counter for request IDs
next_request_id = 0
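# next_request_id is a plain module-level counter; if Gradio ever handles requests on multiple
# threads at once, the increment below is not atomic, which is fine for a demo but would need a
# lock in production.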
# Gradio interface functions
def translate(direction, text, progress=gr.Progress()):
    """Queue a translation request and wait for the result"""
    global next_request_id
    # Check cache first for an immediate response
    cache_key = f"{direction}:{text}"
    if cache_key in translation_cache:
        return translation_cache[cache_key]
    # For very short inputs, fall back to a cached phrase that starts with the input
    # (an approximate prefix match, traded for lower latency)
    if len(text) < 20:
        for cached_key in translation_cache:
            cached_dir, cached_text = cached_key.split(":", 1)
            if cached_dir == direction and cached_text.lower().startswith(text.lower()):
                return translation_cache[cached_key]
    # Generate a unique request ID
    request_id = next_request_id
    next_request_id += 1
    # Queue the request
    worker.request_translation(direction, text, request_id)
    # Wait for the response (with progress feedback)
    progress(0, desc="Translating...")
    max_wait = 30  # Maximum wait time in seconds
    start_time = time.time()
    while time.time() - start_time < max_wait:
        progress((time.time() - start_time) / max_wait)
        # Drain the response queue looking for our request ID
        # (responses belonging to other requests are discarded here)
        try:
            while not worker.response_queue.empty():
                resp_id, result = worker.response_queue.get_nowait()
                if resp_id == request_id:
                    progress(1.0)
                    return result
        except queue.Empty:
            pass
        # Small sleep to prevent CPU hogging
        time.sleep(0.05)
    progress(1.0)
    return "Translation timed out. Please try again."
# Create Gradio interface
with gr.Blocks(title="Ultra-Fast Translation App") as iface:
    gr.Markdown(f"""
    ## Ultra-Fast Translation App
    Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only'}
    """)

    with gr.Row():
        direction = gr.Dropdown(
            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
            label="Translation Direction",
            value="English to Spanish"
        )

    with gr.Row():
        input_text = gr.Textbox(lines=5, label="Input Text", placeholder="Enter text to translate...")
        output_text = gr.Textbox(lines=5, label="Translation")

    # Add translate button
    translate_btn = gr.Button("Translate")
    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)

    # Optimization options
    with gr.Accordion("Advanced Options", open=False):
        gr.Markdown("""
        ### Performance Tips
        - Short sentences translate faster than long paragraphs
        - Common phrases may be cached for instant results
        - First translation might be slower as the model warms up
        """)

    # Add examples with preloaded common phrases
    gr.Examples(
        examples=[
            ["English to Spanish", "Hello, how are you today?"],
            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
            ["English to Korean", "The weather is nice today."],
            ["Korean to English", "안녕하세요, 만나서 반갑습니다."]
        ],
        inputs=[direction, input_text],
        fn=translate,
        outputs=output_text
    )
# Launch with optimized settings
iface.launch(
    debug=False,
    show_error=True,
    share=False,  # Don't share publicly by default
    quiet=True,   # Reduce console output
    server_name="0.0.0.0",
    server_port=7860
)