import os
import time
import torch
import gradio as gr
from huggingface_hub import hf_hub_download
import threading
import queue
import multiprocessing
# First check if GPU is available for maximum speed
has_gpu = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
print(f"GPU available: {has_gpu} - {gpu_name}")
# Download model files
def get_model_path(repo_id, filename):
    print(f"Obtaining {filename}...")
    return hf_hub_download(repo_id=repo_id, filename=filename)

base_model_path = get_model_path(
    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF",
    "articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
    "johnpaulbin/articulate-V1-Q8_0-GGUF",
    "articulate-V1-q8_0.gguf"
)
# Set up optimized environment variables for llama-cpp-python
os.environ["LLAMA_CUBLAS"] = "1" if has_gpu else "0"
os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL
# For CPU: request AVX/AVX2 instruction sets if supported by the hardware
os.environ["LLAMA_AVX"] = "1"
os.environ["LLAMA_AVX2"] = "1"
os.environ["LLAMA_F16"] = "1" # Use FP16 where available
# Determine the most optimized backend
# The package always installs under the `llama_cpp` module name; GPU support is decided
# by how the wheel was built, so a single import covers both cases.
from llama_cpp import Llama

LlamaClass = Llama
if has_gpu:
    n_gpu_layers = -1  # Offload all layers to the GPU
    print("Using llama-cpp-python with GPU acceleration")
else:
    n_gpu_layers = 0  # Keep everything on the CPU
    print("Using CPU-only llama-cpp-python")
# Cache for translations
translation_cache = {}
MAX_CACHE_SIZE = 1000
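# Cache keys are "<direction>:<text>", e.g. translation_cache["English to Spanish:Hello"] = "Hola".
# Python dicts preserve insertion order, so popping next(iter(...)) later evicts the oldest entry.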
# Pre-compute common translations
COMMON_PHRASES = {
    "English to Spanish": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Spanish to English": [
        "Hola", "Gracias", "Buenos días", "¿Cómo estás?", "¿Cómo te llamas?",
        "No entiendo", "Por favor", "Lo siento", "Sí", "No", "Dónde está"
    ],
    "English to Korean": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Korean to English": [
        "안녕하세요", "감사합니다", "좋은 아침입니다", "어떻게 지내세요?", "이름이 뭐예요?",
        "이해가 안 돼요", "제발", "죄송합니다", "네", "아니요", "어디에 있어요"
    ]
}
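# COMMON_PHRASES is only consumed by ModelWorker._prewarm_model() below; the first few entries
# of each list are translated at startup so their results land in translation_cache before the
# first user request arrives.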
# Background worker for model loading and inference
class ModelWorker:
    def __init__(self):
        self.model = None
        self.request_queue = queue.Queue()
        self.response_queue = queue.Queue()
        self.worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
        self.worker_thread.start()

    def _worker_loop(self):
        # Initialize model in the worker thread
        print("Initializing model in background thread...")
        # CPU optimization settings
        cpu_count = multiprocessing.cpu_count()
        optimal_threads = max(4, cpu_count - 2)  # Leave two cores free
        # Initialize with the most optimized settings
        start_time = time.time()
        self.model = LlamaClass(
            model_path=base_model_path,
            lora_path=adapter_path,
            n_ctx=512,                  # Context window; 512 tokens covers short translations
            n_threads=optimal_threads,  # Optimized thread count
            n_batch=1024,               # Large batch for prompt processing
            use_mmap=True,              # Efficient memory mapping
            n_gpu_layers=n_gpu_layers,  # GPU acceleration if available
            seed=42,                    # Consistent results
            verbose=False,              # Reduce overhead
            main_gpu=0,                 # Primary GPU
            tensor_split=None,          # Auto-distribute across GPUs if multiple
            rope_freq_base=10000,       # Default RoPE attention parameters
            rope_freq_scale=1.0,
        )
        print(f"Model loaded in {time.time() - start_time:.2f} seconds")
        # Pre-warm the model with common phrases
        self._prewarm_model()
        # Process requests
        while True:
            request = self.request_queue.get()
            if request is None:  # Shutdown signal
                break
            direction, text, callback_id = request
            try:
                result = self._process_translation(direction, text)
                self.response_queue.put((callback_id, result))
            except Exception as e:
                print(f"Error in worker thread: {e}")
                self.response_queue.put((callback_id, f"Error: {str(e)}"))

    def _prewarm_model(self):
        """Pre-compute common translations to warm up the model"""
        print("Pre-warming model with common phrases...")
        start = time.time()
        for direction, phrases in COMMON_PHRASES.items():
            for phrase in phrases[:3]:  # Just do a few to warm up
                self._process_translation(direction, phrase)
        print(f"Model pre-warming completed in {time.time() - start:.2f} seconds")

    def _process_translation(self, direction, text):
        # Skip empty inputs
        if not text or not text.strip():
            return ""
        # Check cache first for faster response
        cache_key = f"{direction}:{text}"
        if cache_key in translation_cache:
            return translation_cache[cache_key]
        # Start timing for performance tracking
        start_time = time.time()
        # Map language directions
        lang_map = {
            "English to Spanish": ("ENGLISH", "SPANISH"),
            "Spanish to English": ("SPANISH", "ENGLISH"),
            "Korean to English": ("KOREAN", "ENGLISH"),
            "English to Korean": ("ENGLISH", "KOREAN")
        }
        if direction not in lang_map:
            return "Invalid direction"
        source_lang, target_lang = lang_map[direction]
        # Compact prompt format: [SOURCE]text[TARGET]
        prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
        # Rough output budget: use the input word count as a proxy for token count
        input_tokens = len(text.split())
        max_tokens = min(200, max(50, int(input_tokens * 1.5)))
        # Generate translation with optimized settings
        response = self.model.create_completion(
            prompt,
            max_tokens=max_tokens,
            temperature=0.0,     # Deterministic decoding
            top_k=1,             # Only consider the most likely token
            top_p=1.0,           # No nucleus sampling
            repeat_penalty=1.0,  # No repeat penalty
            stream=False         # Get the complete response at once
        )
        translation = response['choices'][0]['text'].strip()
        # Cache result, evicting the oldest entry once the cache is full
        if len(translation_cache) >= MAX_CACHE_SIZE:
            translation_cache.pop(next(iter(translation_cache)))
        translation_cache[cache_key] = translation
        # Log performance
        inference_time = time.time() - start_time
        tokens_per_second = (input_tokens + len(translation.split())) / inference_time
        print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
        return translation

    def request_translation(self, direction, text, callback_id):
        """Queue a translation request"""
        self.request_queue.put((direction, text, callback_id))

# Create worker instance
worker = ModelWorker()
# Counter for request IDs
next_request_id = 0
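# next_request_id is a plain module-level counter; if Gradio ever handles requests on multiple
# threads at once, the increment below is not atomic, which is fine for a demo but would need a
# lock in production.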
# Gradio interface functions
def translate(direction, text, progress=gr.Progress()):
    """Queue a translation request and wait for the result"""
    global next_request_id
    # Check cache first for an immediate response
    cache_key = f"{direction}:{text}"
    if cache_key in translation_cache:
        return translation_cache[cache_key]
    # For very short inputs, fall back to a cached phrase that starts with the input
    # (an approximate prefix match, traded for lower latency)
    if len(text) < 20:
        for cached_key in translation_cache:
            cached_dir, cached_text = cached_key.split(":", 1)
            if cached_dir == direction and cached_text.lower().startswith(text.lower()):
                return translation_cache[cached_key]
    # Generate a unique request ID
    request_id = next_request_id
    next_request_id += 1
    # Queue the request
    worker.request_translation(direction, text, request_id)
    # Wait for the response (with progress feedback)
    progress(0, desc="Translating...")
    max_wait = 30  # Maximum wait time in seconds
    start_time = time.time()
    while time.time() - start_time < max_wait:
        progress((time.time() - start_time) / max_wait)
        # Drain the response queue looking for our request ID
        # (responses belonging to other requests are discarded here)
        try:
            while not worker.response_queue.empty():
                resp_id, result = worker.response_queue.get_nowait()
                if resp_id == request_id:
                    progress(1.0)
                    return result
        except queue.Empty:
            pass
        # Small sleep to prevent CPU hogging
        time.sleep(0.05)
    progress(1.0)
    return "Translation timed out. Please try again."
# Create Gradio interface
with gr.Blocks(title="Ultra-Fast Translation App") as iface:
    gr.Markdown(f"""
    ## Ultra-Fast Translation App
    Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only'}
    """)

    with gr.Row():
        direction = gr.Dropdown(
            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
            label="Translation Direction",
            value="English to Spanish"
        )

    with gr.Row():
        input_text = gr.Textbox(lines=5, label="Input Text", placeholder="Enter text to translate...")
        output_text = gr.Textbox(lines=5, label="Translation")

    # Add translate button
    translate_btn = gr.Button("Translate")
    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)

    # Optimization options
    with gr.Accordion("Advanced Options", open=False):
        gr.Markdown("""
        ### Performance Tips
        - Short sentences translate faster than long paragraphs
        - Common phrases may be cached for instant results
        - First translation might be slower as the model warms up
        """)

    # Add examples with preloaded common phrases
    gr.Examples(
        examples=[
            ["English to Spanish", "Hello, how are you today?"],
            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
            ["English to Korean", "The weather is nice today."],
            ["Korean to English", "안녕하세요, 만나서 반갑습니다."]
        ],
        inputs=[direction, input_text],
        fn=translate,
        outputs=output_text
    )
# Launch with optimized settings
iface.launch(
    debug=False,
    show_error=True,
    share=False,  # Don't share publicly by default
    quiet=True,   # Reduce console output
    server_name="0.0.0.0",
    server_port=7860
)