import os import time import torch import gradio as gr from huggingface_hub import hf_hub_download import threading import queue import multiprocessing # First check if GPU is available for maximum speed has_gpu = torch.cuda.is_available() gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU" print(f"GPU available: {has_gpu} - {gpu_name}") # Download model files def get_model_path(repo_id, filename): print(f"Obtaining {filename}...") return hf_hub_download(repo_id=repo_id, filename=filename) base_model_path = get_model_path( "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF", "articulate-11-expspanish-base-merged-q8_0.gguf" ) adapter_path = get_model_path( "johnpaulbin/articulate-V1-Q8_0-GGUF", "articulate-V1-q8_0.gguf" ) # Set up optimized environment variables for llama-cpp-python os.environ["LLAMA_CUBLAS"] = "1" if has_gpu else "0" os.environ["LLAMA_CLBLAST"] = "0" # Disable OpenCL # For CPU: Use AVX2/AVX512/AVX-VNNI instruction sets if available os.environ["LLAMA_AVX"] = "1" os.environ["LLAMA_AVX2"] = "1" os.environ["LLAMA_F16"] = "1" # Use FP16 where available # Determine the most optimized backend if has_gpu: try: from llama_cpp_python.llama_cpp.llama import Llama as GPULlama LlamaClass = GPULlama print("Using GPU-accelerated llama-cpp-python") n_gpu_layers = -1 # Use all layers on GPU except ImportError: from llama_cpp import Llama LlamaClass = Llama print("Using standard llama-cpp-python with GPU acceleration") n_gpu_layers = -1 # Use all layers on GPU else: from llama_cpp import Llama LlamaClass = Llama print("Using CPU-only llama-cpp-python") n_gpu_layers = 0 # Cache for translations translation_cache = {} MAX_CACHE_SIZE = 1000 # Pre-compute common translations COMMON_PHRASES = { "English to Spanish": [ "Hello", "Thank you", "Good morning", "How are you?", "What's your name?", "I don't understand", "Please", "Sorry", "Yes", "No", "Where is" ], "Spanish to English": [ "Hola", "Gracias", "Buenos días", "¿Cómo estás?", "¿Cómo te llamas?", "No entiendo", "Por favor", "Lo siento", "Sí", "No", "Dónde está" ], "English to Korean": [ "Hello", "Thank you", "Good morning", "How are you?", "What's your name?", "I don't understand", "Please", "Sorry", "Yes", "No", "Where is" ], "Korean to English": [ "안녕하세요", "감사합니다", "좋은 아침입니다", "어떻게 지내세요?", "이름이 뭐예요?", "이해가 안 돼요", "제발", "죄송합니다", "네", "아니요", "어디에 있어요" ] } # Background worker for model loading and inference class ModelWorker: def __init__(self): self.model = None self.request_queue = queue.Queue() self.response_queue = queue.Queue() self.worker_thread = threading.Thread(target=self._worker_loop, daemon=True) self.worker_thread.start() def _worker_loop(self): # Initialize model in the worker thread print("Initializing model in background thread...") # CPU optimization settings cpu_count = multiprocessing.cpu_count() optimal_threads = max(4, cpu_count - 2) # Leave two cores free # Initialize with the most optimized settings start_time = time.time() self.model = LlamaClass( model_path=base_model_path, lora_path=adapter_path, n_ctx=512, # Larger context for longer translations n_threads=optimal_threads, # Optimized thread count n_batch=1024, # Large batch for parallel processing use_mmap=True, # Efficient memory mapping n_gpu_layers=n_gpu_layers, # GPU acceleration if available seed=42, # Consistent results verbose=False, # Reduce overhead main_gpu=0, # Primary GPU tensor_split=None, # Auto-distribute across GPUs if multiple rope_freq_base=10000, # Optimized attention parameters rope_freq_scale=1.0, ) print(f"Model loaded in {time.time() - start_time:.2f} seconds") # Pre-warm the model with common phrases self._prewarm_model() # Process requests while True: try: request = self.request_queue.get() if request is None: # Shutdown signal break direction, text, callback_id = request result = self._process_translation(direction, text) self.response_queue.put((callback_id, result)) except Exception as e: print(f"Error in worker thread: {e}") self.response_queue.put((callback_id, f"Error: {str(e)}")) def _prewarm_model(self): """Pre-compute common translations to warm up the model""" print("Pre-warming model with common phrases...") start = time.time() for direction, phrases in COMMON_PHRASES.items(): for phrase in phrases[:3]: # Just do a few to warm up self._process_translation(direction, phrase) print(f"Model pre-warming completed in {time.time() - start:.2f} seconds") def _process_translation(self, direction, text): # Skip empty inputs if not text or not text.strip(): return "" # Check cache first for faster response cache_key = f"{direction}:{text}" if cache_key in translation_cache: return translation_cache[cache_key] # Start timing for performance tracking start_time = time.time() # Map language directions lang_map = { "English to Spanish": ("ENGLISH", "SPANISH"), "Spanish to English": ("SPANISH", "ENGLISH"), "Korean to English": ("KOREAN", "ENGLISH"), "English to Korean": ("ENGLISH", "KOREAN") } if direction not in lang_map: return "Invalid direction" source_lang, target_lang = lang_map[direction] # Efficient prompt format prompt = f"[{source_lang}]{text.strip()}[{target_lang}]" # Estimate appropriate token length based on input input_tokens = len(text.split()) max_tokens = min(200, max(50, int(input_tokens * 1.5))) # Generate translation with optimized settings response = self.model.create_completion( prompt, max_tokens=max_tokens, temperature=0.0, # Deterministic for faster inference top_k=1, # Only consider most likely token top_p=1.0, # No sampling repeat_penalty=1.0, # No repeat penalty stream=False # Get complete response at once ) translation = response['choices'][0]['text'].strip() # Cache result if len(translation_cache) >= MAX_CACHE_SIZE: # Remove oldest entry (first key) translation_cache.pop(next(iter(translation_cache))) translation_cache[cache_key] = translation # Log performance inference_time = time.time() - start_time tokens_per_second = (input_tokens + len(translation.split())) / inference_time print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)") return translation def request_translation(self, direction, text, callback_id): """Queue a translation request""" self.request_queue.put((direction, text, callback_id)) # Create worker instance worker = ModelWorker() # Counter for request IDs next_request_id = 0 # Gradio interface functions def translate(direction, text, progress=gr.Progress()): """Queue translation request and wait for result""" global next_request_id # Check cache first for immediate response cache_key = f"{direction}:{text}" if cache_key in translation_cache: return translation_cache[cache_key] # If input is very short, check if we have a similar cached phrase if len(text) < 20: for cached_key in translation_cache: cached_dir, cached_text = cached_key.split(":", 1) if cached_dir == direction and cached_text.lower().startswith(text.lower()): return translation_cache[cached_key] # Generate unique request ID request_id = next_request_id next_request_id += 1 # Queue the request worker.request_translation(direction, text, request_id) # Wait for the response (with progress feedback) progress(0, desc="Translating...") max_wait = 30 # Maximum wait time in seconds start_time = time.time() while time.time() - start_time < max_wait: progress((time.time() - start_time) / max_wait) # Check for our response try: while not worker.response_queue.empty(): resp_id, result = worker.response_queue.get_nowait() if resp_id == request_id: progress(1.0) return result except queue.Empty: pass # Small sleep to prevent CPU hogging time.sleep(0.05) progress(1.0) return "Translation timed out. Please try again." # Create Gradio interface with gr.Blocks(title="Ultra-Fast Translation App") as iface: gr.Markdown(f""" ## Ultra-Fast Translation App Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only'} """) with gr.Row(): direction = gr.Dropdown( choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"], label="Translation Direction", value="English to Spanish" ) with gr.Row(): input_text = gr.Textbox(lines=5, label="Input Text", placeholder="Enter text to translate...") output_text = gr.Textbox(lines=5, label="Translation") # Add translate button translate_btn = gr.Button("Translate") translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text) # Optimization options with gr.Accordion("Advanced Options", open=False): gr.Markdown(""" ### Performance Tips - Short sentences translate faster than long paragraphs - Common phrases may be cached for instant results - First translation might be slower as the model warms up """) # Add examples with preloaded common phrases gr.Examples( examples=[ ["English to Spanish", "Hello, how are you today?"], ["Spanish to English", "Hola, ¿cómo estás hoy?"], ["English to Korean", "The weather is nice today."], ["Korean to English", "안녕하세요, 만나서 반갑습니다."] ], inputs=[direction, input_text], fn=translate, outputs=output_text ) # Launch with optimized settings iface.launch( debug=False, show_error=True, share=False, # Don't share publicly by default quiet=True, # Reduce console output server_name="0.0.0.0", server_port=7860 )