Spaces:

johnpaulbin
/

googoo

Sleeping

File size: 11,530 Bytes

import os
import time
import torch
import gradio as gr
from huggingface_hub import hf_hub_download
import threading
import queue
import multiprocessing

# First check if GPU is available for maximum speed
has_gpu = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_gpu else "No GPU"
print(f"GPU available: {has_gpu} - {gpu_name}")

# Download model files
def get_model_path(repo_id, filename):
    print(f"Obtaining {filename}...")
    return hf_hub_download(repo_id=repo_id, filename=filename)

base_model_path = get_model_path(
    "johnpaulbin/articulate-11-expspanish-base-merged-Q8_0-GGUF", 
    "articulate-11-expspanish-base-merged-q8_0.gguf"
)
adapter_path = get_model_path(
    "johnpaulbin/articulate-V1-Q8_0-GGUF", 
    "articulate-V1-q8_0.gguf"
)

# Set up optimized environment variables for llama-cpp-python
os.environ["LLAMA_CUBLAS"] = "1" if has_gpu else "0"
os.environ["LLAMA_CLBLAST"] = "0"  # Disable OpenCL
# For CPU: Use AVX2/AVX512/AVX-VNNI instruction sets if available
os.environ["LLAMA_AVX"] = "1" 
os.environ["LLAMA_AVX2"] = "1"
os.environ["LLAMA_F16"] = "1"  # Use FP16 where available

# Determine the most optimized backend
if has_gpu:
    try:
        from llama_cpp_python.llama_cpp.llama import Llama as GPULlama
        LlamaClass = GPULlama
        print("Using GPU-accelerated llama-cpp-python")
        n_gpu_layers = -1  # Use all layers on GPU
    except ImportError:
        from llama_cpp import Llama
        LlamaClass = Llama
        print("Using standard llama-cpp-python with GPU acceleration")
        n_gpu_layers = -1  # Use all layers on GPU
else:
    from llama_cpp import Llama
    LlamaClass = Llama
    print("Using CPU-only llama-cpp-python")
    n_gpu_layers = 0

# Cache for translations
translation_cache = {}
MAX_CACHE_SIZE = 1000

# Pre-compute common translations
COMMON_PHRASES = {
    "English to Spanish": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Spanish to English": [
        "Hola", "Gracias", "Buenos días", "¿Cómo estás?", "¿Cómo te llamas?",
        "No entiendo", "Por favor", "Lo siento", "Sí", "No", "Dónde está"
    ],
    "English to Korean": [
        "Hello", "Thank you", "Good morning", "How are you?", "What's your name?",
        "I don't understand", "Please", "Sorry", "Yes", "No", "Where is"
    ],
    "Korean to English": [
        "안녕하세요", "감사합니다", "좋은 아침입니다", "어떻게 지내세요?", "이름이 뭐예요?",
        "이해가 안 돼요", "제발", "죄송합니다", "네", "아니요", "어디에 있어요"
    ]
}

# Background worker for model loading and inference
class ModelWorker:
    def __init__(self):
        self.model = None
        self.request_queue = queue.Queue()
        self.response_queue = queue.Queue()
        self.worker_thread = threading.Thread(target=self._worker_loop, daemon=True)
        self.worker_thread.start()
    
    def _worker_loop(self):
        # Initialize model in the worker thread
        print("Initializing model in background thread...")
        
        # CPU optimization settings
        cpu_count = multiprocessing.cpu_count()
        optimal_threads = max(4, cpu_count - 2)  # Leave two cores free
        
        # Initialize with the most optimized settings
        start_time = time.time()
        self.model = LlamaClass(
            model_path=base_model_path,
            lora_path=adapter_path,
            n_ctx=512,               # Larger context for longer translations
            n_threads=optimal_threads, # Optimized thread count
            n_batch=1024,              # Large batch for parallel processing
            use_mmap=True,             # Efficient memory mapping
            n_gpu_layers=n_gpu_layers, # GPU acceleration if available
            seed=42,                   # Consistent results
            verbose=False,             # Reduce overhead
            main_gpu=0,                # Primary GPU
            tensor_split=None,         # Auto-distribute across GPUs if multiple
            rope_freq_base=10000,      # Optimized attention parameters
            rope_freq_scale=1.0,
        )
        print(f"Model loaded in {time.time() - start_time:.2f} seconds")
        
        # Pre-warm the model with common phrases
        self._prewarm_model()
        
        # Process requests
        while True:
            try:
                request = self.request_queue.get()
                if request is None:  # Shutdown signal
                    break
                    
                direction, text, callback_id = request
                result = self._process_translation(direction, text)
                self.response_queue.put((callback_id, result))
            except Exception as e:
                print(f"Error in worker thread: {e}")
                self.response_queue.put((callback_id, f"Error: {str(e)}"))
    
    def _prewarm_model(self):
        """Pre-compute common translations to warm up the model"""
        print("Pre-warming model with common phrases...")
        start = time.time()
        for direction, phrases in COMMON_PHRASES.items():
            for phrase in phrases[:3]:  # Just do a few to warm up
                self._process_translation(direction, phrase)
        print(f"Model pre-warming completed in {time.time() - start:.2f} seconds")
    
    def _process_translation(self, direction, text):
        # Skip empty inputs
        if not text or not text.strip():
            return ""
            
        # Check cache first for faster response
        cache_key = f"{direction}:{text}"
        if cache_key in translation_cache:
            return translation_cache[cache_key]
        
        # Start timing for performance tracking
        start_time = time.time()
        
        # Map language directions
        lang_map = {
            "English to Spanish": ("ENGLISH", "SPANISH"),
            "Spanish to English": ("SPANISH", "ENGLISH"),
            "Korean to English": ("KOREAN", "ENGLISH"),
            "English to Korean": ("ENGLISH", "KOREAN")
        }
        
        if direction not in lang_map:
            return "Invalid direction"
        
        source_lang, target_lang = lang_map[direction]
        
        # Efficient prompt format
        prompt = f"[{source_lang}]{text.strip()}[{target_lang}]"
        
        # Estimate appropriate token length based on input
        input_tokens = len(text.split())
        max_tokens = min(200, max(50, int(input_tokens * 1.5)))
        
        # Generate translation with optimized settings
        response = self.model.create_completion(
            prompt,
            max_tokens=max_tokens,
            temperature=0.0,      # Deterministic for faster inference
            top_k=1,              # Only consider most likely token
            top_p=1.0,            # No sampling
            repeat_penalty=1.0,   # No repeat penalty
            stream=False          # Get complete response at once
        )
        
        translation = response['choices'][0]['text'].strip()
        
        # Cache result
        if len(translation_cache) >= MAX_CACHE_SIZE:
            # Remove oldest entry (first key)
            translation_cache.pop(next(iter(translation_cache)))
        translation_cache[cache_key] = translation
        
        # Log performance
        inference_time = time.time() - start_time
        tokens_per_second = (input_tokens + len(translation.split())) / inference_time
        print(f"Translation: {inference_time:.3f}s ({tokens_per_second:.1f} tokens/sec)")
        
        return translation
    
    def request_translation(self, direction, text, callback_id):
        """Queue a translation request"""
        self.request_queue.put((direction, text, callback_id))

# Create worker instance
worker = ModelWorker()

# Counter for request IDs
next_request_id = 0

# Gradio interface functions
def translate(direction, text, progress=gr.Progress()):
    """Queue translation request and wait for result"""
    global next_request_id
    
    # Check cache first for immediate response
    cache_key = f"{direction}:{text}"
    if cache_key in translation_cache:
        return translation_cache[cache_key]
    
    # If input is very short, check if we have a similar cached phrase
    if len(text) < 20:
        for cached_key in translation_cache:
            cached_dir, cached_text = cached_key.split(":", 1)
            if cached_dir == direction and cached_text.lower().startswith(text.lower()):
                return translation_cache[cached_key]
    
    # Generate unique request ID
    request_id = next_request_id
    next_request_id += 1
    
    # Queue the request
    worker.request_translation(direction, text, request_id)
    
    # Wait for the response (with progress feedback)
    progress(0, desc="Translating...")
    max_wait = 30  # Maximum wait time in seconds
    start_time = time.time()
    
    while time.time() - start_time < max_wait:
        progress((time.time() - start_time) / max_wait)
        
        # Check for our response
        try:
            while not worker.response_queue.empty():
                resp_id, result = worker.response_queue.get_nowait()
                if resp_id == request_id:
                    progress(1.0)
                    return result
        except queue.Empty:
            pass
        
        # Small sleep to prevent CPU hogging
        time.sleep(0.05)
    
    progress(1.0)
    return "Translation timed out. Please try again."

# Create Gradio interface
with gr.Blocks(title="Ultra-Fast Translation App") as iface:
    gr.Markdown(f"""
    ## Ultra-Fast Translation App
    Running on: {'GPU: ' + gpu_name if has_gpu else 'CPU only'}
    """)
    
    with gr.Row():
        direction = gr.Dropdown(
            choices=["English to Spanish", "Spanish to English", "Korean to English", "English to Korean"],
            label="Translation Direction",
            value="English to Spanish"
        )
    
    with gr.Row():
        input_text = gr.Textbox(lines=5, label="Input Text", placeholder="Enter text to translate...")
        output_text = gr.Textbox(lines=5, label="Translation")
    
    # Add translate button
    translate_btn = gr.Button("Translate")
    translate_btn.click(fn=translate, inputs=[direction, input_text], outputs=output_text)
    
    # Optimization options
    with gr.Accordion("Advanced Options", open=False):
        gr.Markdown("""
        ### Performance Tips
        - Short sentences translate faster than long paragraphs
        - Common phrases may be cached for instant results
        - First translation might be slower as the model warms up
        """)
    
    # Add examples with preloaded common phrases
    gr.Examples(
        examples=[
            ["English to Spanish", "Hello, how are you today?"],
            ["Spanish to English", "Hola, ¿cómo estás hoy?"],
            ["English to Korean", "The weather is nice today."],
            ["Korean to English", "안녕하세요, 만나서 반갑습니다."]
        ],
        inputs=[direction, input_text],
        fn=translate,
        outputs=output_text
    )

# Launch with optimized settings
iface.launch(
    debug=False, 
    show_error=True,
    share=False,       # Don't share publicly by default
    quiet=True,        # Reduce console output
    server_name="0.0.0.0", 
    server_port=7860
)