Spaces:

osman
/

Uyghur_TTS_Demo

Sleeping

App Files Files Community

osman committed on Oct 1

Commit

4a41719

1 Parent(s): dd86377

updating the app files

Browse files

Files changed (2) hide show

app.py +138 -69
utils.py +337 -0

app.py CHANGED Viewed

@@ -1,77 +1,146 @@
 import gradio as gr
-from transformers import pipeline
 import torch
-# ----------------------------
-# Load TTS model
-# ----------------------------
-synthesiser = pipeline(
-    "text-to-speech",
-    "osman/uyghur_arabic_script_tts",
-    torch_dtype=torch.float16
-)
-# ----------------------------
-# TTS function
-# ----------------------------
-def tts_fn(text):
-    if not text.strip():
-        return None
-    speech = synthesiser(text)
-    return speech["sampling_rate"], speech["audio"][0]
-# ----------------------------
-# Example sentences
-# ----------------------------
 examples = [
-    ["شاھمات، ئىككى كىشى ئوتتۇرىسىدا ئوينىلىدىغان، چوڭقۇر ئىستراتېگىيە، ئىنچىكە تاكتىكا ۋە يىراقنى كۆرەرلىككە ئەھمىيەت بېرىدىغان بىر خىل ئەقلىي ئويۇن."],
-    ["ئۇ پەقەت بىر ئويۇنلا بولۇپ قالماستىن، بەلكى ئىلمىي تەپەككۈر، سەنئەت ۋە رىقابەت روھىنى ئۆزىدە مۇجەسسەملىگەن بىر مەدەنىيەت ھادىسىسىدۇر."],
-     ["ئەسكەرتىش: مەزكۇر ئوبزوردىكى كۆز قاراشلار ئاپتورنىڭ ئۆزىگە تەۋە بولۇپ، رادىيومىزغا ۋەكىللىك قىلمايدۇ."],
-     ["پىروگرامما تەپسىلاتىنى ئاۋاز ئۇلىنىشىدىن ئاڭلاڭ"],
-     ["ئانا يۇرتۇڭ ئامان بولسا، رەڭگىرويۇڭ سامان بولماس"]
 ]
-# ----------------------------
-# Minimal CSS for RTL text
-# ----------------------------
-rtl_css = """
-textarea, .examples td button {
-    direction: rtl;
-    text-align: right;
-    font-family: 'Noto Sans Arabic', sans-serif;
-    font-size: 1.2em;
-}
-"""
-# ----------------------------
-# Build Gradio interface
-# ----------------------------
-with gr.Blocks(css=rtl_css) as demo:
-    gr.Markdown("## 🎙️ Uyghur Text-to-Speech Demo")
-    gr.Markdown("Enter Uyghur text below and click **Generate** to synthesize speech.")
-    with gr.Row():
-        with gr.Column(scale=2):
-            text_input = gr.Textbox(
-                lines=5,
-                placeholder="ئۇيغۇرچە تېكىست كىرگۈزۈڭ...",
-                label="📝 Input Text"
-            )
-            submit_btn = gr.Button("🎵 Generate Speech")
-        with gr.Column(scale=1):
-            audio_output = gr.Audio(type="numpy", label="🔊 Generated Speech", format="wav")
-    submit_btn.click(tts_fn, inputs=text_input, outputs=audio_output)
-    gr.Markdown("### 📚 Example Sentences")
-    gr.Examples(examples=examples, inputs=text_input)
-    gr.Markdown("*Powered by Hugging Face Transformers | Model: osman/uyghur_arabic_script_tts*")
-# ----------------------------
-# Launch app
-# ----------------------------
-# if __name__ == "__main__":
-demo.launch(server_name="0.0.0.0", server_port=7860, share=True)

+"""
+Uyghur Text-to-Speech Application
+Main application file for the Gradio interface.
+"""
 import gradio as gr
+from transformers import VitsModel, AutoTokenizer
 import torch
+import soundfile as sf
+import os
+from huggingface_hub import login
+# Import Uyghur text processing utilities
+from utils import preprocess_uyghur_text
+# Authenticate with the Hugging Face Hub at import time, but only when the
+# HF_TOKEN environment variable is set and non-empty; otherwise no login happens.
+if os.environ.get("HF_TOKEN"):
+    login(token=os.environ["HF_TOKEN"])
+# Available TTS voices: display name (offered in the model dropdown) mapped to
+# the Hub repository id that is passed to from_pretrained().
+MODEL_OPTIONS = {
+    "Muhsin": "osman/uyghur_arabic_script_tts",
+}
+# Module-level caches keyed by the display name from MODEL_OPTIONS; they are
+# filled by load_model_and_tokenizer() so a model is not reloaded on every request.
+model_cache = {}
+tokenizer_cache = {}
+def load_model_and_tokenizer(model_name):
+    """
+    Load model and tokenizer with caching to avoid reloading.
+    Both objects are loaded before either cache is written, so a failed
+    tokenizer load cannot leave a model cached without its tokenizer.
+    Args:
+        model_name (str): Name of the model from MODEL_OPTIONS.
+    Returns:
+        tuple: (model, tokenizer)
+    Raises:
+        KeyError: If model_name is not a key of MODEL_OPTIONS.
+    """
+    if model_name not in model_cache or model_name not in tokenizer_cache:
+        repo_id = MODEL_OPTIONS[model_name]
+        model = VitsModel.from_pretrained(repo_id)
+        tokenizer = AutoTokenizer.from_pretrained(repo_id)
+        # Publish to the caches only after both loads succeeded
+        model_cache[model_name] = model
+        tokenizer_cache[model_name] = tokenizer
+    return model_cache[model_name], tokenizer_cache[model_name]
+def text_to_speech(text, model_name):
+    """
+    Convert input text to speech using the selected TTS model.
+    Args:
+        text (str): Input text to convert to speech.
+        model_name (str): Name of the TTS model to use.
+    Returns:
+        bytes | None: Audio data in WAV format, or None when the input (or the
+            preprocessed text) is empty or whitespace-only, so there is nothing to synthesize.
+    """
+    import io  # local import: only needed for the in-memory WAV buffer below
+    # Nothing to synthesize for missing or blank input
+    if not text or not text.strip():
+        return None
+    # Load the selected model and tokenizer
+    model, tokenizer = load_model_and_tokenizer(model_name)
+    # Preprocess the text
+    processed_text = preprocess_uyghur_text(text)
+    print(f"Processed text: {processed_text}")
+    # Preprocessing replaces unsupported characters with spaces, so the text
+    # can become blank even when the raw input was not
+    if not processed_text.strip():
+        return None
+    # Tokenize input text
+    inputs = tokenizer(processed_text, return_tensors="pt")
+    # Generate speech waveform
+    with torch.no_grad():
+        output = model(**inputs).waveform
+    # Convert waveform to numpy array and ensure correct shape
+    audio_data = output.squeeze().numpy()
+    sample_rate = model.config.sampling_rate  # Get sample rate from model config
+    # Encode to WAV in memory. A fixed "output.wav" in the working directory would be
+    # shared by all requests, so concurrent calls could overwrite or delete each
+    # other's file; a per-call buffer avoids that and needs no cleanup.
+    wav_buffer = io.BytesIO()
+    sf.write(wav_buffer, audio_data, sample_rate, format="WAV")
+    return wav_buffer.getvalue()
+# Example rows for the Gradio Examples component. Each row is
+# [input text, model name]; the model name is a key of MODEL_OPTIONS and the
+# two columns line up with the Textbox and Dropdown inputs of the interface.
 examples = [
+    ["باشنىڭ يېرىمى ئاغرىسا، بىر داس ئىسسىق سۇغا ئىككى قولنى تەخمىنەن يېرىم سائەت ئەتراپىدا چىلاپ بەرسە، باش ئاغرىقى ئاستا-ئاستا يېنىكلەيدۇ.", "Muhsin"],
+    ["ئەسلىدىكى دوختۇر تور بېكىتى، ھازىرقى دوختۇرلار تور بېكىتى نامىدا كەڭ تورداشلارغا خىزمەت سۇنماقتا.",
+        "Muhsin"],
+    ["ھەممە ئادەم ئەركىن بولۇپ تۇغۇلىدۇ، ھەمدە ئىززەت-ھۆرمەت ۋە ھوقۇقتا باب-باراۋەر بولىدۇ.",
+        "Muhsin"],
+        ["ۋالىبول: ساغلاملىق، ھەمكارلىق ۋە ھاياتىي كۈچنىڭ مۇكەممەل بىرىكىشى", "Muhsin"],
+        # NOTE(review): disabled example; presumably switched off because it mixes a
+        # number range ("65-67") and Latin text ("cm") - confirm before re-enabling.
+        #["ئايلانمىسى: 65-67 سانتىمېتىر (cm).", "Muhsin"]
 ]
+# Build the Gradio interface: a right-to-left text box plus a model dropdown as
+# inputs, one audio player as output, and the example rows defined above.
+demo = gr.Interface(
+    fn=text_to_speech,
+    inputs=[
+        # Text input; the "rtl-text" class is targeted by the CSS below
+        gr.Textbox(
+            label="Enter text to convert to speech",
+            elem_classes="rtl-text",
+            elem_id="input-textbox",
+            lines=6,
+            max_lines=15
+        ),
+        # Model selector; choices are the display names from MODEL_OPTIONS
+        gr.Dropdown(
+            choices=list(MODEL_OPTIONS.keys()),
+            label="Select TTS Model",
+            value="Muhsin"
+        )
+    ],
+    # NOTE(review): text_to_speech returns WAV bytes rather than a file path; per the
+    # Gradio docs `type` describes how *input* audio is handed to the function, so it
+    # presumably has no effect on this output component - confirm.
+    outputs=gr.Audio(label="Generated Speech", type="filepath"),
+    title="Text-to-Speech with Uyghur Arabic Script TTS",
+    description="Uyghur TTS Text To Speech using osman/uyghur_arabic_script_tts model",
+    examples=examples,
+    # Custom CSS: loads the Noto Sans Arabic web font, makes the input textarea
+    # right-to-left, and right-aligns the first (text) column of the examples table
+    css="""
+        @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic&display=swap');
+        .rtl-text textarea {
+            direction: rtl;
+            width: 100%;
+            height: 200px;
+            font-size: 17px;
+            font-family: "Noto Sans Arabic" !important;
+        }
+        .table-wrap{
+            font-family: "Noto Sans Arabic" !important;
+        }
+        .table-wrap table tbody tr td:first-child {
+            direction: rtl;
+            text-align: right;
+        }
+    """
+)
+# Start the web server only when this file is run as a script, not on import
+if __name__ == "__main__":
+    demo.launch()

utils.py ADDED Viewed

	@@ -0,0 +1,337 @@

+"""
+Uyghur Text Processing Utilities
+Contains functions for processing Uyghur text, numbers, and script conversion.
+"""
+import unicodedata
+from pypinyin import pinyin, Style
+import re
+from umsc import UgMultiScriptConverter
+# Uyghur script converters from the umsc package. Going by the variable names,
+# 'UAS' is the Arabic-script code and 'ULS' the Latin-script code.
+# Arabic script -> Latin script (not referenced anywhere else in the visible code)
+ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
+# Latin script -> Arabic script; called by preprocess_uyghur_text()
+ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
+def number_to_uyghur_arabic_script(number_str):
+    """
+    Converts a number (integer, decimal, fraction, percentage, or ordinal) to its Uyghur
+    pronunciation in Arabic script. The integer and decimal parts may each have up to 9 digits.
+    The decimal part is pronounced as a whole number preceded by a fractional term, after
+    dropping trailing zeros (so '3.50' reads like '3.5' and '3.00' like '3.0').
+    Ordinals are marked by a trailing '_' or '-' and use the -ىنجى suffix, with special
+    forms when the number ends in a digit 1-9.
+    Args:
+        number_str (str): Number as a string (e.g., '123', '0.001', '1/4', '25%', '1968_', '123456789').
+            Commas and spaces are ignored.
+    Returns:
+        str: Uyghur pronunciation in Arabic script. Ordinals always end with exactly one
+            trailing space: the ordinal marker is consumed here and callers concatenate
+            the result directly with the following word. Unsupported inputs (more than
+            9 digits, or fractions whose parts are not single digits) come back as the
+            cleaned digit string without the ordinal or percent marker.
+    Raises:
+        ValueError: If a part that must be an integer cannot be parsed by int().
+    """
+    # Uyghur number words in Arabic script
+    digits = {
+        0: 'نۆل', 1: 'بىر', 2: 'ئىككى', 3: 'ئۈچ', 4: 'تۆت', 5: 'بەش',
+        6: 'ئالتە', 7: 'يەتتە', 8: 'سەككىز', 9: 'توققۇز'
+    }
+    ordinals = {
+        1: 'بىرىنجى', 2: 'ئىككىنجى', 3: 'ئۈچىنجى', 4: 'تۆتىنجى', 5: 'بەشىنجى',
+        6: 'ئالتىنجى', 7: 'يەتتىنجى', 8: 'سەككىزىنجى', 9: 'توققۇزىنجى'
+    }
+    tens = {
+        10: 'ئون', 20: 'يىگىرمە', 30: 'ئوتتۇز', 40: 'قىرىق', 50: 'ئەللىك',
+        60: 'ئاتمىش', 70: 'يەتمىش', 80: 'سەكسەن', 90: 'توقسان'
+    }
+    units = [
+        (1000000000, 'مىليارد'),  # billion
+        (1000000, 'مىليون'),  # million
+        (1000, 'مىڭ'),  # thousand
+        (100, 'يۈز')  # hundred
+    ]
+    # Fractional term by number of significant decimal places
+    fractions = {
+        1: 'ئوندا',  # tenths
+        2: 'يۈزدە',  # hundredths
+        3: 'مىڭدە',  # thousandths
+        4: 'ئون مىڭدە',  # ten-thousandths
+        5: 'يۈز مىڭدە',  # hundred-thousandths
+        6: 'مىليوندا',  # millionths
+        7: 'ئون مىليوندا',  # ten-millionths
+        8: 'يۈز مىليوندا',  # hundred-millionths
+        9: 'مىليارددا'  # billionths
+    }
+    # Convert a non-negative integer to words
+    def integer_to_words(num):
+        num = int(num)
+        if num == 0:
+            return digits[0]
+        result = []
+        # Handle large units (billion, million, thousand, hundred)
+        for value, unit_name in units:
+            if num >= value:
+                count = num // value
+                if count == 1:  # e.g., 100 → "يۈز", not "بىر يۈز"
+                    result.append(unit_name)
+                else:
+                    result.append(integer_to_words(count) + ' ' + unit_name)
+                num %= value
+        # Handle tens and ones (num is now below 100)
+        if num in tens:
+            result.append(tens[num])
+        elif num > 10:
+            result.append(tens[(num // 10) * 10] + ' ' + digits[num % 10])
+        elif num > 0:
+            result.append(digits[num])
+        return ' '.join(result)
+    # Clean the input (remove commas or spaces)
+    number_str = number_str.replace(',', '').replace(' ', '')
+    # Check for ordinal (ends with '_' or '-')
+    if number_str.endswith(('_', '-')):
+        number_str = number_str[:-1]  # Remove the ordinal marker
+        num = int(number_str)
+        if num > 999999999:
+            return number_str
+        # Convert to words, then turn the last word into its ordinal form
+        words = integer_to_words(num).split()
+        last_two = num % 100
+        last_digit = num % 10
+        if last_two in tens:
+            words[-1] = tens[last_two] + 'ىنجى'  # e.g., 60_ → ئاتمىشىنجى
+        elif last_digit in ordinals:
+            words[-1] = ordinals[last_digit]  # e.g., 25_ → يىگىرمە بەشىنجى
+        else:
+            words[-1] += 'ىنجى'  # 0 and round hundreds/thousands, e.g., 100_ → يۈزىنجى
+        # One trailing space in every case so the next word is not glued on
+        return ' '.join(words) + ' '
+    # Check for percentage
+    is_percentage = number_str.endswith('%')
+    if is_percentage:
+        number_str = number_str[:-1]  # Remove the % sign
+    # Check for fraction (only single-digit numerator and denominator are pronounced)
+    if '/' in number_str:
+        numerator, denominator = map(int, number_str.split('/'))
+        if numerator in digits and denominator in digits:
+            return f"{digits[denominator]}دە {digits[numerator]}"
+        return number_str
+    # Split into integer and decimal parts
+    parts = number_str.split('.')
+    integer_part = parts[0]
+    decimal_part = parts[1] if len(parts) > 1 else None
+    # Validate integer and decimal parts (up to 9 digits each)
+    if len(integer_part) > 9:
+        return number_str
+    if decimal_part and len(decimal_part) > 9:
+        return number_str
+    # Convert the integer part
+    pronunciation = integer_to_words(int(integer_part))
+    # Handle decimal part as a whole number with fractional term
+    if decimal_part:
+        pronunciation += ' پۈتۈن'
+        # Trailing zeros carry no value; an all-zero decimal part ('0', '00', ...)
+        # leaves nothing to pronounce after the separator word
+        significant = decimal_part.rstrip('0')
+        if significant:
+            fraction_term = fractions.get(len(significant), 'مىليارددا')
+            pronunciation += ' ' + fraction_term + \
+                ' ' + integer_to_words(int(significant))
+    # Append percentage term if applicable
+    if is_percentage:
+        pronunciation += ' پىرسەنت'
+    return pronunciation.strip()
+# One number token: fraction, decimal, ordinal (digits + '_' or '-', not followed by
+# another digit so that ranges like "65-67" stay two plain integers), or integer.
+_NUMBER_TOKEN_RE = re.compile(r'[0-9]+/[0-9]+|[0-9]+\.[0-9]+|[0-9]+[_-](?![0-9])|[0-9]+')
+def process_uyghur_text_with_numbers(text):
+    """
+    Processes a string containing Uyghur text and numbers, converting valid numbers to their
+    Uyghur pronunciation in Arabic script while preserving non-numeric text.
+    Numbers are recognized even when they touch punctuation, e.g. a sentence-final
+    "2023." or a range such as "65-67"; the punctuation itself is left in place.
+    Args:
+        text (str): Input string with Uyghur text and numbers (e.g., '1/4 كىلو 25% تەملىك').
+    Returns:
+        str: String with numbers converted to Uyghur pronunciation, non-numeric text preserved.
+    """
+    # Spell out the percent sign first so "25%" becomes "25" followed by the word
+    text = text.replace('%', ' پىرسەنت ')
+    def convert_token(match):
+        number_str = match.group(0)
+        try:
+            # Convert the number to Uyghur pronunciation
+            return number_to_uyghur_arabic_script(number_str)
+        except ValueError:
+            # If conversion fails, keep the original number string
+            return number_str
+    return _NUMBER_TOKEN_RE.sub(convert_token, text)
+def fix_pauctuations(batch):
+    """
+    Normalize Uyghur text and reduce it to the supported character set.
+    The text is lower-cased, NFKC-normalized, and a few look-alike letters are
+    mapped to the forms in the vocabulary. Every character outside the vocabulary
+    is then replaced: '.', '?' and '؟' by two spaces, anything else by one space.
+    Args:
+        batch (str): Input text to be normalized.
+    Returns:
+        str: Text consisting only of vocabulary characters and spaces.
+    """
+    normalized = unicodedata.normalize('NFKC', batch.lower())
+    # Map character variants onto the letters used in the vocabulary
+    for variant, canonical in (('ژ', 'ج'), ('ک', 'ك'), ('ی', 'ى'), ('ه', 'ە')):
+        normalized = normalized.replace(variant, canonical)
+    allowed = frozenset([" ", "ئ", "ا", "ب", "ت", "ج", "خ", "د", "ر", "ز", "س", "ش", "غ", "ف", "ق", "ك",
+                         "ل", "م", "ن", "و", "ى", "ي", "پ", "چ", "ڭ", "گ", "ھ", "ۆ", "ۇ", "ۈ", "ۋ", "ې", "ە"])
+    sentence_marks = frozenset(['.', '?', '؟'])
+    def replacement(char):
+        if char in allowed:
+            return char
+        # Sentence-final marks become two spaces, everything else one space
+        return '  ' if char in sentence_marks else ' '
+    return ''.join(map(replacement, normalized))
+def chinese_to_pinyin(mixed_text):
+    """
+    Replace every run of Chinese characters in a string with its Pinyin reading.
+    Pinyin is produced without tone marks, one syllable per character, joined by
+    single spaces. Text outside the CJK Unified Ideographs range is left as is.
+    Args:
+        mixed_text (str): Input string that may mix Chinese with other languages (e.g., English, Uyghur)
+    Returns:
+        str: The input with each Chinese run replaced by space-separated Pinyin syllables
+    """
+    def _run_to_pinyin(match):
+        # pinyin() returns one list per character; take the first reading of each
+        syllables = pinyin(match.group(0), style=Style.NORMAL)
+        return ' '.join(readings[0] for readings in syllables)
+    # \u4e00-\u9fff is the CJK Unified Ideographs block
+    return re.sub(r'[\u4e00-\u9fff]+', _run_to_pinyin, mixed_text)
+def preprocess_uyghur_text(text):
+    """
+    Run the full Uyghur text preprocessing pipeline.
+    The stages are applied in this fixed order, each receiving the previous result:
+    Chinese to Pinyin, Latin script to Arabic script, number expansion, and finally
+    punctuation fixing and normalization.
+    Args:
+        text (str): Input text in any supported format.
+    Returns:
+        str: Fully preprocessed Uyghur text in Arabic script.
+    """
+    stages = (
+        chinese_to_pinyin,  # Step 1: Convert Chinese to Pinyin
+        ug_latn_to_arab,  # Step 2: Convert Latin script to Arabic script
+        process_uyghur_text_with_numbers,  # Step 3: Process numbers
+        fix_pauctuations,  # Step 4: Fix punctuation and normalize
+    )
+    for stage in stages:
+        text = stage(text)
+    return text