osman committed on
Commit
4a41719
·
1 Parent(s): dd86377

updating the app files

Browse files
Files changed (2) hide show
  1. app.py +138 -69
  2. utils.py +337 -0
app.py CHANGED
@@ -1,77 +1,146 @@
 
 
 
 
 
1
  import gradio as gr
2
- from transformers import pipeline
3
  import torch
 
 
 
4
 
5
- # ----------------------------
6
- # Load TTS model
7
- # ----------------------------
8
- synthesiser = pipeline(
9
- "text-to-speech",
10
- "osman/uyghur_arabic_script_tts",
11
- torch_dtype=torch.float16
12
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- # ----------------------------
15
- # TTS function
16
- # ----------------------------
17
- def tts_fn(text):
18
- if not text.strip():
19
- return None
20
- speech = synthesiser(text)
21
- return speech["sampling_rate"], speech["audio"][0]
22
-
23
- # ----------------------------
24
- # Example sentences
25
- # ----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  examples = [
27
- ["شاھمات، ئىككى كىشى ئوتتۇرىسىدا ئوينىلىدىغان، چوڭقۇر ئىستراتېگىيە، ئىنچىكە تاكتىكا ۋە يىراقنى كۆرەرلىككە ئەھمىيەت بېرىدىغان بىر خىل ئەقلىي ئويۇن."],
28
- ["ئۇ پەقەت بىر ئويۇنلا بولۇپ قالماستىن، بەلكى ئىلمىي تەپەككۈر، سەنئەت ۋە رىقابەت روھىنى ئۆزىدە مۇجەسسەملىگەن بىر مەدەنىيەت ھادىسىسىدۇر."],
29
- ["ئەسكەرتىش: مەزكۇر ئوبزوردىكى كۆز قاراشلار ئاپتورنىڭ ئۆزىگە تەۋە بولۇپ، رادىيومىزغا ۋەكىللىك قىلمايدۇ."],
30
- ["پىروگرامما تەپسىلاتىنى ئاۋاز ئۇلىنىشىدىن ئاڭلاڭ"],
31
- ["ئانا يۇرتۇڭ ئامان بولسا، رەڭگىرويۇڭ سامان بولماس"]
 
 
32
  ]
33
 
34
- # ----------------------------
35
- # Minimal CSS for RTL text
36
- # ----------------------------
37
- rtl_css = """
38
- textarea, .examples td button {
39
- direction: rtl;
40
- text-align: right;
41
- font-family: 'Noto Sans Arabic', sans-serif;
42
- font-size: 1.2em;
43
- }
44
- """
45
 
46
- # ----------------------------
47
- # Build Gradio interface
48
- # ----------------------------
49
- with gr.Blocks(css=rtl_css) as demo:
50
- gr.Markdown("## 🎙️ Uyghur Text-to-Speech Demo")
51
- gr.Markdown("Enter Uyghur text below and click **Generate** to synthesize speech.")
52
-
53
- with gr.Row():
54
- with gr.Column(scale=2):
55
- text_input = gr.Textbox(
56
- lines=5,
57
- placeholder="ئۇيغۇرچە تېكىست كىرگۈزۈڭ...",
58
- label="📝 Input Text"
59
- )
60
- submit_btn = gr.Button("🎵 Generate Speech")
61
-
62
- with gr.Column(scale=1):
63
- audio_output = gr.Audio(type="numpy", label="🔊 Generated Speech", format="wav")
64
-
65
- submit_btn.click(tts_fn, inputs=text_input, outputs=audio_output)
66
-
67
- gr.Markdown("### 📚 Example Sentences")
68
- gr.Examples(examples=examples, inputs=text_input)
69
-
70
- gr.Markdown("*Powered by Hugging Face Transformers | Model: osman/uyghur_arabic_script_tts*")
71
-
72
- # ----------------------------
73
- # Launch app
74
- # ----------------------------
75
- # if __name__ == "__main__":
76
-
77
- demo.launch(server_name="0.0.0.0", server_port=7860, share=True)
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Uyghur Text-to-Speech Application
3
+ Main application file for the Gradio interface.
4
+ """
5
+
6
  import gradio as gr
7
+ from transformers import VitsModel, AutoTokenizer
8
  import torch
9
+ import soundfile as sf
10
+ import os
11
+ from huggingface_hub import login
12
 
13
+ # Import Uyghur text processing utilities
14
+ from utils import preprocess_uyghur_text
15
+
16
+ # Login to Hugging Face if token is provided
17
+ if os.environ.get("HF_TOKEN"):
18
+ login(token=os.environ["HF_TOKEN"])
19
+
20
+
21
+ # Dictionary of available TTS models
22
+ MODEL_OPTIONS = {
23
+ "Muhsin": "osman/uyghur_arabic_script_tts",
24
+ }
25
+
26
+ # Cache for loaded models and tokenizers
27
+ model_cache = {}
28
+ tokenizer_cache = {}
29
+
30
+
31
def load_model_and_tokenizer(model_name):
    """
    Load model and tokenizer with caching to avoid reloading.

    Both objects are loaded before either cache is written. If the
    tokenizer fails to load, no model is left cached without its
    tokenizer, so a later call retries the load instead of failing with a
    KeyError on ``tokenizer_cache``.

    Args:
        model_name (str): Name of the model from MODEL_OPTIONS.

    Returns:
        tuple: (model, tokenizer)

    Raises:
        KeyError: If ``model_name`` is not a key of MODEL_OPTIONS.
    """
    if model_name not in model_cache or model_name not in tokenizer_cache:
        repo_id = MODEL_OPTIONS[model_name]
        # Load into locals first; only publish to the caches once both
        # loads have succeeded.
        model = VitsModel.from_pretrained(repo_id)
        tokenizer = AutoTokenizer.from_pretrained(repo_id)
        model_cache[model_name] = model
        tokenizer_cache[model_name] = tokenizer
    return model_cache[model_name], tokenizer_cache[model_name]
47
+
48
+
49
def text_to_speech(text, model_name):
    """
    Convert input text to speech using the selected TTS model.

    The WAV data is encoded into an in-memory buffer rather than a fixed
    ``output.wav`` on disk, so concurrent requests cannot overwrite each
    other's audio and nothing is left behind if synthesis fails.

    Args:
        text (str): Input text to convert to speech.
        model_name (str): Name of the TTS model to use.

    Returns:
        bytes | None: Audio data in WAV format, or None when ``text`` is
        empty or contains only whitespace.
    """
    import io  # local import: only needed for the in-memory WAV buffer

    # Nothing to synthesize; return no audio instead of running the model
    # on an empty token sequence.
    if not text or not text.strip():
        return None

    # Load the selected model and tokenizer
    model, tokenizer = load_model_and_tokenizer(model_name)

    # Preprocess the text
    processed_text = preprocess_uyghur_text(text)
    print(f"Processed text: {processed_text}")

    # Tokenize input text
    inputs = tokenizer(processed_text, return_tensors="pt")

    # Generate speech waveform
    with torch.no_grad():
        output = model(**inputs).waveform

    # Convert waveform to numpy array and drop singleton dimensions
    audio_data = output.squeeze().numpy()
    sample_rate = model.config.sampling_rate  # Get sample rate from model config

    # Encode as WAV in memory. The format must be given explicitly because
    # there is no file extension to infer it from.
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, audio_data, sample_rate, format="WAV")
    return wav_buffer.getvalue()
90
+
91
+
92
+ # Define examples for Gradio Examples component
93
  examples = [
94
+ ["باشنىڭ يېرىمى ئاغرىسا، بىر داس ئىسسىق سۇغا ئىككى قولنى تەخمىنەن يېرىم سائەت ئەتراپىدا چىلاپ بەرسە، باش ئاغرىقى ئاستا-ئاستا يېنىكلەيدۇ.", "Muhsin"],
95
+ ["ئەسلىدىكى دوختۇر تور بېكىتى، ھازىرقى دوختۇرلار تور بېكىتى نامىدا كەڭ تورداشلارغا خىزمەت سۇنماقتا.",
96
+ "Muhsin"],
97
+ ["ھەممە ئادەم ئەركىن بولۇپ تۇغۇلىدۇ، ھەمدە ئىززەت-ھۆرمەت ۋە ھوقۇقتا باب-باراۋەر بولىدۇ.",
98
+ "Muhsin"],
99
+ ["ۋالىبول: ساغلاملىق، ھەمكارلىق ۋە ھاياتىي كۈچنىڭ مۇكەممەل بىرىكىشى", "Muhsin"],
100
+ #["ئايلانمىسى: 65-67 سانتىمېتىر (cm).", "Muhsin"]
101
  ]
102
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
+ # Create Gradio interface with model selection, RTL text input, and examples
105
+ demo = gr.Interface(
106
+ fn=text_to_speech,
107
+ inputs=[
108
+ gr.Textbox(
109
+ label="Enter text to convert to speech",
110
+ elem_classes="rtl-text",
111
+ elem_id="input-textbox",
112
+ lines=6,
113
+ max_lines=15
114
+ ),
115
+ gr.Dropdown(
116
+ choices=list(MODEL_OPTIONS.keys()),
117
+ label="Select TTS Model",
118
+ value="Muhsin"
119
+ )
120
+ ],
121
+ outputs=gr.Audio(label="Generated Speech", type="filepath"),
122
+ title="Text-to-Speech with Uyghur Arabic Script TTS",
123
+ description="Uyghur TTS Text To Speech using osman/uyghur_arabic_script_tts model",
124
+ examples=examples,
125
+ css="""
126
+ @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic&display=swap');
127
+ .rtl-text textarea {
128
+ direction: rtl;
129
+ width: 100%;
130
+ height: 200px;
131
+ font-size: 17px;
132
+ font-family: "Noto Sans Arabic" !important;
133
+ }
134
+ .table-wrap{
135
+ font-family: "Noto Sans Arabic" !important;
136
+ }
137
+ .table-wrap table tbody tr td:first-child {
138
+ direction: rtl;
139
+ text-align: right;
140
+ }
141
+ """
142
+ )
143
+
144
+
145
+ if __name__ == "__main__":
146
+ demo.launch()
utils.py ADDED
@@ -0,0 +1,337 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Uyghur Text Processing Utilities
3
+ Contains functions for processing Uyghur text, numbers, and script conversion.
4
+ """
5
+
6
+ import unicodedata
7
+ from pypinyin import pinyin, Style
8
+ import re
9
+ from umsc import UgMultiScriptConverter
10
+
11
+ # Initialize uyghur script converter
12
+ ug_arab_to_latn = UgMultiScriptConverter('UAS', 'ULS')
13
+ ug_latn_to_arab = UgMultiScriptConverter('ULS', 'UAS')
14
+
15
+
16
def number_to_uyghur_arabic_script(number_str):
    """
    Convert a number string to its Uyghur pronunciation in Arabic script.

    Supports integers, decimals, fractions, percentages and ordinals, with
    up to 9 digits in the integer part and up to 9 digits in the decimal
    part. The decimal part is pronounced as a whole number preceded by a
    fractional term (tenths, hundredths, ...). Ordinals are marked by a
    trailing '_' or '-' and use the -ىنجى suffix, with special forms for
    single digits.

    Args:
        number_str (str): Number as a string (e.g., '123', '0.001', '1/4',
            '25%', '1968_', '123456789'). Commas and spaces are ignored.

    Returns:
        str: Uyghur pronunciation in Arabic script. Inputs outside the
        supported range (more than 9 integer or decimal digits, fractions
        whose parts are not single digits) are returned unconverted.
        NOTE: multi-word ordinals ending in a tens word or a unit digit
        are returned with a trailing space, while single-digit ordinals
        and ordinals ending in a round hundred/thousand/... are not; this
        existing behavior is preserved.

    Raises:
        ValueError: If a numeric part cannot be parsed by ``int``
            (e.g. an empty integer part such as '.5').
    """
    # Uyghur number words in Arabic script
    digits = {
        0: 'نۆل', 1: 'بىر', 2: 'ئىككى', 3: 'ئۈچ', 4: 'تۆت', 5: 'بەش',
        6: 'ئالتە', 7: 'يەتتە', 8: 'سەككىز', 9: 'توققۇز'
    }
    ordinals = {
        1: 'بىرىنجى', 2: 'ئىككىنجى', 3: 'ئۈچىنجى', 4: 'تۆتىنجى', 5: 'بەشىنجى',
        6: 'ئالتىنجى', 7: 'يەتتىنجى', 8: 'سەككىزىنجى', 9: 'توققۇزىنجى'
    }
    tens = {
        10: 'ئون', 20: 'يىگىرمە', 30: 'ئوتتۇز', 40: 'قىرىق', 50: 'ئەللىك',
        60: 'ئاتمىش', 70: 'يەتمىش', 80: 'سەكسەن', 90: 'توقسان'
    }
    units = [
        (1000000000, 'مىليارد'),  # billion
        (1000000, 'مىليون'),  # million
        (1000, 'مىڭ'),  # thousand
        (100, 'يۈز')  # hundred
    ]
    fractions = {
        1: 'ئوندا',  # tenths
        2: 'يۈزدە',  # hundredths
        3: 'مىڭدە',  # thousandths
        4: 'ئون مىڭدە',  # ten-thousandths
        5: 'يۈز مىڭدە',  # hundred-thousandths
        6: 'مىليوندا',  # millionths
        7: 'ئون مىليوندا',  # ten-millionths
        8: 'يۈز مىليوندا',  # hundred-millionths
        9: 'مىليارددا'  # billionths
    }

    # Convert a non-negative integer to words
    def integer_to_words(num):
        num = int(num)
        if num == 0:
            return digits[0]

        result = []

        # Handle large units (billion, million, thousand, hundred)
        for value, unit_name in units:
            if num >= value:
                count = num // value
                if count == 1:  # e.g., 100 → "يۈز", not "بىر يۈز"
                    result.append(unit_name)
                else:
                    result.append(integer_to_words(count) + ' ' + unit_name)
                num %= value

        # Handle tens and ones (num is now below 100)
        if num >= 10:
            ten, one = divmod(num, 10)
            words = tens[ten * 10]
            if one:
                words += ' ' + digits[one]
            result.append(words)
        elif num > 0:
            result.append(digits[num])

        return ' '.join(result)

    # Clean the input (remove commas or spaces)
    number_str = number_str.replace(',', '').replace(' ', '')

    # Check for ordinal (ends with '_' or '-')
    if number_str.endswith(('_', '-')):
        number_str = number_str[:-1]  # Remove the ordinal marker
        num = int(number_str)
        if num > 999999999:
            return number_str
        if num in ordinals:  # Use special forms for single-digit ordinals
            return ordinals[num]

        # Convert to words and modify the last word for ordinal
        words = integer_to_words(num).split()
        last_num = num % 100  # Last two digits decide which word gets the suffix
        if last_num in tens:
            words[-1] = tens[last_num] + 'ىنجى '  # e.g., 60_ → ئاتمىشىنجى
        else:
            last_digit = num % 10
            if last_digit in ordinals:
                # Replace last digit with ordinal form
                words[-1] = ordinals[last_digit] + ' '
            elif last_digit == 0:
                # Round hundreds/thousands/...: suffix the unit word itself
                words[-1] += 'ىنجى'
        return ' '.join(words)

    # Check for percentage
    is_percentage = number_str.endswith('%')
    if is_percentage:
        number_str = number_str[:-1]  # Remove the % sign

    # Check for fraction (only single-digit numerator and denominator)
    if '/' in number_str:
        numerator, denominator = map(int, number_str.split('/'))
        if numerator in digits and denominator in digits:
            return f"{digits[denominator]}دە {digits[numerator]}"
        else:
            return number_str

    # Split into integer and decimal parts
    parts = number_str.split('.')
    integer_part = parts[0]
    decimal_part = parts[1] if len(parts) > 1 else None

    # Validate integer part (up to 9 digits)
    if len(integer_part) > 9:
        return number_str

    # Validate decimal part (up to 9 digits)
    if decimal_part and len(decimal_part) > 9:
        return number_str

    # Convert the integer part
    pronunciation = integer_to_words(int(integer_part))

    # Handle decimal part as a whole number with fractional term
    if decimal_part:
        pronunciation += ' پۈتۈن'
        # Trailing zeros carry no value. If nothing is left ('0', '00',
        # ...), there is no fractional amount to pronounce; without this
        # check '1.00' would reach int('') and raise.
        significant = decimal_part.rstrip('0')
        if significant:
            # Fallback term is for beyond 9 digits
            fraction_term = fractions.get(len(significant), 'مىليارددا')
            pronunciation += ' ' + fraction_term + \
                ' ' + integer_to_words(int(significant))

    # Append percentage term if applicable
    if is_percentage:
        pronunciation += ' پىرسەنت'

    return pronunciation.strip()
168
+
169
+
170
def process_uyghur_text_with_numbers(text):
    """
    Replace numbers in Uyghur text with their pronunciation in Arabic script.

    The text is scanned for maximal runs of digits and the symbols
    '/', '.', '_' and '-'. A run that is a well-formed fraction ('1/4'),
    ordinal ('5-' or '5_'), decimal ('3.14') or integer is converted as a
    whole. A run that is not well-formed as a whole (e.g. the range
    '65-67', or a number followed by a sentence-ending period such as
    '3.14.') has each number inside it converted individually and its
    other symbols kept. Previously such runs were passed through as raw
    digits.

    Converted ordinals are always followed by a space so they do not fuse
    with the next word (e.g. '5-' directly followed by a noun).

    Args:
        text (str): Input string with Uyghur text and numbers
            (e.g., '1/4 كىلو 25% تەملىك').

    Returns:
        str: String with numbers converted to Uyghur pronunciation,
        non-numeric text preserved.
    """
    # '%' becomes the spoken word up front, so no '%' can appear in a
    # number run afterwards.
    text = text.replace('%', ' پىرسەنت ')

    ordinal_markers = ('_', '-')

    def convert_token(token):
        """Convert one well-formed number token; keep it raw on failure."""
        try:
            converted = number_to_uyghur_arabic_script(token)
        except ValueError:
            return token
        # Keep ordinals separated from whatever text follows them.
        if token.endswith(ordinal_markers) and not converted.endswith(' '):
            converted += ' '
        return converted

    def is_well_formed(run):
        """Return True if the whole run is a single fraction/ordinal/decimal/integer."""
        if run.count('/') == 1:
            numerator, denominator = run.split('/')
            return numerator.isdigit() and denominator.isdigit()
        if run.endswith(ordinal_markers):
            return run[:-1].isdigit()
        if run.count('.') == 1:
            whole, frac = run.split('.')
            return whole.isdigit() and frac.isdigit()
        return run.isdigit()

    # Tokens recognised inside a run that is not well-formed as a whole:
    # an ordinal (marker not followed by another digit, so '65-67' is a
    # range, not an ordinal), a decimal, or a plain integer.
    embedded_number = re.compile(r'[0-9]+[_-](?![0-9])|[0-9]+\.[0-9]+|[0-9]+')

    def convert_run(match):
        run = match.group(0)
        if is_well_formed(run):
            return convert_token(run)
        return embedded_number.sub(lambda m: convert_token(m.group(0)), run)

    # Every character outside a number run is left exactly as it is.
    return re.sub(r'[0-9/._\-]+', convert_run, text)
250
+
251
+
252
def fix_pauctuations(batch):
    """
    Normalize and clean Uyghur text by fixing punctuation and character variants.

    The text is lower-cased, NFKC-normalized, and four variant letters are
    mapped to the letters used in the vocabulary. Every character that is
    not in the vocabulary is then replaced by whitespace: sentence
    terminators ('.', '?', '؟') become two spaces, as the original inline
    comment documents, and any other character becomes one space.

    Args:
        batch (str): Input text to be normalized.

    Returns:
        str: Normalized text containing only vocabulary characters and spaces.
    """
    batch = batch.lower()
    batch = unicodedata.normalize('NFKC', batch)

    # Replace Uyghur character variants
    batch = batch.replace('ژ', 'ج')
    batch = batch.replace('ک', 'ك')
    batch = batch.replace('ی', 'ى')
    batch = batch.replace('ه', 'ە')

    # A set gives constant-time membership tests in the loop below.
    vocab = frozenset([
        " ", "ئ", "ا", "ب", "ت", "ج", "خ", "د", "ر", "ز", "س", "ش", "غ", "ف", "ق", "ك",
        "ل", "م", "ن", "و", "ى", "ي", "پ", "چ", "ڭ", "گ", "ھ", "ۆ", "ۇ", "ۈ", "ۋ", "ې", "ە",
    ])
    sentence_terminators = {'.', '?', '؟'}

    # Process each character in the batch
    result = []
    for char in batch:
        if char in vocab:
            result.append(char)
        elif char in sentence_terminators:
            result.append('  ')  # Replace sentence terminator with two spaces
        else:
            # Replace other non-vocab characters with one space
            result.append(' ')

    # Join the result into a string
    return ''.join(result)
287
+
288
+
289
def chinese_to_pinyin(mixed_text):
    """
    Replace Chinese characters in a mixed-language string with toneless Pinyin.

    Each maximal run of characters in the range U+4E00..U+9FFF is replaced
    by its Pinyin syllables (no tone marks) joined with single spaces.
    Everything outside those runs is returned unchanged.

    Args:
        mixed_text (str): Input string that may mix Chinese characters with
            other text (e.g., English, Uyghur).

    Returns:
        str: The input with every Chinese run converted to Pinyin.
    """
    def _run_to_pinyin(match):
        # pinyin() yields one list of candidate readings per character;
        # the first candidate of each is used.
        readings = pinyin(match.group(0), style=Style.NORMAL)
        return ' '.join([reading[0] for reading in readings])

    return re.sub(r'[\u4e00-\u9fff]+', _run_to_pinyin, mixed_text)
312
+
313
+
314
def preprocess_uyghur_text(text):
    """
    Run the complete preprocessing pipeline for Uyghur text.

    The stages are applied in a fixed order, each receiving the previous
    stage's output: Chinese to Pinyin, Latin script to Arabic script,
    number expansion, then punctuation fixing and normalization.

    Args:
        text (str): Input text in any supported format.

    Returns:
        str: Fully preprocessed Uyghur text in Arabic script.
    """
    stages = (
        chinese_to_pinyin,                 # Step 1: Convert Chinese to Pinyin
        ug_latn_to_arab,                   # Step 2: Convert Latin script to Arabic script
        process_uyghur_text_with_numbers,  # Step 3: Process numbers
        fix_pauctuations,                  # Step 4: Fix punctuation and normalize
    )
    for stage in stages:
        text = stage(text)
    return text