asasasaasasa committed
Commit 3c3ac9d · verified · 1 Parent(s): dee3f9b

Upload folder using huggingface_hub

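The commit message says the folder was pushed with huggingface_hub. For reference, a minimal sketch of such an upload (the repo id and repo type below are placeholders, not taken from this commit):

from huggingface_hub import HfApi

api = HfApi()  # picks up HF_TOKEN from the environment or a cached login
api.upload_folder(
    folder_path=".",                  # local project directory
    repo_id="user/repo",              # hypothetical repo id
    repo_type="space",                # assumed; this looks like a Streamlit Space
    commit_message="Upload folder using huggingface_hub",
)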
utils/__pycache__/chunking.cpython-311.pyc ADDED
Binary file (7.02 kB).

utils/__pycache__/file_readers.cpython-311.pyc ADDED
Binary file (2.18 kB).

utils/__pycache__/formatting.cpython-311.pyc ADDED
Binary file (1.29 kB).

utils/__pycache__/gemma_translation.cpython-311.pyc ADDED
Binary file (30.7 kB).

utils/__pycache__/readability_indices.cpython-311.pyc ADDED
Binary file (10.1 kB).

utils/__pycache__/text_processing.cpython-311.pyc ADDED
Binary file (586 Bytes).

utils/__pycache__/tilmash_translation.cpython-311.pyc ADDED
Binary file (24.1 kB).
utils/chunking.py ADDED
@@ -0,0 +1,170 @@
+ # utils/chunking.py
+
+ import logging
+ from pysbd import Segmenter
+ import re
+
+
+ def chunk_text_with_separators(text, tokenizer, max_tokens, lang):
+     """
+     Splits the input text into chunks with preserved separators, optimized for handling lists and tables.
+
+     Args:
+         text (str): The input text to be chunked.
+         tokenizer: Tokenizer object used to encode text into tokens.
+         max_tokens (int): Maximum number of tokens allowed per chunk.
+         lang (str): Language of the text, used for sentence segmentation.
+
+     Returns:
+         list: A list of tuples, each containing a chunk of text and its corresponding separator.
+     """
+     # Split text into sentences while preserving separators
+     sentences_with_seps = _split_technical_sentences(text, lang)
+     chunks = []
+     current_chunk = []
+     current_length = 0
+     current_separators = []
+
+     for sentence, sep in sentences_with_seps:
+         sentence_tokens = tokenizer.encode(sentence, add_special_tokens=False)
+         sentence_len = len(sentence_tokens)
+
+         if sentence_len == 0:
+             continue
+
+         # Handle special cases like lists and tables
+         if _is_list_item(sentence) or _is_table_header(sentence):
+             if current_chunk:
+                 # Finalize the current chunk before processing special items
+                 chunks.append((' '.join(current_chunk), ''.join(current_separators)))
+                 current_chunk = []
+                 current_length = 0
+                 current_separators = []
+
+             # Process list items as separate chunks
+             chunks.extend(_process_special_item(sentence, sep, tokenizer, max_tokens))
+             continue
+
+         # Add sentence to the current chunk if it fits
+         if current_length + sentence_len <= max_tokens:
+             current_chunk.append(sentence)
+             current_length += sentence_len
+             current_separators.append(sep)
+         else:
+             # Finalize the current chunk and start a new one
+             if current_chunk:
+                 chunks.append((' '.join(current_chunk), ''.join(current_separators)))
+             current_chunk = [sentence]
+             current_length = sentence_len
+             current_separators = [sep]
+
+     # Add any remaining text to the final chunk
+     if current_chunk:
+         chunks.append((' '.join(current_chunk), ''.join(current_separators)))
+
+     return chunks
+
+
+ def _split_technical_sentences(text, lang):
+     """Enhanced splitting for technical documents with lists and tables"""
+     # Handle numbered lists and bullet points
+     text = re.sub(r'(\n\s*\d+\.)', r'\n§§§\1', text)
+     # Handle colon-terminated headers
+     text = re.sub(r'(:\s*\n)', r'\1§§§', text)
+
+     sentences = []
+     separators = []
+
+     if lang == 'russian':
+         segmenter = Segmenter(language='ru', clean=False)
+         raw_sentences = segmenter.segment(text)
+     else:
+         raw_sentences = re.split(r'([.!?])(\s*)', text)
+
+     buffer = ''
+     current_sep = ''
+
+     for part in raw_sentences:
+         if '§§§' in part:
+             parts = part.split('§§§')
+             for p in parts[:-1]:
+                 if p.strip():
+                     sentences.append(p.strip())
+                     separators.append(current_sep)
+                     current_sep = ''
+             buffer = parts[-1]
+         else:
+             buffer += part
+
+         # Process buffer when we hit sentence boundaries
+         if lang == 'russian':
+             if buffer.strip() and any(buffer.endswith(c) for c in ['.', '!', '?', ':']):
+                 sentences.append(buffer.strip())
+                 separators.append(current_sep)
+                 buffer = ''
+                 current_sep = ''
+         else:
+             if re.search(r'[.!?:]$', buffer):
+                 sentences.append(buffer.strip())
+                 separators.append(current_sep)
+                 buffer = ''
+                 current_sep = ''
+
+     if buffer.strip():
+         sentences.append(buffer.strip())
+         separators.append(current_sep)
+
+     return list(zip(sentences, separators))
+
+
+ def _is_list_item(text):
+     return re.match(r'^\s*(\d+\.|\-|\*)\s', text)
+
+
+ def _is_table_header(text):
+     return re.search(r':\s*$', text) and re.search(r'[A-ZА-Я]{3,}', text)
+
+
+ def _process_special_item(text, separator, tokenizer, max_tokens):
+     """Process list items and table headers as atomic units"""
+     chunks = []
+     current_chunk = []
+     current_length = 0
+
+     sentences = re.split(r'(\n+)', text)
+     for sentence in sentences:
+         if not sentence.strip():
+             continue
+
+         tokens = tokenizer.encode(sentence, add_special_tokens=False)
+         token_count = len(tokens)
+
+         if token_count > max_tokens:
+             # Handle oversized items with careful splitting
+             parts = re.split(r'([,;])', sentence)
+             for part in parts:
+                 if not part.strip():
+                     continue
+                 part_tokens = tokenizer.encode(part, add_special_tokens=False)
+                 part_len = len(part_tokens)
+
+                 if current_length + part_len > max_tokens:
+                     chunks.append((' '.join(current_chunk), separator))
+                     current_chunk = [part]
+                     current_length = part_len
+                 else:
+                     current_chunk.append(part)
+                     current_length += part_len
+         else:
+             if current_length + token_count > max_tokens:
+                 chunks.append((' '.join(current_chunk), separator))
+                 current_chunk = [sentence]
+                 current_length = token_count
+             else:
+                 current_chunk.append(sentence)
+                 current_length += token_count
+
+     if current_chunk:
+         chunks.append((' '.join(current_chunk), separator))
+
+     return chunks
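For reference, a minimal usage sketch of chunk_text_with_separators; any tokenizer exposing encode(text, add_special_tokens=False) works, and the model name and input file below are only illustrative:

from transformers import AutoTokenizer
from utils.chunking import chunk_text_with_separators

tokenizer = AutoTokenizer.from_pretrained("issai/tilmash")  # or any HF tokenizer
chunks = chunk_text_with_separators(
    text=open("doc.txt", encoding="utf-8").read(),  # hypothetical input file
    tokenizer=tokenizer,
    max_tokens=460,   # e.g. ~0.9 of a 512-token pipeline limit
    lang="russian",   # 'russian' routes through pysbd; anything else uses the regex splitter
)
for chunk, sep in chunks:
    print(repr(chunk), repr(sep))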
utils/file_readers.py ADDED
@@ -0,0 +1,35 @@
+ # utils/file_readers.py
+
+ import docx
+ import PyPDF2
+
+ def read_txt(file_path):
+     with open(file_path, 'r', encoding='utf-8') as f:
+         return f.read()
+
+ def read_docx(file_path):
+     doc = docx.Document(file_path)
+     full_text = []
+     for para in doc.paragraphs:
+         full_text.append(para.text)
+     return '\n'.join(full_text)
+
+ def read_pdf(file_path):
+     text = ''
+     with open(file_path, 'rb') as f:
+         reader = PyPDF2.PdfReader(f)
+         for page in reader.pages:
+             page_text = page.extract_text()
+             if page_text:
+                 text += page_text
+     return text
+
+ def read_file(file_path):
+     if file_path.endswith('.txt'):
+         return read_txt(file_path)
+     elif file_path.endswith('.docx'):
+         return read_docx(file_path)
+     elif file_path.endswith('.pdf'):
+         return read_pdf(file_path)
+     else:
+         return ""
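read_file dispatches on the file extension and silently returns an empty string for unsupported types, so callers should check for that. A quick sketch (the path is hypothetical):

from utils.file_readers import read_file

text = read_file("report.docx")  # hypothetical path
if not text:
    print("Unsupported file type or empty document")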
utils/formatting.py ADDED
@@ -0,0 +1,33 @@
+ # utils/formatting.py
+
+ def color_code_index(index_name, value):
+     if index_name == "Flesch Reading Ease":
+         if value >= 90:
+             color = "green"
+         elif 60 <= value < 90:
+             color = "lightgreen"
+         elif 30 <= value < 60:
+             color = "orange"
+         else:
+             color = "red"
+     elif index_name == "Flesch-Kincaid Grade Level":
+         if value <= 5:
+             color = "green"
+         elif 6 <= value <= 10:
+             color = "lightgreen"
+         elif 11 <= value <= 15:
+             color = "orange"
+         else:
+             color = "red"
+     elif index_name in ["Gunning Fog Index", "SMOG Index"]:
+         if value <= 6:
+             color = "green"
+         elif 7 <= value <= 12:
+             color = "lightgreen"
+         elif 13 <= value <= 17:
+             color = "orange"
+         else:
+             color = "red"
+     else:
+         color = "black"
+     return f"<span style='color: {color};'>{value:.2f}</span>"
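color_code_index returns an HTML span, so in Streamlit it has to be rendered with unsafe_allow_html. A short sketch:

import streamlit as st
from utils.formatting import color_code_index

html = color_code_index("Flesch Reading Ease", 72.5)  # 72.5 falls in the 60-90 'lightgreen' band
st.markdown(f"Flesch Reading Ease: {html}", unsafe_allow_html=True)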
utils/gemma_translation.py ADDED
@@ -0,0 +1,661 @@
+ # utils/gemma_translation.py
+
+ import os
+ import logging
+ from dotenv import load_dotenv
+ from llama_cpp import Llama
+ import streamlit as st
+ from typing import Iterator, Optional, List
+ import re
+ import time
+ import psutil
+ import uuid
+ import shutil
+ import sys
+ import contextlib
+
+ # Import configuration defaults
+ from config import DEFAULT_CONFIG
+
+
+ @contextlib.contextmanager
+ def suppress_stdout_stderr():
+     """Context manager to suppress stdout and stderr."""
+     # Save original stdout/stderr
+     old_stdout = sys.stdout
+     old_stderr = sys.stderr
+
+     # Create a null device to redirect output
+     null_device = open(os.devnull, 'w')
+
+     try:
+         # Redirect stdout/stderr to null device
+         sys.stdout = null_device
+         sys.stderr = null_device
+         yield
+     finally:
+         # Restore original stdout/stderr
+         sys.stdout = old_stdout
+         sys.stderr = old_stderr
+         null_device.close()
+
+ from .chunking import chunk_text_with_separators
+
+ # Load environment variables
+ load_dotenv()
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Model configuration from config
+ ORIGINAL_MODEL_PATH = os.path.join("local_llms", "gemma-3-12b-it-Q4_K_M.gguf")
+ MODEL_DIR = os.path.join("local_llms", "instances")
+ os.makedirs(MODEL_DIR, exist_ok=True)
+
+ # Read configuration from config
+ DEFAULT_CONTEXT_SIZE = DEFAULT_CONFIG["GEMMA_CONTEXT_SIZE"]
+ DEFAULT_MAX_TOKENS = DEFAULT_CONFIG["MAX_TOKENS"]
+ DEFAULT_CHUNK_SIZE = DEFAULT_CONFIG["CHUNK_SIZE"]  # Max tokens per chunk
+ MODEL_INSTANCE_TIMEOUT = DEFAULT_CONFIG["MODEL_INSTANCE_TIMEOUT"]  # 30 minutes
+
+ # Garbage collection for session-specific model files
+ def cleanup_model_instances():
+     """Remove model instances that haven't been used in the last hour"""
+     try:
+         current_time = time.time()
+         for filename in os.listdir(MODEL_DIR):
+             file_path = os.path.join(MODEL_DIR, filename)
+             # Check if file is a model file and older than 1 hour
+             if filename.endswith(".gguf") and os.path.isfile(file_path):
+                 last_access = os.path.getatime(file_path)
+                 if current_time - last_access > 3600:  # 3600 seconds = 1 hour
+                     try:
+                         os.remove(file_path)
+                         logger.info(f"Removed unused model instance: {filename}")
+                     except Exception as e:
+                         logger.error(f"Could not remove model file {filename}: {str(e)}")
+     except Exception as e:
+         logger.error(f"Error in cleanup: {str(e)}")
+
+ # Run cleanup every time module is imported
+ cleanup_model_instances()
+
+ class LlamaCppTokenizerAdapter:
+     """
+     Adapter class to make llama-cpp Llama model compatible with chunking utility
+     which expects a HuggingFace tokenizer interface.
+     """
+     def __init__(self, llama_model):
+         self.model = llama_model
+
+     def encode(self, text, add_special_tokens=False):
+         """
+         Tokenize text using llama-cpp's tokenize method.
+
+         Args:
+             text: Text to tokenize
+             add_special_tokens: Ignored (included for compatibility)
+
+         Returns:
+             List of token IDs
+         """
+         try:
+             return self.model.tokenize(bytes(text, "utf-8"))
+         except Exception as e:
+             logger.warning(f"Tokenization error: {str(e)}")
+             # Fallback to character-based approximate tokenization (4 chars ≈ 1 token)
+             return [0] * (len(text) // 4 + 1)
+
+ class GemmaTranslator:
+     """
+     Translator using Gemma 3 model in GGUF format with streaming capability.
+     Uses a session-specific model file for complete isolation.
+     """
+
+     def __init__(self):
+         """Initialize the Gemma translator for the current session."""
+         self.initialized = False
+         self.model = None
+         self.tokenizer = None
+         self.using_gpu = False
+         self.session_id = getattr(st.session_state, 'session_id', str(uuid.uuid4()))
+
+         # Create a session-specific model path
+         self.model_path = self._get_session_model_path()
+
+     def _get_session_model_path(self):
+         """Get or create a session-specific model file."""
+
+         session_model_filename = f"gemma-{self.session_id}.gguf"
+         session_model_path = os.path.join(MODEL_DIR, session_model_filename)
+
+         # If the model file doesn't exist yet, create it by copying the original
+         if not os.path.exists(session_model_path):
+             if not os.path.exists(ORIGINAL_MODEL_PATH):
+                 raise FileNotFoundError(f"Original model file not found: {ORIGINAL_MODEL_PATH}")
+
+             logger.info(f"Creating session-specific model file for {self.session_id}")
+             try:
+                 shutil.copy2(ORIGINAL_MODEL_PATH, session_model_path)
+                 logger.info(f"Created session model at {session_model_path}")
+             except Exception as e:
+                 logger.error(f"Failed to create session model: {str(e)}")
+                 # Fallback to original model if copy fails
+                 return ORIGINAL_MODEL_PATH
+
+         return session_model_path
+
+
+     def load_model(self,
+                    n_gpu_layers: int = DEFAULT_CONFIG["GEMMA_GPU_LAYERS"],
+                    context_size: int = DEFAULT_CONTEXT_SIZE) -> None:
+         """
+         Load the Gemma model with specified parameters.
+
+         Args:
+             n_gpu_layers: Number of layers to offload to GPU
+             context_size: Context window size
+         """
+         # Parameters already have defaults from config
+         # No need for additional checks
+
+         if self.initialized:
+             if n_gpu_layers > 0 and not self.using_gpu:
+                 # Need to reload in GPU mode
+                 logger.info("Reloading model with GPU support...")
+                 self.unload_model()
+             elif n_gpu_layers == 0 and self.using_gpu:
+                 # Need to reload in CPU mode
+                 logger.info("Reloading model in CPU-only mode...")
+                 self.unload_model()
+             else:
+                 # No need to reload
+                 return
+
+         # Check if model file exists
+         if not os.path.exists(self.model_path):
+             logger.error(f"Model file not found: {self.model_path}")
+             raise FileNotFoundError(f"Model file not found: {self.model_path}")
+
+         try:
+             logger.info(f"Loading Gemma model from {self.model_path}...")
+             logger.info(f"Using GPU layers: {n_gpu_layers}")
+
+             # Log current system memory state
+             memory = psutil.virtual_memory()
+             logger.info(f"System memory: {memory.percent}% used, {memory.available / (1024**3):.2f}GB available")
+
+             # Create Llama model with streaming capability
+             try:
+                 # Suppress stderr output during model initialization
+                 with suppress_stdout_stderr():
+                     self.model = Llama(
+                         model_path=str(self.model_path),
+                         n_ctx=context_size,
+                         n_gpu_layers=n_gpu_layers,
+                         verbose=False
+                     )
+                 self.using_gpu = n_gpu_layers > 0
+
+                 # Create tokenizer adapter
+                 self.tokenizer = LlamaCppTokenizerAdapter(self.model)
+
+                 self.initialized = True
+                 logger.info(f"Gemma model loaded successfully with n_gpu_layers={n_gpu_layers}")
+             except Exception as load_error:
+                 logger.error(f"Error during model loading: {str(load_error)}")
+
+                 # If we failed with GPU, try CPU mode
+                 if n_gpu_layers > 0:
+                     logger.info("Attempting fallback to CPU-only mode...")
+                     try:
+                         # Suppress stderr output during model initialization
+                         with suppress_stdout_stderr():
+                             self.model = Llama(
+                                 model_path=str(self.model_path),
+                                 n_ctx=context_size,
+                                 n_gpu_layers=0,
+                                 verbose=False
+                             )
+                         self.using_gpu = False
+
+                         # Create tokenizer adapter
+                         self.tokenizer = LlamaCppTokenizerAdapter(self.model)
+
+                         self.initialized = True
+                         logger.info("Gemma model loaded successfully in CPU-only mode")
+                     except Exception as cpu_error:
+                         logger.error(f"CPU fallback also failed: {str(cpu_error)}")
+                         raise
+                 else:
+                     raise
+
+         except Exception as e:
+             logger.error(f"Failed to load Gemma model: {str(e)}")
+             raise
+
+     def unload_model(self):
+         """Unload the model to free memory"""
+         if self.initialized:
+             logger.info("Unloading Gemma model to free memory...")
+             self.model = None
+             self.tokenizer = None
+             self.initialized = False
+
+             # Force garbage collection
+             import gc
+             gc.collect()
+             logger.info("Gemma model unloaded")
+
+     def __del__(self):
+         """Cleanup when object is destroyed"""
+         self.unload_model()
+
+     def generate_translation_prompt(self, text: str, src_lang: str, tgt_lang: str) -> str:
+         """
+         Create a prompt for translation.
+
+         Args:
+             text: Text to translate
+             src_lang: Source language code ('en', 'ru', 'kk')
+             tgt_lang: Target language code ('en', 'ru', 'kk')
+
+         Returns:
+             Formatted prompt for the model
+         """
+         lang_map = {
+             'en': 'English',
+             'ru': 'Russian',
+             'kk': 'Kazakh'
+         }
+
+         source_lang = lang_map.get(src_lang, 'Unknown')
+         target_lang = lang_map.get(tgt_lang, 'Unknown')
+
+         system_prompt = (
+             f"Translate the following text from {source_lang} to {target_lang}. "
+             f"Provide only the translated text without explanations, introductions, or comments."
+         )
+
+         prompt = f"<|system|>\n{system_prompt}\n<|user|>\n{text}\n<|assistant|>\n"
+         return prompt
+
+     def is_text_too_large(self, text: str) -> bool:
+         """
+         Check if text is too large for the model's context window.
+
+         Args:
+             text: Input text
+
+         Returns:
+             True if text needs chunking, False otherwise
+         """
+         if not self.initialized:
+             self.load_model()
+
+         # Use actual tokenization when possible
+         try:
+             tokens = self.model.tokenize(bytes(text, "utf-8"))
+             token_count = len(tokens)
+         except Exception:
+             # Fallback to character-based approximation
+             token_count = len(text) / 4
+
+         # Allow for prompt overhead and model's response tokens
+         threshold = DEFAULT_CONTEXT_SIZE * 0.9
+
+         return token_count > threshold
+
+     def _split_text_into_sentences(self, text: str, lang: str) -> List[str]:
+         """
+         Split text into sentences for simple chunking when full chunking fails.
+
+         Args:
+             text: Text to split
+             lang: Language code
+
+         Returns:
+             List of sentences
+         """
+         if lang in ['ru', 'kk']:
+             # Russian/Kazakh sentence pattern
+             pattern = r'(?<=[.!?])\s+'
+         else:
+             # English sentence pattern
+             pattern = r'(?<=[.!?])\s+'
+
+         sentences = re.split(pattern, text)
+         return [s.strip() for s in sentences if s.strip()]
+
+     def translate(self,
+                   text: str,
+                   src_lang: str,
+                   tgt_lang: str,
+                   temperature: float = 0.1,
+                   top_p: float = 0.95,
+                   max_tokens: int = DEFAULT_MAX_TOKENS) -> str:
+         """
+         Translate text using Gemma model.
+
+         Args:
+             text: Text to translate
+             src_lang: Source language code ('en', 'ru', 'kk')
+             tgt_lang: Target language code ('en', 'ru', 'kk')
+             temperature: Generation temperature (lower = more deterministic)
+             top_p: Top-p sampling threshold
+             max_tokens: Maximum number of tokens to generate
+
+         Returns:
+             Translated text
+         """
+         if self.is_text_too_large(text):
+             logger.info("Text is too large, using chunking")
+             return self._translate_large_text(text, src_lang, tgt_lang, temperature, top_p, max_tokens)
+
+         # Prepare prompt for normal-sized text
+         prompt = self.generate_translation_prompt(text, src_lang, tgt_lang)
+
+         try:
+             # Generate translation
+             response = self.model(
+                 prompt,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 stop=["<|user|>", "<|system|>"],
+                 echo=False
+             )
+
+             # Extract translated text
+             if response and "choices" in response and len(response["choices"]) > 0:
+                 return response["choices"][0]["text"].strip()
+             else:
+                 logger.warning("Empty or invalid response from model")
+                 return ""
+
+         except Exception as e:
+             logger.error(f"Translation error: {str(e)}")
+             return f"Error: {str(e)}"
+
+     def _translate_large_text(self,
+                               text: str,
+                               src_lang: str,
+                               tgt_lang: str,
+                               temperature: float = 0.1,
+                               top_p: float = 0.95,
+                               max_tokens: int = DEFAULT_MAX_TOKENS) -> str:
+         """
+         Translate large text by splitting it into chunks.
+
+         Args:
+             text: Text to translate
+             src_lang: Source language code ('en', 'ru', 'kk')
+             tgt_lang: Target language code ('en', 'ru', 'kk')
+             temperature: Generation temperature
+             top_p: Top-p sampling threshold
+             max_tokens: Maximum tokens to generate
+
+         Returns:
+             Translated text with chunks combined
+         """
+         try:
+             # Determine language for chunking
+             lang_for_chunking = 'russian' if src_lang in ['ru', 'kk'] else 'english'
+
+             # Use the chunking utility to split text
+             try:
+                 chunks_with_seps = chunk_text_with_separators(
+                     text=text,
+                     tokenizer=self.tokenizer,
+                     max_tokens=DEFAULT_CHUNK_SIZE,
+                     lang=lang_for_chunking
+                 )
+             except Exception as chunk_error:
+                 # Fallback to simpler sentence splitting if advanced chunking fails
+                 logger.warning(f"Advanced chunking failed: {str(chunk_error)}. Using simple sentence splitting.")
+                 sentences = self._split_text_into_sentences(text, src_lang)
+                 chunks_with_seps = [(sent, " ") for sent in sentences]
+
+             translations = []
+             for chunk_idx, (chunk, separator) in enumerate(chunks_with_seps):
+                 if not chunk.strip():
+                     translations.append(separator)
+                     continue
+
+                 logger.info(f"Translating chunk {chunk_idx + 1} of {len(chunks_with_seps)}")
+
+                 # Translate each chunk
+                 prompt = self.generate_translation_prompt(chunk, src_lang, tgt_lang)
+                 try:
+                     response = self.model(
+                         prompt,
+                         max_tokens=max_tokens,
+                         temperature=temperature,
+                         top_p=top_p,
+                         stop=["<|user|>", "<|system|>"],
+                         echo=False
+                     )
+
+                     if response and "choices" in response and len(response["choices"]) > 0:
+                         translated_chunk = response["choices"][0]["text"].strip()
+                         translations.append(translated_chunk)
+                         translations.append(separator)
+                     else:
+                         logger.warning(f"Empty response for chunk {chunk_idx}")
+                         translations.append("[Translation error]")
+                         translations.append(separator)
+
+                 except Exception as e:
+                     logger.error(f"Error translating chunk {chunk_idx}: {str(e)}")
+                     translations.append(f"[Error: {str(e)}]")
+                     translations.append(separator)
+
+             # Combine all translated chunks
+             combined_text = ''.join(translations)
+
+             # Cleanup and postprocessing
+             return self._postprocess_translation(combined_text)
+
+         except Exception as e:
+             logger.error(f"Large text translation error: {str(e)}")
+             return f"Error: {str(e)}"
+
+     def _postprocess_translation(self, text: str) -> str:
+         """Clean up and format the translated text."""
+         # Remove multiple spaces
+         text = ' '.join(text.split())
+         # Fix punctuation spacing
+         text = text.replace(' .', '.').replace(' ,', ',')
+         text = text.replace(' !', '!').replace(' ?', '?')
+         # Fix quote spacing
+         text = text.replace('" ', '"').replace(' "', '"')
+         return text
+
+     def translate_streaming(self,
+                             text: str,
+                             src_lang: str,
+                             tgt_lang: str,
+                             temperature: float = 0.1,
+                             top_p: float = 0.95,
+                             max_tokens: int = DEFAULT_MAX_TOKENS) -> Iterator[str]:
+         """
+         Stream translation using Gemma model.
+
+         Args:
+             text: Text to translate
+             src_lang: Source language code ('en', 'ru', 'kk')
+             tgt_lang: Target language code ('en', 'ru', 'kk')
+             temperature: Generation temperature (lower = more deterministic)
+             top_p: Top-p sampling threshold
+             max_tokens: Maximum number of tokens to generate
+
+         Yields:
+             Chunks of translated text as they're generated
+         """
+         if self.is_text_too_large(text):
+             logger.info("Text is too large, using chunked streaming")
+             yield from self._translate_large_text_streaming(text, src_lang, tgt_lang, temperature, top_p, max_tokens)
+             return
+
+         # Prepare prompt for normal-sized text
+         prompt = self.generate_translation_prompt(text, src_lang, tgt_lang)
+
+         try:
+             # Stream translation
+             for chunk in self.model(
+                 prompt,
+                 max_tokens=max_tokens,
+                 temperature=temperature,
+                 top_p=top_p,
+                 stop=["<|user|>", "<|system|>"],
+                 echo=False,
+                 stream=True
+             ):
+                 if chunk and "choices" in chunk and len(chunk["choices"]) > 0:
+                     token = chunk["choices"][0]["text"]
+                     if token:
+                         yield token
+
+         except Exception as e:
+             logger.error(f"Streaming translation error: {str(e)}")
+             yield f"Error: {str(e)}"
+
+     def _translate_large_text_streaming(self,
+                                         text: str,
+                                         src_lang: str,
+                                         tgt_lang: str,
+                                         temperature: float = 0.1,
+                                         top_p: float = 0.95,
+                                         max_tokens: int = DEFAULT_MAX_TOKENS) -> Iterator[str]:
+         """
+         Stream translation of large text by chunks.
+
+         Args:
+             text: Text to translate
+             src_lang: Source language code ('en', 'ru', 'kk')
+             tgt_lang: Target language code ('en', 'ru', 'kk')
+             temperature: Generation temperature
+             top_p: Top-p sampling threshold
+             max_tokens: Maximum tokens to generate
+
+         Yields:
+             Chunks of translated text
+         """
+         try:
+             # Determine language for chunking
+             lang_for_chunking = 'russian' if src_lang in ['ru', 'kk'] else 'english'
+
+             # Use the chunking utility to split text
+             try:
+                 chunks_with_seps = chunk_text_with_separators(
+                     text=text,
+                     tokenizer=self.tokenizer,
+                     max_tokens=DEFAULT_CHUNK_SIZE,
+                     lang=lang_for_chunking
+                 )
+             except Exception as chunk_error:
+                 # Fallback to simpler sentence splitting if advanced chunking fails
+                 logger.warning(f"Advanced chunking failed: {str(chunk_error)}. Using simple sentence splitting.")
+                 sentences = self._split_text_into_sentences(text, src_lang)
+                 chunks_with_seps = [(sent, " ") for sent in sentences]
+
+             for chunk_idx, (chunk, separator) in enumerate(chunks_with_seps):
+                 if not chunk.strip():
+                     yield separator
+                     continue
+
+                 if chunk_idx > 0:
+                     yield "\n\n"  # Add visual separation between chunks
+
+                 # Translate each chunk
+                 prompt = self.generate_translation_prompt(chunk, src_lang, tgt_lang)
+
+                 try:
+                     # Stream chunk translation
+                     for token_chunk in self.model(
+                         prompt,
+                         max_tokens=max_tokens,
+                         temperature=temperature,
+                         top_p=top_p,
+                         stop=["<|user|>", "<|system|>"],
+                         echo=False,
+                         stream=True
+                     ):
+                         if token_chunk and "choices" in token_chunk and len(token_chunk["choices"]) > 0:
+                             token = token_chunk["choices"][0]["text"]
+                             if token:
+                                 yield token
+
+                     # Add separator after chunk
+                     yield separator
+
+                 except Exception as e:
+                     logger.error(f"Error streaming chunk {chunk_idx}: {str(e)}")
+                     yield f"\n[Error translating part {chunk_idx + 1}: {str(e)}]\n"
+
+         except Exception as e:
+             logger.error(f"Large text streaming error: {str(e)}")
+             yield f"\nError: {str(e)}"
+
+
+ def gemma_translate(text: str, src_lang: str, tgt_lang: str, streaming: bool = True) -> Optional[Iterator[str]]:
+     """
+     Main function to translate text using Gemma 3 model.
+
+     Args:
+         text: Text to translate
+         src_lang: Source language code ('en', 'ru', 'kk')
+         tgt_lang: Target language code ('en', 'ru', 'kk')
+         streaming: Whether to stream the output
+
+     Returns:
+         If streaming is True: Iterator yielding chunks of translated text
+         If streaming is False: Complete translated text
+     """
+     if not text or not src_lang or not tgt_lang:
+         return "" if not streaming else iter([""])
+
+     translator = GemmaTranslator()
+
+     try:
+         if streaming:
+             return translator.translate_streaming(text, src_lang, tgt_lang)
+         else:
+             return translator.translate(text, src_lang, tgt_lang)
+     except Exception as e:
+         logger.error(f"Translation failed: {str(e)}")
+         return "" if not streaming else iter([f"Error: {str(e)}"])
+
+
+ def display_streaming_translation(text: str, src_lang: str, tgt_lang: str) -> tuple:
+     """
+     Display streaming translation in a Streamlit app.
+
+     Args:
+         text: Text to translate
+         src_lang: Source language code ('en', 'ru', 'kk')
+         tgt_lang: Target language code ('en', 'ru', 'kk')
+
+     Returns:
+         tuple: (translated_text, needs_chunking)
+     """
+     if not text:
+         return "", False
+
+     # Check if text needs chunking
+     translator = GemmaTranslator()
+     if not translator.initialized:
+         translator.load_model()
+     needs_chunking = translator.is_text_too_large(text)
+
+     # Create placeholder for streaming output
+     placeholder = st.empty()
+     result = ""
+
+     # Stream translation
+     for token in gemma_translate(text, src_lang, tgt_lang, streaming=True):
+         result += token
+         placeholder.markdown(result)
+
+     return result, needs_chunking
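A minimal streaming usage sketch for gemma_translate outside Streamlit; it assumes the GGUF file exists under local_llms/ and that config.DEFAULT_CONFIG is importable:

from utils.gemma_translation import gemma_translate

for token in gemma_translate("Hello, world!", src_lang="en", tgt_lang="kk", streaming=True):
    print(token, end="", flush=True)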
utils/readability_indices.py ADDED
@@ -0,0 +1,132 @@
+ # readability_indices.py
+
+ from nltk.tokenize import sent_tokenize, word_tokenize
+ import pyphen
+ import re
+ from IPython.display import display, HTML
+
+ def count_syllables(word, lang):
+     if lang == 'kk':
+         # Use a simple vowel-counting algorithm for Kazakh
+         word = word.lower()
+         vowels = "аәеёиоуыэюяіүұө"
+         syllables = sum(1 for char in word if char in vowels)
+         return max(1, syllables)
+     else:
+         # Use Pyphen for Russian and English
+         dic = pyphen.Pyphen(lang=lang)
+         hyphens = dic.inserted(word)
+         return max(1, hyphens.count('-') + 1)
+
+ # Function for identifying complex words
+ def is_complex_word(word, lang, syllable_threshold=3):
+     syllables = count_syllables(word, lang)
+     return syllables >= syllable_threshold
+
+ # Functions for computing readability indices
+ def flesch_reading_ease(text, lang):
+     sentences = sent_tokenize(text, language='russian' if lang == 'ru' else 'english')
+     words = word_tokenize(text, language='russian' if lang == 'ru' else 'english')
+     words = [word for word in words if word.isalpha()]
+     num_sentences = max(1, len(sentences))
+     num_words = max(1, len(words))
+     syllable_count = sum([count_syllables(word, lang) for word in words])
+     asl = num_words / num_sentences  # Average sentence length
+     asw = syllable_count / num_words  # Average syllables per word
+     if lang == 'ru':
+         fre = 206.835 - (1.3 * asl) - (60.1 * asw)
+     elif lang == 'en':
+         fre = 206.835 - (1.015 * asl) - (84.6 * asw)
+     elif lang == 'kk':
+         # Tentative coefficients for Kazakh
+         fre = 206.835 - (1.2 * asl) - (70 * asw)
+     else:
+         fre = 0
+     return fre
+
+ def flesch_kincaid_grade_level(text, lang):
+     sentences = sent_tokenize(text, language='russian' if lang == 'ru' else 'english')
+     words = word_tokenize(text, language='russian' if lang == 'ru' else 'english')
+     words = [word for word in words if word.isalpha()]
+     num_sentences = max(1, len(sentences))
+     num_words = max(1, len(words))
+     syllable_count = sum([count_syllables(word, lang) for word in words])
+     asl = num_words / num_sentences
+     asw = syllable_count / num_words
+     if lang == 'ru':
+         fkgl = (0.5 * asl) + (8.4 * asw) - 15.59
+     elif lang == 'en':
+         fkgl = (0.39 * asl) + (11.8 * asw) - 15.59
+     elif lang == 'kk':
+         fkgl = (0.5 * asl) + (9 * asw) - 13
+     else:
+         fkgl = 0
+     return fkgl
+
+ def gunning_fog_index(text, lang):
+     sentences = sent_tokenize(text, language='russian' if lang == 'ru' else 'english')
+     words = word_tokenize(text, language='russian' if lang == 'ru' else 'english')
+     words = [word for word in words if word.isalpha()]
+     num_sentences = max(1, len(sentences))
+     num_words = max(1, len(words))
+     complex_words = [word for word in words if is_complex_word(word, lang)]
+     percentage_complex = (len(complex_words) / num_words) * 100
+     asl = num_words / num_sentences
+     fog_index = 0.4 * (asl + percentage_complex)
+     return fog_index
+
+ def smog_index(text, lang):
+     sentences = sent_tokenize(text, language='russian' if lang == 'ru' else 'english')
+     words = word_tokenize(text, language='russian' if lang == 'ru' else 'english')
+     words = [word for word in words if word.isalpha()]
+     num_sentences = len(sentences)
+     complex_words = [word for word in words if is_complex_word(word, lang)]
+     num_complex = len(complex_words)
+     if num_sentences >= 3:
+         smog = 1.0430 * ((num_complex * (30 / num_sentences)) ** 0.5) + 3.1291
+     else:
+         smog = 0
+     return smog
+
+ # Function for highlighting complex words and sentences
+ def highlight_complex_text(text, lang):
+     sentences = sent_tokenize(text, language='russian' if lang == 'ru' else 'english')
+     highlighted_sentences = []
+     complex_words_list = []
+     for sentence in sentences:
+         words = word_tokenize(sentence, language='russian' if lang == 'ru' else 'english')
+         words_filtered = [word for word in words if word.isalpha()]
+         complex_words = [word for word in words_filtered if is_complex_word(word, lang)]
+         complex_words_list.extend(complex_words)
+         if len(words_filtered) > 0 and (len(complex_words) / len(words_filtered)) > 0.3:
+             highlighted_sentence = f"<mark>{sentence}</mark>"
+         else:
+             highlighted_sentence = sentence
+         for word in complex_words:
+             highlighted_sentence = re.sub(r'\b{}\b'.format(re.escape(word)), f"<b>{word}</b>", highlighted_sentence)
+         highlighted_sentences.append(highlighted_sentence)
+     highlighted_text = ' '.join(highlighted_sentences)
+     return highlighted_text, complex_words_list
+
+ # Main entry point
+ def analyze_text(text, lang_code):
+     if lang_code not in ['ru', 'en', 'kk']:
+         print('Unsupported language code. Please use "ru" for Russian, "en" for English, or "kk" for Kazakh.')
+         return
+     fre = flesch_reading_ease(text, lang_code)
+     fkgl = flesch_kincaid_grade_level(text, lang_code)
+     fog = gunning_fog_index(text, lang_code)
+     smog = smog_index(text, lang_code)
+
+     highlighted_text, complex_words = highlight_complex_text(text, lang_code)
+
+     # Print the results (user-facing labels are intentionally in Russian)
+     print(f"Язык: {'Русский' if lang_code == 'ru' else 'Английский' if lang_code == 'en' else 'Казахский'}")
+     print(f"Индекс удобочитаемости Флеша: {fre:.2f}")
+     print(f"Индекс Флеша-Кинкейда: {fkgl:.2f}")
+     print(f"Индекс тумана Ганнинга: {fog:.2f}")
+     print(f"Индекс SMOG: {smog:.2f}")
+     print("\nСложные слова:")
+     print(', '.join(set(complex_words)))
+     print("\nТекст с выделениями:")
+     display(HTML(highlighted_text))
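analyze_text prints the four indices and renders the highlighted text through IPython.display, so it is intended for notebook use. A sketch:

from utils.readability_indices import analyze_text

analyze_text("This is a short example sentence. It is intentionally simple.", "en")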
utils/sherkala.py ADDED
@@ -0,0 +1,36 @@
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+
+ model_path = "inceptionai/Llama-3.1-Sherkala-8B-Chat"
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
+ model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16, device_map="auto")
+ device = "mps"  # hardcoded for Apple Silicon; original fallback: "cuda" if torch.cuda.is_available() else "cpu"
+
+ tokenizer.chat_template = "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role']+'<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %} {% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}"
+
+
+ def get_response(text):
+     conversation = [
+         {"role": "user", "content": text}
+     ]
+
+     input_ids = tokenizer.apply_chat_template(
+         conversation=conversation,
+         tokenize=True,
+         add_generation_prompt=True,
+         return_tensors="pt").to(device)
+
+     # Generate a response
+     gen_tokens = model.generate(
+         input_ids,
+         max_new_tokens=500,
+         stop_strings=["<|eot_id|>"],
+         tokenizer=tokenizer
+     )
+
+     # Decode and return only the generated text, skipping the prompt tokens
+     gen_text = tokenizer.decode(gen_tokens[0][len(input_ids[0]): -1])
+     return gen_text
+
+ question = 'Қазақстанның жақсы тағамдарын ұсына аласыз ба?'
+ print(get_response(question))
utils/text_processing.py ADDED
@@ -0,0 +1,15 @@
+ # utils/text_processing.py
+
+ from langdetect import detect, DetectorFactory
+
+ DetectorFactory.seed = 0
+
+ def detect_language(text):
+     try:
+         lang = detect(text)
+         # Only ru/en/kk are supported downstream; anything else is treated as undetected
+         if lang not in ['ru', 'en', 'kk']:
+             return None
+         return lang
+     except Exception:
+         return None
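A quick sketch of detect_language; note that the stock langdetect profiles may not include Kazakh, so 'kk' may never actually be returned (which the guard above already tolerates):

from utils.text_processing import detect_language

print(detect_language("Привет, как дела?"))  # expected: 'ru'
print(detect_language("12345"))              # detection fails -> None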
utils/tilmash_translation.py ADDED
@@ -0,0 +1,455 @@
+ # utils/tilmash_translation.py
+
+ import logging
+ import re
+ import os
+ import threading
+ import time
+ import uuid
+ from dotenv import load_dotenv
+ from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TranslationPipeline
+ from .chunking import chunk_text_with_separators
+ from huggingface_hub import login
+ from typing import Iterator
+ from config import DEFAULT_CONFIG
+
+ # Load environment variables from .env file
+ load_dotenv()
+ hf_token = os.getenv('HF_TOKEN')
+ if not hf_token:
+     logging.warning("HF_TOKEN not found in environment variables. Model downloading might fail.")
+ else:
+     login(token=hf_token)
+
+ # Global tilmash lock file
+ LOCK_DIR = os.path.join("local_llms", "locks")
+ os.makedirs(LOCK_DIR, exist_ok=True)
+ TILMASH_LOCK_FILE = os.path.join(LOCK_DIR, "tilmash.lock")
+
+ # Get session timeout from config
+ SESSION_TIMEOUT = DEFAULT_CONFIG["SESSION_TIMEOUT"]
+
+ class ExclusiveResourceLock:
+     """File-based lock for exclusive GPU resource access across processes."""
+
+     def __init__(self, lock_file, timeout=SESSION_TIMEOUT):
+         self.lock_file = lock_file
+         self.timeout = timeout
+         self.lock_id = str(uuid.uuid4())
+         self.acquired = False
+
+     def acquire(self):
+         """Acquire exclusive lock with timeout."""
+         start_time = time.time()
+
+         while time.time() - start_time < self.timeout:
+             try:
+                 # Try to create the lock file
+                 if not os.path.exists(self.lock_file):
+                     with open(self.lock_file, 'w') as f:
+                         f.write(f"{self.lock_id}\n{os.getpid()}\n{time.time()}")
+
+                     # Verify we got the lock
+                     with open(self.lock_file, 'r') as f:
+                         content = f.read().split('\n')
+                         if content and content[0] == self.lock_id:
+                             self.acquired = True
+                             return True
+
+                 # Check if lock file is stale (older than 5 minutes)
+                 elif os.path.exists(self.lock_file):
+                     lock_time = os.path.getmtime(self.lock_file)
+                     if time.time() - lock_time > 300:  # 5 minutes
+                         try:
+                             # Remove stale lock
+                             os.remove(self.lock_file)
+                             continue
+                         except Exception:
+                             pass
+
+                 # Wait before retrying
+                 time.sleep(1)
+
+             except Exception as e:
+                 logging.error(f"Lock acquisition error: {str(e)}")
+                 time.sleep(1)
+
+         return False
+
+     def release(self):
+         """Release the lock if we own it."""
+         if not self.acquired:
+             return
+
+         try:
+             if os.path.exists(self.lock_file):
+                 with open(self.lock_file, 'r') as f:
+                     content = f.read().split('\n')
+                     if content and content[0] == self.lock_id:
+                         os.remove(self.lock_file)
+                         self.acquired = False
+         except Exception as e:
+             logging.error(f"Lock release error: {str(e)}")
+
+     def __enter__(self):
+         self.acquire()
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.release()
+
+ class TilmashTranslator:
+     """
+     Thread-safe translator using Tilmash model
+     """
+
+     def __init__(self):
+         """Initialize the Tilmash translator."""
+         # Re-entrant lock so nested calls within one thread don't deadlock
+         self._lock = threading.RLock()
+         self.initialized = False
+         self.model = None
+         self.tokenizer = None
+
+         # Get session ID
+         import streamlit as st
+         self.session_id = getattr(st.session_state, 'session_id', str(uuid.uuid4()))
+
+     def load_model(self):
+         """Load the Tilmash model if not already loaded."""
+         with self._lock:
+             if self.initialized:
+                 return self.model, self.tokenizer
+
+             try:
+                 model_name = "issai/tilmash"
+                 cache_dir = "local_llms"
+
+                 # Ensure cache directory exists
+                 os.makedirs(cache_dir, exist_ok=True)
+
+                 try:
+                     # First try to load the model locally
+                     logging.info(f"Loading Tilmash model for session {self.session_id[:8]}...")
+                     try:
+                         self.tokenizer = AutoTokenizer.from_pretrained(
+                             model_name,
+                             cache_dir=cache_dir,
+                             local_files_only=True
+                         )
+                         self.model = AutoModelForSeq2SeqLM.from_pretrained(
+                             model_name,
+                             cache_dir=cache_dir,
+                             local_files_only=True
+                         )
+                         logging.info("Successfully loaded model from local cache.")
+                     except OSError:
+                         # If local loading fails, download the model
+                         logging.info("Model not found locally. Downloading from Hugging Face...")
+                         self.tokenizer = AutoTokenizer.from_pretrained(
+                             model_name,
+                             cache_dir=cache_dir,
+                             local_files_only=False
+                         )
+                         self.model = AutoModelForSeq2SeqLM.from_pretrained(
+                             model_name,
+                             cache_dir=cache_dir,
+                             local_files_only=False
+                         )
+                         logging.info("Successfully downloaded and loaded the model.")
+
+                     self.initialized = True
+                     return self.model, self.tokenizer
+
+                 except ValueError as e:
+                     logging.error(f"Invalid model configuration: {str(e)}")
+                     raise ValueError(f"Failed to load model: {str(e)}")
+                 except Exception as e:
+                     logging.error(f"Unexpected error during model initialization: {str(e)}")
+                     raise Exception(f"Failed to load model: {str(e)}")
+             except Exception as e:
+                 logging.error(f"Failed to load Tilmash model: {str(e)}")
+                 raise
+
+     def unload_model(self):
+         """Unload the model to free memory"""
+         with self._lock:
+             if self.initialized:
+                 logging.info("Unloading Tilmash model to free memory...")
+                 self.model = None
+                 self.tokenizer = None
+                 self.initialized = False
+
+                 # Force garbage collection
+                 import gc
+                 gc.collect()
+                 logging.info("Tilmash model unloaded")
+
+     def create_pipeline(self, src_lang, tgt_lang, max_length=512):
+         """Create a translation pipeline with the loaded model."""
+         with self._lock:
+             lang_map = {
+                 'ru': 'rus_Cyrl',
+                 'en': 'eng_Latn',
+                 'kk': 'kaz_Cyrl'
+             }
+
+             # Validate language pair
+             if src_lang not in lang_map or tgt_lang not in lang_map:
+                 raise ValueError(f"Unsupported language pair: {src_lang} -> {tgt_lang}")
+
+             # Make sure model is loaded
+             if not self.initialized:
+                 self.load_model()
+
+             # Configure translation pipeline with optimized parameters
+             pipeline = TranslationPipeline(
+                 model=self.model,
+                 tokenizer=self.tokenizer,
+                 src_lang=lang_map[src_lang],
+                 tgt_lang=lang_map[tgt_lang],
+                 max_length=max_length,
+                 num_beams=7,
+                 early_stopping=True,
+                 repetition_penalty=1.3,
+                 no_repeat_ngram_size=2,
+                 length_penalty=1.1,
+                 truncation=True,
+                 clean_up_tokenization_spaces=True
+             )
+
+             return pipeline
+
+     def translate(self, text, src_lang, tgt_lang, max_length=512):
+         """Translate text using the Tilmash model."""
+         with self._lock:
+             try:
+                 pipeline = self.create_pipeline(src_lang, tgt_lang, max_length)
+
+                 # Split text into sentences for better quality
+                 sentences = re.split(r'(?<=[.!?]) +', text)
+                 translated_sentences = []
+
+                 for sentence in sentences:
+                     if sentence.strip():
+                         result = pipeline(sentence)
+                         translated_sentence = _extract_translation(result)
+                         translated_sentences.append(translated_sentence)
+
+                 return ' '.join(translated_sentences)
+             except Exception as e:
+                 logging.error(f"Translation error: {str(e)}")
+                 return f"Error: {str(e)}"
+
+     def translate_streaming(self, text, src_lang, tgt_lang, max_length=512) -> Iterator[str]:
+         """Stream translation results sentence by sentence."""
+         try:
+             # Make sure model is loaded - must be done in the locked section
+             with self._lock:
+                 if not self.initialized:
+                     self.load_model()
+                 pipeline = self.create_pipeline(src_lang, tgt_lang, max_length)
+
+             # Check if text is too large for single processing
+             # Improved text size detection - check by paragraphs
+             paragraphs = re.split(r'\n\s*\n', text)
+             is_large_text = len(paragraphs) > 3 or len(text) > 1000  # Multiple paragraphs or long text
+
+             if is_large_text:
+                 # Process paragraph by paragraph for structured documents
+                 for i, paragraph in enumerate(paragraphs):
+                     if not paragraph.strip():
+                         yield "\n\n"
+                         continue
+
+                     # If paragraph itself is too large, process it sentence by sentence
+                     if len(paragraph) > 800:
+                         sentences = re.split(r'(?<=[.!?])\s+', paragraph)
+                         for sentence in sentences:
+                             if not sentence.strip():
+                                 continue
+
+                             try:
+                                 # Only lock the actual model inference
+                                 with self._lock:
+                                     result = pipeline(sentence)
+                                 translated = _extract_translation(result)
+                                 yield translated + " "
+                             except Exception as e:
+                                 logging.error(f"Error translating sentence: {str(e)}")
+                                 yield f"[Error: {str(e)}] "
+                     else:
+                         # Process whole paragraph at once
+                         try:
+                             # Only lock the actual model inference
+                             with self._lock:
+                                 result = pipeline(paragraph)
+                             translated = _extract_translation(result)
+                             yield translated
+                             # Add paragraph break after each paragraph
+                             if i < len(paragraphs) - 1:
+                                 yield "\n\n"
+                         except Exception as e:
+                             logging.error(f"Error translating paragraph: {str(e)}")
+                             yield f"[Error translating paragraph: {str(e)}]\n\n"
+             else:
+                 # For short texts, process the entire text at once
+                 try:
+                     # Only lock the actual model inference
+                     with self._lock:
+                         result = pipeline(text)
+                     translated = _extract_translation(result)
+                     yield translated
+                 except Exception as e:
+                     logging.error(f"Error translating text: {str(e)}")
+                     yield f"[Error: {str(e)}]"
+         except Exception as e:
+             logging.error(f"Streaming translation error: {str(e)}")
+             yield f"Error initializing translation: {str(e)}"
+
+
+ def tilmash_translate(input_text, src_lang, tgt_lang, max_length=512):
+     """Main translation function with structure preservation"""
+     try:
+         translator = TilmashTranslator()
+         return translator.translate(input_text, src_lang, tgt_lang, max_length)
+     except Exception as e:
+         logging.error(f"Translation failed: {str(e)}")
+         return f"Translation error: {str(e)}"
+
+
+ def tilmash_translate_streaming(input_text, src_lang, tgt_lang, max_length=512) -> Iterator[str]:
+     """Streaming version of the translation function that yields translated sentences one by one"""
+     try:
+         translator = TilmashTranslator()
+         yield from translator.translate_streaming(input_text, src_lang, tgt_lang, max_length)
+     except Exception as e:
+         logging.error(f"Streaming translation failed: {str(e)}")
+         yield f"Translation error: {str(e)}"
+
+
+ def display_tilmash_streaming_translation(text: str, src_lang: str, tgt_lang: str) -> tuple:
+     """
+     Display streaming translation in a Streamlit app.
+
+     Args:
+         text: Text to translate
+         src_lang: Source language code ('en', 'ru', 'kk')
+         tgt_lang: Target language code ('en', 'ru', 'kk')
+
+     Returns:
+         tuple: (translated_text, needs_chunking)
+     """
+     import streamlit as st
+
+     if not text:
+         return "", False
+
+     # Check if text needs chunking
+     needs_chunking = len(text) > 1000  # Roughly 250 tokens
+
+     # Create placeholder for streaming output
+     placeholder = st.empty()
+     result = ""
+
+     # Stream translation
+     for sentence in tilmash_translate_streaming(text, src_lang, tgt_lang):
+         result += sentence
+         placeholder.markdown(result)
+
+     return result, needs_chunking
+
+
+ def _extract_translation(result):
+     """Safe extraction of translation text from pipeline output"""
+     try:
+         if isinstance(result, list) and len(result) > 0:
+             return result[0].get('translation_text', '').strip()
+         return ""
+     except Exception as e:
+         logging.error(f"Translation extraction error: {str(e)}")
+         return ""
+
+
+ def _process_large_text(text, src_lang, pipeline, tokenizer, max_length):
+     """Process long documents with structure preservation"""
+     try:
+         chunks_with_seps = chunk_text_with_separators(
+             text=text,
+             tokenizer=tokenizer,
+             max_tokens=int(0.9 * max_length),
+             lang='russian' if src_lang in ['ru', 'kk'] else 'english'
+         )
+     except Exception as e:
+         logging.error(f"Chunking failed: {str(e)}")
+         return ""
+
+     translations = []
+     prev_separator = None
+
+     for chunk_idx, (chunk, separator) in enumerate(chunks_with_seps):
+         if not chunk.strip():
+             translations.append(separator)
+             continue
+
+         try:
+             # Process chunk through translation pipeline
+             result = pipeline(chunk)
+             translated = _extract_translation(result)
+
+             # Preserve original document structure
+             if prev_separator:
+                 translations.append(prev_separator)
+
+             # Add indentation for list items and tables
+             if _is_structured_element(chunk):
+                 translated = _preserve_structure(translated, chunk)
+
+             translations.append(translated)
+             prev_separator = separator
+
+         except Exception as e:
+             logging.error(f"Chunk {chunk_idx + 1} error: {str(e)}")
+             translations.append(f"<<ERROR: {chunk[:50]}...>>{separator or ' '}")
+             prev_separator = separator
+
+     # Assemble final text with cleanup
+     final_text = ''.join(translations).strip()
+     return _postprocess_translation(final_text)
+
+
+ def _is_structured_element(text):
+     """Check if text contains document structure elements"""
+     return any([
+         re.match(r'^\s*(\d+\.|\-|\*)\s', text),  # List items
+         re.search(r':\s*$', text) and re.search(r'[A-ZА-Я]{3,}', text),  # Headers
+         re.search(r'\|.+\|', text),  # Tables
+         re.search(r'\b(Таблица|Table)\b', text, re.IGNORECASE)  # Table labels
+     ])
+
+
+ def _preserve_structure(translated, original):
+     """Maintain original formatting in translated structured elements"""
+     # Preserve list indentation
+     if re.match(r'^\s*(\d+\.|\-|\*)\s', original):
+         return '\n' + translated.lstrip()
+
+     # Preserve table formatting
+     if '|' in original:
+         return translated.replace(' | ', '|').replace('| ', '|').replace(' |', '|')
+
+     return translated
+
+
+ def _postprocess_translation(text):
+     """Final cleanup of translated text"""
+     # Fix list numbering
+     text = re.sub(r'\n(\d+)\.\s*\n', r'\n\1. ', text)
+     # Repair table formatting
+     text = re.sub(r'(:\s*)\n(\S)', r'\1\2', text)
+     # Normalize whitespace
+     text = re.sub(r'([,:;])\s+', r'\1 ', text)
+     text = re.sub(r'\s+([.!?])', r'\1', text)
+     # Normalize quotation marks (guillemets to straight quotes)
+     text = text.replace('«', '"').replace('»', '"')
+     return text
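A minimal usage sketch for the non-streaming entry point; on first run the issai/tilmash weights are downloaded into local_llms/ (HF_TOKEN may be required), and streamlit must be importable since the translator reads st.session_state:

from utils.tilmash_translation import tilmash_translate

print(tilmash_translate("Какая сегодня погода?", src_lang="ru", tgt_lang="en"))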