longvnhue1 committed
Commit b701a5b · 1 Parent(s): e263c34
Files changed (1)
  1. app.py +32 -58
app.py CHANGED
@@ -1,47 +1,32 @@
 from fastapi import FastAPI, Request
 from pydantic import BaseModel
-import time
 from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
 import torch
 import re
-#fastapi
-app = FastAPI()
+
+app = FastAPI()
 
-def simple_sentence_tokenize(text):
-    # Split into sentences at '.', '?' or '!' followed by whitespace or a newline
-    sentence_endings = re.compile(r'(?<=[.!?])\s+')
-    return sentence_endings.split(text)
 
-def split_text_by_sentences(text, min_words=150, max_words=200, fallback_words=180):
-    sentences = simple_sentence_tokenize(text)
+def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
+    words = re.findall(r'\S+|\n', text)  # keep '\n' as its own "word"
     chunks = []
-    current_chunk = []
-    current_word_count = 0
-
-    for sentence in sentences:
-        sentence = sentence.strip()
-        if not sentence:
-            continue
-
-        word_count = len(sentence.split())
-
-        if current_word_count + word_count <= max_words:
-            current_chunk.append(sentence)
-            current_word_count += word_count
+    start = 0
+    while start < len(words):
+        end = min(start + max_words, len(words))
+        # Find the last period in the min_words..max_words window
+        dot_idx = -1
+        for i in range(start + min_words, min(start + max_words, len(words))):
+            if words[i] == '.' or (words[i].endswith('.') and words[i] != '\n'):
+                dot_idx = i
+        if dot_idx != -1:
+            chunk_end = dot_idx + 1
+        elif end - start > fallback_words:
+            chunk_end = start + fallback_words
         else:
-            if current_word_count >= min_words:
-                chunks.append(' '.join(current_chunk))
-                current_chunk = [sentence]
-                current_word_count = word_count
-            else:
-                if current_chunk:
-                    chunks.append(' '.join(current_chunk))
-                current_chunk = [sentence]
-                current_word_count = word_count
-
-    if current_chunk:
-        chunks.append(' '.join(current_chunk))
-
+            chunk_end = end
+        chunk = ' '.join(words[start:chunk_end]).replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
+        chunks.append(chunk.strip())
+        start = chunk_end
     return chunks
 
 # Load model
@@ -80,31 +65,20 @@ class TranslateRequest(BaseModel):
 @app.post("/translate")
 def translate_text(req: TranslateRequest):
     tokenizer.src_lang = req.source_lang
-    text_chunks = split_text_by_sentences(req.text, min_words=150, max_words=200, fallback_words=180)
-
+    text_chunks = split_by_words_and_dot(req.text, min_words=125, max_words=160, fallback_words=150)
     translated_chunks = []
-    total_gen_time = 0
-
-    with torch.inference_mode():
-        for chunk in text_chunks:
-            encoded = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=256).to(device)
-
-            start = time.time()
-            generated_tokens = model.generate(
-                **encoded,
-                forced_bos_token_id=tokenizer.get_lang_id(req.target_lang),
-                max_length=256,
-                num_beams=1,
-                no_repeat_ngram_size=3,
-            )
-            total_gen_time += time.time() - start
-
-            translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-            translated_chunks.append(translation)
-            print(f"Translated chunk: {translation}")
-        print("Total generating time:", total_gen_time)
+    for chunk in text_chunks:
+        encoded = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=256).to(device)
+        generated_tokens = model.generate(
+            **encoded,
+            forced_bos_token_id=tokenizer.get_lang_id(req.target_lang),
+            max_length=256,
+            num_beams=2,
+            no_repeat_ngram_size=3,
+        )
+        translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+        translated_chunks.append(translated_text)
     full_translation = "\n".join(translated_chunks)
-
     return {
         "source_text": req.text,
         "translated_text": full_translation,
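
For a quick sanity check of the new chunker, here is a minimal sketch that exercises split_by_words_and_dot with small, illustrative limits (the sample text and parameter values are mine, not from the commit). Note that importing from app.py also runs its top level, which builds the FastAPI app and loads the model.

# Minimal sketch, not part of the commit: small limits make the
# sentence-boundary behavior visible.
from app import split_by_words_and_dot  # side effect: loads the model

text = "One two three four. Five six seven.\nEight nine ten eleven twelve."
for chunk in split_by_words_and_dot(text, min_words=2, max_words=6, fallback_words=5):
    print(repr(chunk))
# Each window closes at the last '.'-terminated word it contains:
# 'One two three four.'
# 'Five six seven.'
# 'Eight nine ten eleven twelve.'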