from fastapi import FastAPI
from pydantic import BaseModel
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import torch
import re
import time
# Limit CPU threads (keeps inference latency predictable on small/shared hosts)
torch.set_num_threads(1)
app = FastAPI()
@app.on_event("startup")
def startup_event():
global tokenizer, model, device
model_path = "longvnhue1/facebook-m2m100_418M-fine_tuning"
tokenizer = M2M100Tokenizer.from_pretrained(model_path)
model = M2M100ForConditionalGeneration.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Model loaded and ready.")
def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
    """Split text into chunks of roughly min_words..max_words words,
    preferring to cut right after sentence-ending punctuation."""
    words = re.findall(r'\S+|\n', text)  # keep '\n' as its own "word"
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + max_words, len(words))
        # Find the last sentence-ending token between min_words and max_words
        dot_idx = -1
        for i in range(start + min_words, end):
            if words[i] in ('.', '?', '!') or words[i].endswith(('.', '?', '!')):
                dot_idx = i
        if dot_idx != -1:
            chunk_end = dot_idx + 1  # include the sentence-ending word
        elif end - start > fallback_words:
            chunk_end = start + fallback_words  # no sentence break found: hard cut
        else:
            chunk_end = end
        chunk = ' '.join(words[start:chunk_end])
        chunk = chunk.replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
        chunks.append(chunk.strip())
        start = chunk_end
    return chunks
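# Quick sanity check of the chunker (a sketch; the small word limits are chosen
# purely for illustration, not the production defaults above):
#   >>> split_by_words_and_dot("one two three. four five.", min_words=2, max_words=4)
#   ['one two three.', 'four five.']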
class TranslateRequest(BaseModel):
    text: str
    source_lang: str
    target_lang: str
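# Example request body (M2M100 expects short ISO-639-1 style codes such as "en" or "vi"):
#   {"text": "Hello world.", "source_lang": "en", "target_lang": "vi"}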
@app.post("/translate")
def translate_text(req: TranslateRequest):
tokenizer.src_lang = req.source_lang
text_chunks = split_by_words_and_dot(req.text)
translated_chunks = []
timing_info = []
global_start = time.perf_counter()
for idx, chunk in enumerate(text_chunks):
start_time = time.perf_counter()
encoded = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=256).to(device)
with torch.inference_mode():
generated_tokens = model.generate(
**encoded,
forced_bos_token_id=tokenizer.get_lang_id(req.target_lang),
max_length=256, # <-- Giữ mức vừa phải
num_beams=2, # <-- Đơn giản beam search
no_repeat_ngram_size=3,
)
print(f"forced_bos_token_id: {tokenizer.lang_code_to_id[req.target_lang]}")
print(f"tokenizer.src_lang: {tokenizer.src_lang}")
translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
translated_chunks.append(translated_text)
end_time = time.perf_counter()
timing_info.append(f"Translated chunk {idx+1}/{len(text_chunks)} in {end_time - start_time:.3f} seconds")
global_end = time.perf_counter()
print(f"⚡️ Total translation time: {global_end - global_start:.3f} seconds")
print(timing_info)
return {
"source_text": req.text,
"translated_text": "\n".join(translated_chunks),
"src_lang": req.source_lang,
"tgt_lang": req.target_lang,
}
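# Local dev entry point (a sketch: the port is an assumption — Hugging Face
# Spaces conventionally serves on 7860 and usually launches the server for you,
# so this block only matters when running the file directly):
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
# Example call once running (illustrative):
#   curl -X POST http://localhost:7860/translate \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Hello world.", "source_lang": "en", "target_lang": "vi"}'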