from fastapi import FastAPI
from pydantic import BaseModel
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
import torch
import re
import time

# Limit CPU usage to a single thread
torch.set_num_threads(1)

app = FastAPI()


@app.on_event("startup")
def startup_event():
    """Load the model once at startup so requests don't pay the load cost."""
    global tokenizer, model, device
    model_path = "longvnhue1/facebook-m2m100_418M-fine_tuning"
    tokenizer = M2M100Tokenizer.from_pretrained(model_path)
    model = M2M100ForConditionalGeneration.from_pretrained(model_path)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print("Model loaded and ready.")
def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
    """Split text into chunks of roughly min_words..max_words words,
    preferring to break at the end of a sentence ('.', '?', '!')."""
    words = re.findall(r'\S+|\n', text)  # keep each '\n' as its own "word"
    chunks = []
    start = 0
    while start < len(words):
        end = min(start + max_words, len(words))
        # Find the last sentence-ending word between min_words and max_words
        dot_idx = -1
        for i in range(start + min_words, end):
            if words[i] != '\n' and words[i].endswith(('.', '?', '!')):
                dot_idx = i
        if dot_idx != -1:
            chunk_end = dot_idx + 1
        elif end - start > fallback_words:
            # No sentence boundary found; fall back to a fixed-size chunk
            chunk_end = start + fallback_words
        else:
            chunk_end = end
        chunk = ' '.join(words[start:chunk_end])
        # Re-attach newlines without the surrounding spaces added by join()
        chunk = chunk.replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
        chunks.append(chunk.strip())
        start = chunk_end
    return chunks


class TranslateRequest(BaseModel):
    text: str
    source_lang: str
    target_lang: str


@app.post("/translate")
def translate_text(req: TranslateRequest):
    # Note: mutating the shared tokenizer is not safe under concurrent requests
    tokenizer.src_lang = req.source_lang
    text_chunks = split_by_words_and_dot(req.text)
    translated_chunks = []
    timing_info = []
    global_start = time.perf_counter()
    for idx, chunk in enumerate(text_chunks):
        start_time = time.perf_counter()
        encoded = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=256).to(device)
        with torch.inference_mode():
            generated_tokens = model.generate(
                **encoded,
                forced_bos_token_id=tokenizer.get_lang_id(req.target_lang),
                max_length=256,          # keep output length moderate
                num_beams=2,             # lightweight beam search
                no_repeat_ngram_size=3,
            )
        print(f"forced_bos_token_id: {tokenizer.get_lang_id(req.target_lang)}")
        print(f"tokenizer.src_lang: {tokenizer.src_lang}")
        translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
        translated_chunks.append(translated_text)
        end_time = time.perf_counter()
        timing_info.append(f"Translated chunk {idx+1}/{len(text_chunks)} in {end_time - start_time:.3f} seconds")
    global_end = time.perf_counter()
    print(f"⚡️ Total translation time: {global_end - global_start:.3f} seconds")
    print(timing_info)
    return {
        "source_text": req.text,
        "translated_text": "\n".join(translated_chunks),
        "src_lang": req.source_lang,
        "tgt_lang": req.target_lang,
    }
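
# Minimal sketch for running the service directly; assumes `uvicorn` is
# installed and that this file is the module being executed.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)

# Example request against the running server (illustrative; "en" and "vi" are
# two of the language codes M2M100 accepts for source_lang/target_lang):
#
#   curl -X POST http://localhost:8000/translate \
#        -H "Content-Type: application/json" \
#        -d '{"text": "Hello world.", "source_lang": "en", "target_lang": "vi"}'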