longvnhue1 committed
Commit fdd52fa · 1 Parent(s): 280c743

Add model with Git LFS

Files changed (4)
  1. .gitattributes +1 -0
  2. app.py +86 -0
  3. dockerfile +14 -0
  4. requirements.txt +4 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ model/** filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,86 @@
+ from fastapi import FastAPI
+ from pydantic import BaseModel
+ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+ import torch
+ import re
+
+ app = FastAPI()
+
+ def split_by_words_and_dot(text, min_words=125, max_words=160, fallback_words=150):
+     words = re.findall(r'\S+|\n', text)  # keep each \n as its own "word"
+     chunks = []
+     start = 0
+     while start < len(words):
+         end = min(start + max_words, len(words))
+         # Look for a sentence-ending period between min_words and max_words
+         dot_idx = -1
+         for i in range(start + min_words, min(start + max_words, len(words))):
+             if words[i].endswith('.'):
+                 dot_idx = i  # keep the last period in the window, maximizing chunk length
+         if dot_idx != -1:
+             chunk_end = dot_idx + 1
+         elif end - start > fallback_words:
+             chunk_end = start + fallback_words
+         else:
+             chunk_end = end
+         chunk = ' '.join(words[start:chunk_end]).replace(' \n ', '\n').replace(' \n', '\n').replace('\n ', '\n')
+         chunks.append(chunk.strip())
+         start = chunk_end
+     return chunks
+
+ # Load model
+ model_path = "./model/facebook-m2m100_418M-fine_tuning"
+ tokenizer = M2M100Tokenizer.from_pretrained(model_path)
+ model = M2M100ForConditionalGeneration.from_pretrained(model_path)
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ model.to(device)
+
+ class TranslateRequest(BaseModel):
+     text: str
+     source_lang: str
+     target_lang: str
+
+ # @app.post("/translate")
+ # def translate_text(req: TranslateRequest):
+ #     tokenizer.src_lang = req.source_lang
+ #     encoded = tokenizer(req.text, return_tensors="pt", truncation=True, max_length=512).to(device)
+ #     generated_tokens = model.generate(
+ #         **encoded,
+ #         forced_bos_token_id=tokenizer.get_lang_id(req.target_lang),
+ #         max_length=512,  # increase for longer passages, but don't set it too high
+ #         num_beams=2,  # fewer beams for faster decoding
+ #         no_repeat_ngram_size=3,
+ #         early_stopping=True
+ #     )
+ #     translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+ #     return {
+ #         "source_text": req.text,
+ #         "translated_text": translated_text,
+ #         "src_lang": req.source_lang,
+ #         "tgt_lang": req.target_lang
+ #     }
+
+
+ @app.post("/translate")
+ def translate_text(req: TranslateRequest):
+     tokenizer.src_lang = req.source_lang
+     text_chunks = split_by_words_and_dot(req.text, min_words=125, max_words=160, fallback_words=150)
+     translated_chunks = []
+     for chunk in text_chunks:
+         encoded = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=256).to(device)
+         generated_tokens = model.generate(
+             **encoded,
+             forced_bos_token_id=tokenizer.get_lang_id(req.target_lang),
+             max_length=256,
+             num_beams=2,
+             no_repeat_ngram_size=3,
+         )
+         translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
+         translated_chunks.append(translated_text)
+     full_translation = "\n".join(translated_chunks)
+     return {
+         "source_text": req.text,
+         "translated_text": full_translation,
+         "src_lang": req.source_lang,
+         "tgt_lang": req.target_lang
+     }
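
For reference, a minimal client sketch (not part of this commit) for exercising the /translate endpoint. It assumes the app is running locally on port 7860, as configured in the dockerfile below, and that the requests package is installed; the sample text and language codes are illustrative.

import requests

# Hypothetical smoke test against a locally running instance (localhost:7860 assumed).
payload = {
    "text": "Hello world. This is a test.",
    "source_lang": "en",  # M2M-100 language codes, e.g. "en", "vi"
    "target_lang": "vi",
}
resp = requests.post("http://localhost:7860/translate", json=payload, timeout=120)
resp.raise_for_status()  # fail loudly on non-2xx responses
print(resp.json()["translated_text"])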
dockerfile ADDED
@@ -0,0 +1,14 @@
+ FROM python:3.10-slim-bullseye
+
+ WORKDIR /app
+
+ # Install g++ so transformers can build tokenizers when needed
+ RUN apt-get update && apt-get install -y g++ && apt-get upgrade -y && apt-get clean && rm -rf /var/lib/apt/lists/*
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ COPY . .
+
+ # FastAPI is served with Uvicorn by default
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
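
To try the image locally, something like docker build -t m2m100-translate . followed by docker run -p 7860:7860 m2m100-translate should expose the API on port 7860; the image name is illustrative and these commands are not part of the commit.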
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ fastapi
+ uvicorn
+ torch
+ transformers