Update README.md
README.md CHANGED

```diff
@@ -86,7 +86,7 @@ This model is intended for:
 
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
 
-Configuration
+# Configuration
 CONFIG = {
     "model_name": "lyfeyvutha/nllb_350M_en_km_v10",
     "tokenizer_name": "facebook/nllb-200-distilled-600M",
@@ -95,7 +95,7 @@ CONFIG = {
     "max_length": 128
 }
 
-Load model and tokenizer
+# Load model and tokenizer
 model = AutoModelForSeq2SeqLM.from_pretrained(CONFIG["model_name"])
 tokenizer = AutoTokenizer.from_pretrained(
     CONFIG["tokenizer_name"],
@@ -103,14 +103,14 @@ src_lang=CONFIG["source_lang"],
     tgt_lang=CONFIG["target_lang"]
 )
 
-Set up generation configuration
+# Set up generation configuration
 khm_token_id = tokenizer.convert_tokens_to_ids(CONFIG["target_lang"])
 generation_config = GenerationConfig(
     max_length=CONFIG["max_length"],
     forced_bos_token_id=khm_token_id
 )
 
-Translate
+# Translate
 text = "Hello, how are you?"
 inputs = tokenizer(text, return_tensors="pt")
 outputs = model.generate(**inputs, generation_config=generation_config)
@@ -122,6 +122,7 @@ print(translation)
 
 ## Training Details
 
 ### Training Data
+
 - **Dataset size:** 316,110 English-Khmer sentence pairs
 - **Data source:** Synthetic data generated using DeepSeek translation API
 - **Preprocessing:** Tokenized using NLLB-200 tokenizer with max length 128
```
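For reference, here is a minimal sketch of how the snippet reads after this change, assembled into one runnable block. The hunks above elide CONFIG's `source_lang`/`target_lang` values and the decoding line that defines `translation`, so those parts are assumptions here: the standard NLLB-200 language codes `eng_Latn` and `khm_Khmr`, and a `batch_decode` call.

```python
# Minimal end-to-end sketch of the README snippet after this change.
# Assumptions (elided in the diff): the source_lang/target_lang values
# and the decode step that produces `translation`.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig

CONFIG = {
    "model_name": "lyfeyvutha/nllb_350M_en_km_v10",
    "tokenizer_name": "facebook/nllb-200-distilled-600M",
    "source_lang": "eng_Latn",  # assumed: standard NLLB-200 code for English
    "target_lang": "khm_Khmr",  # assumed: standard NLLB-200 code for Khmer
    "max_length": 128,
}

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(CONFIG["model_name"])
tokenizer = AutoTokenizer.from_pretrained(
    CONFIG["tokenizer_name"],
    src_lang=CONFIG["source_lang"],
    tgt_lang=CONFIG["target_lang"],
)

# Force generation to start with the Khmer language token
khm_token_id = tokenizer.convert_tokens_to_ids(CONFIG["target_lang"])
generation_config = GenerationConfig(
    max_length=CONFIG["max_length"],
    forced_bos_token_id=khm_token_id,
)

# Translate
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, generation_config=generation_config)

# Assumed decode step (the diff elides the line defining `translation`)
translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(translation)
```

Setting `forced_bos_token_id` to the target-language token is how NLLB-200 checkpoints select the output language; without it, generation may start in the wrong language.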
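On the training side, the preprocessing bullet is terse. Below is a hypothetical sketch of what "tokenized using the NLLB-200 tokenizer with max length 128" could look like for one sentence pair; the field names, example Khmer text, and padding strategy are assumptions, not taken from the model card.

```python
# Hypothetical sketch of the stated preprocessing: tokenize an English-Khmer
# pair with the NLLB-200 tokenizer, truncating to 128 tokens. The field
# names ("en", "km"), example text, and padding choice are assumptions.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang="eng_Latn",  # assumed NLLB-200 code
    tgt_lang="khm_Khmr",  # assumed NLLB-200 code
)

pair = {"en": "Hello, how are you?", "km": "សួស្តី តើអ្នកសុខសប្បាយទេ?"}

batch = tokenizer(
    pair["en"],
    text_target=pair["km"],  # target side is tokenized under tgt_lang
    max_length=128,
    truncation=True,
    padding="max_length",
)
# batch["input_ids"] and batch["labels"] are now fixed-length 128 sequences
```

Truncating both sides to 128 tokens matches the `max_length` the snippet above uses at inference time.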