lyfeyvutha committed (verified)
Commit eeb6308 · Parent: 8c54dab

Update README.md

Files changed (1): README.md (+5, −4)
README.md CHANGED
@@ -86,7 +86,7 @@ This model is intended for:
 
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig
 
-Configuration
+# Configuration
 CONFIG = {
     "model_name": "lyfeyvutha/nllb_350M_en_km_v10",
     "tokenizer_name": "facebook/nllb-200-distilled-600M",
@@ -95,7 +95,7 @@ CONFIG = {
     "max_length": 128
 }
 
-Load model and tokenizer
+# Load model and tokenizer
 model = AutoModelForSeq2SeqLM.from_pretrained(CONFIG["model_name"])
 tokenizer = AutoTokenizer.from_pretrained(
     CONFIG["tokenizer_name"],
@@ -103,14 +103,14 @@ src_lang=CONFIG["source_lang"],
     tgt_lang=CONFIG["target_lang"]
 )
 
-Set up generation configuration
+# Set up generation configuration
 khm_token_id = tokenizer.convert_tokens_to_ids(CONFIG["target_lang"])
 generation_config = GenerationConfig(
     max_length=CONFIG["max_length"],
     forced_bos_token_id=khm_token_id
 )
 
-Translate
+# Translate
 text = "Hello, how are you?"
 inputs = tokenizer(text, return_tensors="pt")
 outputs = model.generate(**inputs, generation_config=generation_config)
@@ -122,6 +122,7 @@ print(translation)
 ## Training Details
 
 ### Training Data
+
 - **Dataset size:** 316,110 English-Khmer sentence pairs
 - **Data source:** Synthetic data generated using DeepSeek translation API
 - **Preprocessing:** Tokenized using NLLB-200 tokenizer with max length 128
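For reference, here is the usage example as it reads after this commit, assembled into one runnable script. Two pieces are not visible in the diff because they fall between hunks: the `source_lang`/`target_lang` entries of `CONFIG` and the decode step before `print(translation)`. The values `eng_Latn` and `khm_Khmr` (the standard NLLB-200 codes for English and Khmer) and the `batch_decode` call are assumptions, flagged in comments below.

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GenerationConfig

# Configuration
CONFIG = {
    "model_name": "lyfeyvutha/nllb_350M_en_km_v10",
    "tokenizer_name": "facebook/nllb-200-distilled-600M",
    # Assumed: these two entries fall between diff hunks; eng_Latn and
    # khm_Khmr are the standard NLLB-200 codes for English and Khmer.
    "source_lang": "eng_Latn",
    "target_lang": "khm_Khmr",
    "max_length": 128
}

# Load model and tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(CONFIG["model_name"])
tokenizer = AutoTokenizer.from_pretrained(
    CONFIG["tokenizer_name"],
    src_lang=CONFIG["source_lang"],
    tgt_lang=CONFIG["target_lang"]
)

# Set up generation configuration: forcing the decoder to begin with the
# Khmer language token makes the model generate Khmer output
khm_token_id = tokenizer.convert_tokens_to_ids(CONFIG["target_lang"])
generation_config = GenerationConfig(
    max_length=CONFIG["max_length"],
    forced_bos_token_id=khm_token_id
)

# Translate
text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs, generation_config=generation_config)
# Assumed decode step: the diff truncates before it, but the last hunk
# header shows a print(translation) just before the Training Details section
translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
print(translation)
```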
 
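The **Preprocessing** bullet in the Training Data section is terse, and the training script itself is not part of this commit. As a minimal sketch of what "tokenized using NLLB-200 tokenizer with max length 128" typically means in the Hugging Face API, using the `text_target` convention for seq2seq labels (the `en`/`km` field names and the sample pair are illustrative assumptions):

```python
from transformers import AutoTokenizer

# Assumed setup: the README's tokenizer with English source and Khmer target
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    src_lang="eng_Latn",
    tgt_lang="khm_Khmr",
)

def preprocess(pair):
    """Tokenize one English-Khmer pair, truncating both sides to 128 tokens."""
    return tokenizer(
        pair["en"],            # assumed field names for a sentence pair
        text_target=pair["km"],
        max_length=128,
        truncation=True,
    )

# Illustrative example pair, not taken from the actual training set
example = {"en": "Hello, how are you?", "km": "សួស្តី តើអ្នកសុខសប្បាយទេ?"}
batch = preprocess(example)
print(batch["input_ids"][:10], batch["labels"][:10])
```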