OminduAnjana committed
Commit 07c9f24 · verified · 1 Parent(s): c071132

Update app.py

Files changed (1)
  1. app.py +4 -3
app.py CHANGED
@@ -2,7 +2,7 @@ from datasets import load_dataset, Dataset
 from transformers import GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments, PreTrainedTokenizerFast
 from tokenizers import Tokenizer, models, trainers
 
-# Load and combine datasets
+# Load datasets
 wiki_dataset = load_dataset("LexiconShiftInnovations/SinhalaWikipediaArticles")
 gov_dataset = load_dataset("Virajtharinda/SinhalaGOV")
 text_dataset = load_dataset("rmihiranga/sinhala-text-fullfill-v2")
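
Note: the next hunk's header shows that the three corpora end up as combined_texts inside a Dataset. A minimal sketch of that elided step, assuming each dataset stores its raw text in a "text" column of a "train" split (neither detail is visible in this diff):

    # Sketch only: split and column names are assumptions, not shown in the commit.
    combined_texts = (
        wiki_dataset["train"]["text"]
        + gov_dataset["train"]["text"]
        + text_dataset["train"]["text"]
    )
    combined_dataset = Dataset.from_dict({"text": combined_texts})
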
@@ -13,6 +13,7 @@ combined_dataset = Dataset.from_dict({"text": combined_texts})
 tokenizer = Tokenizer(models.BPE())
 trainer = trainers.BpeTrainer(vocab_size=30000, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
 tokenizer.train_from_iterator(combined_texts, trainer)
+tokenizer.save("sinhala_tokenizer.json")
 tokenizer = PreTrainedTokenizerFast(
     tokenizer_file="sinhala_tokenizer.json",
     unk_token="[UNK]", cls_token="[CLS]", sep_token="[SEP]", pad_token="[PAD]", mask_token="[MASK]"
@@ -31,14 +32,14 @@ model = GPT2LMHeadModel(config)
 training_args = TrainingArguments(
     output_dir="./sinhala_LM_V1",
     overwrite_output_dir=True,
-    num_train_epochs=1,  # Reduced for demo; increase as needed
+    num_train_epochs=1,
     per_device_train_batch_size=4,
     save_steps=10_000,
     save_total_limit=2,
     logging_steps=500,
     fp16=True,
     push_to_hub=True,
-    hub_model_id="OminduAnjana/sinhala_LM_V1",  # Replace with your username
+    hub_model_id="your-username/sinhala_LM_V1",  # Replace with your username
 )
 
 trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)
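
Note: with push_to_hub=True, training will try to upload checkpoints to hub_model_id, which fails without an authenticated session and a real namespace in place of the "your-username" placeholder. A minimal sketch of the usual login step (the token string is a placeholder):

    # Authenticate once before training; the token value is a placeholder.
    from huggingface_hub import login
    login(token="hf_...")  # or run `huggingface-cli login` in the shell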
 
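Note: the diff stops at the Trainer construction, so the training call itself is outside the changed lines. Presumably app.py continues with something like the following (an assumption; not shown in this commit):

    trainer.train()        # runs the single configured epoch
    trainer.push_to_hub()  # uploads the final model to hub_model_id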