Update app.py
app.py CHANGED
@@ -2,7 +2,7 @@ from datasets import load_dataset, Dataset
 from transformers import GPT2LMHeadModel, GPT2Config, Trainer, TrainingArguments, PreTrainedTokenizerFast
 from tokenizers import Tokenizer, models, trainers
 
-# Load
+# Load datasets
 wiki_dataset = load_dataset("LexiconShiftInnovations/SinhalaWikipediaArticles")
 gov_dataset = load_dataset("Virajtharinda/SinhalaGOV")
 text_dataset = load_dataset("rmihiranga/sinhala-text-fullfill-v2")
@@ -13,6 +13,7 @@ combined_dataset = Dataset.from_dict({"text": combined_texts})
 tokenizer = Tokenizer(models.BPE())
 trainer = trainers.BpeTrainer(vocab_size=30000, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
 tokenizer.train_from_iterator(combined_texts, trainer)
+tokenizer.save("sinhala_tokenizer.json")
 tokenizer = PreTrainedTokenizerFast(
     tokenizer_file="sinhala_tokenizer.json",
     unk_token="[UNK]", cls_token="[CLS]", sep_token="[SEP]", pad_token="[PAD]", mask_token="[MASK]"
@@ -31,14 +32,14 @@ model = GPT2LMHeadModel(config)
 training_args = TrainingArguments(
     output_dir="./sinhala_LM_V1",
     overwrite_output_dir=True,
-    num_train_epochs=1,
+    num_train_epochs=1,
     per_device_train_batch_size=4,
     save_steps=10_000,
     save_total_limit=2,
     logging_steps=500,
     fp16=True,
     push_to_hub=True,
-    hub_model_id="
+    hub_model_id="your-username/sinhala_LM_V1",  # Replace with your username
 )
 
 trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)
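Note: the hunks above reference combined_texts and combined_dataset, which are defined in lines of app.py that this diff does not show. Purely as a hypothetical sketch (the real split and column names are not visible here), that missing step could look like this:

# Hypothetical reconstruction: not part of this commit; the "train" split and "text" column are assumptions.
wiki_texts = [row["text"] for row in wiki_dataset["train"]]
gov_texts = [row["text"] for row in gov_dataset["train"]]
fill_texts = [row["text"] for row in text_dataset["train"]]
combined_texts = wiki_texts + gov_texts + fill_texts
combined_dataset = Dataset.from_dict({"text": combined_texts})  # this line does appear as hunk context above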
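With push_to_hub=True and a real hub_model_id, the Trainer uploads the model to that Hub repository as it saves checkpoints; the tokenizer is only written locally by the new tokenizer.save(...) line, since the Trainer here is not given a tokenizer to push. A minimal reuse sketch, assuming the placeholder repo id has been replaced and sinhala_tokenizer.json is available locally:

# Sketch only: assumes the Hub repo exists; the repo id below is the placeholder from the diff.
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="sinhala_tokenizer.json",
    unk_token="[UNK]", cls_token="[CLS]", sep_token="[SEP]", pad_token="[PAD]", mask_token="[MASK]",
)
model = GPT2LMHeadModel.from_pretrained("your-username/sinhala_LM_V1")

inputs = tokenizer("ශ්‍රී ලංකාව", return_tensors="pt")  # example Sinhala prompt
outputs = model.generate(**inputs, max_new_tokens=30, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))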