Update app.py
app.py CHANGED

@@ -2,7 +2,7 @@ import os
 import platform
 from dotenv import load_dotenv
 import torch
-from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
+from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
 from datasets import load_dataset, concatenate_datasets
 from huggingface_hub import login
 import time

@@ -44,7 +44,7 @@ async def root():
 def load_and_train():
     model_name = 'gpt2'
     tokenizer = GPT2Tokenizer.from_pretrained(model_name)
-    model = GPT2LMHeadModel.from_pretrained(model_name)
+    model = GPT2LMHeadModel.from_pretrained(model_name, return_dict=True)
 
     # Assign the pad_token to the eos_token
     tokenizer.pad_token = tokenizer.eos_token

@@ -128,13 +128,14 @@ def load_and_train():
 
     # Tokenization function based on the 'text' field
     def tokenize_function(examples):
-        return tokenizer(
+        tokenized = tokenizer(
             examples['text'],
             truncation=True,
             padding='max_length',
             max_length=512
-            # clean_up_tokenization_spaces=True  # Removed because it is not recognized
         )
+        tokenized['labels'] = tokenized['input_ids'].copy()
+        return tokenized
 
     # Tokenize the dataset
     tokenized_dataset = combined_dataset.map(

@@ -142,6 +143,12 @@ def load_and_train():
         batched=True
     )
 
+    # Set up the data collator
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer,
+        mlm=False  # for causal language modeling
+    )
+
     # Set up training arguments
     training_args = TrainingArguments(
         output_dir=os.path.join(cache_dir, 'results'),  # Store temporarily in RAM

@@ -164,6 +171,7 @@ def load_and_train():
         model=model,
         args=training_args,
         train_dataset=tokenized_dataset,
+        data_collator=data_collator,
     )
 
     while True:
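Taken together, these changes set up a standard causal language-modeling fine-tune: the tokenizer output now carries explicit labels, and a DataCollatorForLanguageModeling with mlm=False batches the examples for next-token prediction. The following is a minimal, self-contained sketch of how the pieces fit together; the dataset name and the TrainingArguments values are placeholders, not the configuration used by app.py.

from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset

model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default
model = GPT2LMHeadModel.from_pretrained(model_name, return_dict=True)

def tokenize_function(examples):
    tokenized = tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=512
    )
    # For causal LM the targets are the inputs themselves; the model shifts them internally.
    tokenized['labels'] = tokenized['input_ids'].copy()
    return tokenized

# Placeholder dataset; app.py builds combined_dataset from its own sources.
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)

# mlm=False selects causal language modeling: the collator pads each batch and
# masks padding positions in the labels so they are ignored by the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir='results',            # placeholder output path
    per_device_train_batch_size=2,   # placeholder values
    num_train_epochs=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)
trainer.train()

Note that with mlm=False the collator also derives labels from input_ids on its own (replacing padding with -100), so the explicit labels copy in tokenize_function mainly keeps the tokenized dataset usable even when no collator is passed to the Trainer.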