from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TextDataset,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
import logging
from transformers import logging as hf_logging
import os
from torch.quantization import quantize_dynamic
import torch
# Set up logging (adjust the level for the desired verbosity)
logging.basicConfig(level=logging.INFO)
hf_logging.set_verbosity_info()
hf_logging.enable_default_handler()
hf_logging.enable_explicit_format()
# Custom data collator: cast any floating-point tensors in the batch to bfloat16.
# Integer tensors (input_ids, attention_mask, labels) must stay int64, and current
# transformers versions dispatch collation through __call__, not collate_batch.
class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
    def __call__(self, features):
        batch = super().__call__(features)
        return {
            k: v.to(torch.bfloat16) if isinstance(v, torch.Tensor) and v.is_floating_point() else v
            for k, v in batch.items()
        }
# Use GPT-2 Medium, cast to bfloat16 to reduce memory use
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(torch.bfloat16)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
# TextDataset is deprecated but still functional; a datasets-based alternative is
# sketched below.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path="formatted_data_small.txt",
    block_size=256,  # increased block size for the larger model
)
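# --- Optional: datasets-based alternative to TextDataset ----------------------
# A minimal sketch, assuming the same formatted_data_small.txt file; kept
# commented out so it does not change what this script actually runs.
#
# from datasets import load_dataset
#
# raw = load_dataset("text", data_files={"train": "formatted_data_small.txt"})
#
# def tokenize(examples):
#     return tokenizer(examples["text"])
#
# def group_texts(examples, block_size=256):
#     # Concatenate all token ids and split them into fixed-size blocks.
#     concatenated = sum(examples["input_ids"], [])
#     total_length = (len(concatenated) // block_size) * block_size
#     return {
#         "input_ids": [
#             concatenated[i : i + block_size]
#             for i in range(0, total_length, block_size)
#         ]
#     }
#
# tokenized = raw["train"].map(tokenize, batched=True, remove_columns=["text"])
# lm_dataset = tokenized.map(group_texts, batched=True, remove_columns=tokenized.column_names)
# ------------------------------------------------------------------------------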
# Instantiate the custom data collator
data_collator = CustomDataCollatorForLanguageModeling(
tokenizer=tokenizer, mlm=False
)
training_args = TrainingArguments(
output_dir="./gpt2-medium-finetuned",
overwrite_output_dir=True,
num_train_epochs=4,
per_device_train_batch_size=4, # Adjusted for potential memory constraints
gradient_accumulation_steps=4, # Increased accumulation to handle larger model size
learning_rate=2e-4,
save_steps=1_000,
save_total_limit=3,
logging_dir='./logs',
logging_steps=50,
    fp16=False  # keep this off: the model is already cast to bfloat16 manually (bf16=True would be the built-in route)
)
trainer = Trainer(
model=model,
args=training_args,
data_collator=data_collator,
train_dataset=train_dataset,
)
trainer.train()
# Save model and tokenizer
model_path = "./saved_gpt2_medium_nice_model_directory"
os.makedirs(model_path, exist_ok=True)
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)
# Dynamically quantize the Linear layers to int8 for smaller, CPU-friendly inference.
# quantize_dynamic expects fp32 weights, so move the bfloat16 model back to fp32 on CPU first.
model = model.float().cpu()
model.eval()  # ensure the model is in evaluation mode
quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
quantized_model_path = "./quantized_model_directory"
os.makedirs(quantized_model_path, exist_ok=True)
torch.save(quantized_model.state_dict(), os.path.join(quantized_model_path, 'quantized_nice_medium_model.pth'))
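# --- Optional sanity check (added sketch, not part of the original pipeline) ---
# Reload the saved fine-tuned model and generate a short sample, then show how the
# quantized checkpoint can be restored: quantize_dynamic must be re-applied to a
# freshly loaded model before load_state_dict will accept the quantized state dict.
# The prompt string below is only an illustrative placeholder.
reloaded = GPT2LMHeadModel.from_pretrained(model_path)
reloaded.eval()
prompt = "The experiment showed that"
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
    output_ids = reloaded.generate(
        **inputs,
        max_new_tokens=40,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; reuse EOS
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

# Restore the quantized model from its saved state dict.
requantized = quantize_dynamic(
    GPT2LMHeadModel.from_pretrained(model_path), {torch.nn.Linear}, dtype=torch.qint8
)
requantized.load_state_dict(
    torch.load(os.path.join(quantized_model_path, 'quantized_nice_medium_model.pth'))
)
# ------------------------------------------------------------------------------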
# from transformers import BertForMaskedLM, BertTokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
# import logging
# from transformers import logging as hf_logging
# import os
# from torch.quantization import quantize_dynamic
# import torch
#
# # Setup logging
# logging.basicConfig(level=logging.INFO)
# hf_logging.set_verbosity_info()
# hf_logging.enable_default_handler()
# hf_logging.enable_explicit_format()
#
# # Custom data collator for masked LM: cast floating-point tensors to bfloat16,
# # leaving integer tensors untouched (same fix as in the active GPT-2 script above).
# class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
#     def __call__(self, features):
#         batch = super().__call__(features)
#         return {
#             k: v.to(torch.bfloat16) if isinstance(v, torch.Tensor) and v.is_floating_point() else v
#             for k, v in batch.items()
#         }
#
# # Load BioBERT
# model = BertForMaskedLM.from_pretrained('dmis-lab/biobert-v1.1').to(torch.bfloat16)
# tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
#
# # Prepare the dataset
# train_dataset = TextDataset(
# tokenizer=tokenizer,
# file_path="papers_data_mountain.txt",
# block_size=512) # Adjust block_size if necessary
#
# data_collator = CustomDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)
#
# training_args = TrainingArguments(
# output_dir="./biobert-finetuned",
# overwrite_output_dir=True,
# num_train_epochs=4,
# per_device_train_batch_size=8,
# gradient_accumulation_steps=2,
# learning_rate=2e-4,
# save_steps=1_000,
# save_total_limit=3,
# logging_dir='./logs',
# logging_steps=50,
# fp16=False # Ensure this is false since we're using bfloat16 manually
# )
#
# trainer = Trainer(
# model=model,
# args=training_args,
# data_collator=data_collator,
# train_dataset=train_dataset,
# )
#
# trainer.train()
#
# # Save model and tokenizer
# model_path = "./saved_mountain_model_directory"
# if not os.path.exists(model_path):
# os.makedirs(model_path)
# model.save_pretrained(model_path)
# tokenizer.save_pretrained(model_path)
#
# # Quantize the model
# model.eval() # Ensure the model is in evaluation mode
# quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
# quantized_model_path = "./quantized_model_directory"
# if not os.path.exists(quantized_model_path):
# os.makedirs(quantized_model_path)
# torch.save(quantized_model.state_dict(), os.path.join(quantized_model_path, 'quantized_mountain_model.pth'))
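# # Usage sketch for the BioBERT variant above (hypothetical example; assumes the
# # commented-out fine-tuning run was executed and saved to model_path):
# # from transformers import pipeline
# # fill_mask = pipeline("fill-mask", model=model_path, tokenizer=model_path)
# # print(fill_mask("The patient was treated with [MASK] for hypertension."))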
# from transformers import Phi3Model, Phi3Config, AutoTokenizer, Trainer, TrainingArguments
# import logging
# from transformers import logging as hf_logging
# import os
# import torch
# from torch.utils.data import Dataset
#
#
# class TextDataset(Dataset):
# def __init__(self, tokenizer, file_path, block_size=512):
# self.tokenizer = tokenizer
# self.block_size = block_size
# self.input_ids = []
#
# # Read and tokenize the file content in chunks
# with open(file_path, 'r', encoding='utf-8') as f:
# while True:
# text = f.read(1024 * 1024) # Read approximately 1MB of text at a time
# if not text:
# break
# tokens = tokenizer(text, add_special_tokens=True, truncation=True, max_length=block_size,
# return_tensors="pt")
# self.input_ids.extend(tokens.input_ids.tolist())
#
# def __len__(self):
# # Ensure we return a non-negative value
# return max(0, len(self.input_ids) - self.block_size + 1)
#
# def __getitem__(self, idx):
# # Ensure the index does not exceed the bounds and forms a proper sequence
# input_ids = self.input_ids[idx:idx + self.block_size]
# return {"input_ids": torch.tensor(input_ids, dtype=torch.long)}
#
#
#
#
# # Setup logging
# logging.basicConfig(level=logging.INFO)
# hf_logging.set_verbosity_info()
# hf_logging.enable_default_handler()
# hf_logging.enable_explicit_format()
#
# # Build a Phi-3 backbone from its config. Note: Phi3Model(configuration) creates
# # randomly initialized weights and has no LM head, so it cannot compute a training
# # loss as written; see the sketch after this block for the pretrained causal-LM route.
# configuration = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
# model = Phi3Model(configuration).to(torch.bfloat16)
#
# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
#
#
# # Initialize the dataset
# train_dataset = TextDataset(tokenizer, "papers_data_mountain.txt", block_size=512)
#
# # Custom data collator function (simplified for generality)
# def custom_collate_fn(examples):
# batch = tokenizer.pad(
# examples,
# return_tensors='pt',
# padding=True,
# max_length=512
# )
# batch = {k: v.to(torch.bfloat16) if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
# return batch
#
#
# training_args = TrainingArguments(
# output_dir="./phi3-finetuned",
# overwrite_output_dir=True,
# num_train_epochs=4,
# per_device_train_batch_size=8,
# gradient_accumulation_steps=2,
# learning_rate=2e-4,
# save_steps=1_000,
# save_total_limit=3,
# logging_dir='./logs',
# logging_steps=50,
# fp16=False # bfloat16 usage is manual
# )
#
# trainer = Trainer(
# model=model,
# args=training_args,
# data_collator=custom_collate_fn,
# train_dataset=train_dataset,
# )
#
# trainer.train()
#
# # Save model and tokenizer
# model_path = "./saved_phi3_model_directory"
# if not os.path.exists(model_path):
# os.makedirs(model_path)
# model.save_pretrained(model_path)
# tokenizer.save_pretrained(model_path)
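# # --- Sketch: pretrained causal-LM route for the Phi-3 variant above ----------
# # A minimal sketch, assuming the goal was to fine-tune the released checkpoint
# # rather than train a randomly initialized backbone. AutoModelForCausalLM adds
# # the LM head; the collator and labels then follow the same pattern as the
# # GPT-2 script at the top of this file.
# # from transformers import AutoModelForCausalLM
# # model = AutoModelForCausalLM.from_pretrained(
# #     "microsoft/Phi-3-mini-4k-instruct",
# #     torch_dtype=torch.bfloat16,
# #     trust_remote_code=True,  # Phi-3 initially shipped with remote code
# # )
# # For a causal-LM loss, each batch also needs labels (e.g. labels = input_ids),
# # which DataCollatorForLanguageModeling(tokenizer, mlm=False) produces automatically.
# # ------------------------------------------------------------------------------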