In [12]:
# import pandas as pd
# import torch
# from transformers import T5Tokenizer
# import pandas as pd
# from torch.utils.data import DataLoader, TensorDataset
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 
# import numpy as np
# from transformers import T5Tokenizer


In [None]:
# df = pd.read_parquet("press_releases_all_with_CAP_issues.parquet")

In [None]:
# df = df[['title', 'text']]

In [None]:
# df = df.head(10000)

In [None]:
# df['title'].fillna('', inplace=True)

In [None]:
# df['title'] = df['title'].replace('', 'No Title') 

In [None]:
# print(df.isna().sum())

In [None]:
# df.to_parquet('press_releases_consolidated.parquet', engine='pyarrow')

In [1]:
import pandas as pd
df = pd.read_parquet('press_releases_consolidated.parquet')

In [3]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch
from transformers import T5Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = T5Tokenizer.from_pretrained('t5-small')

# modify accordingly
MAX_TARGET_LENGTH = 128
MAX_INPUT_LENGTH = 512

class SummarizationDataset(Dataset):
 def __init__(self, dataframe, tokenizer, max_input_length=MAX_INPUT_LENGTH, max_target_length=MAX_TARGET_LENGTH):
 self.data = dataframe
 self.tokenizer = tokenizer
 self.max_input_length = max_input_length
 self.max_target_length = max_target_length

 def __len__(self):
 return len(self.data)
 
 def __getitem__(self, idx):
 text = self.data.iloc[idx]['text']
 title = self.data.iloc[idx]['title']
 
 
 # tokenize
 text_to_token = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_input_length, return_tensors='pt')
 title_to_token = self.tokenizer(title, padding='max_length', truncation=True, max_length=self.max_target_length, return_tensors='pt')
 
 
 input_ids = text_to_token['input_ids'].squeeze(0) 
 attention_mask = text_to_token['attention_mask'].squeeze(0) 
 labels = title_to_token['input_ids'].squeeze(0) 
 labels[labels == self.tokenizer.pad_token_id] = -100 
 
 return {
 'input_ids': input_ids,
 'attention_mask': attention_mask,
 'labels': labels 
 }

dataset = SummarizationDataset(df, tokenizer)


train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)



You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
import torch
from transformers import T5ForConditionalGeneration
from torch.optim import Adam
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
import evaluate

model = T5ForConditionalGeneration.from_pretrained('t5-small')
optimizer = Adam(model.parameters(), lr=5e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

rouge = evaluate.load("rouge")

def train():
 model.train()
 total_loss = 0
 for batch in train_dataloader:
 input_ids = batch['input_ids'].to(device)
 attention_mask = batch['attention_mask'].to(device)
 labels = batch['labels'].to(device)

 outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
 loss = outputs.loss
 total_loss += loss.item()

 loss.backward()
 optimizer.step()
 optimizer.zero_grad()

 return total_loss / len(train_dataloader)

def evaluate():
 model.eval()
 total_loss = 0
 all_preds = []
 all_labels = []
 
 with torch.no_grad():
 for batch in val_dataloader:
 input_ids = batch['input_ids'].to(device)
 attention_mask = batch['attention_mask'].to(device)
 labels = batch['labels'].to(device)

 outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
 total_loss += outputs.loss.item()
 
 try:
 summary_ids = model.generate(
 input_ids=input_ids,
 attention_mask=attention_mask,
 max_length=MAX_TARGET_LENGTH,
 num_beams=8,
 early_stopping=True
 )
 
 summary_ids = summary_ids[0] if len(summary_ids) > 0 else torch.tensor([tokenizer.pad_token_id])
 
 preds = tokenizer.decode(summary_ids.cpu(), skip_special_tokens=True, clean_up_tokenization_spaces=True)
 labels_decoded = tokenizer.decode(
 labels[0].masked_select(labels[0] != -100).cpu(), 
 skip_special_tokens=True,
 clean_up_tokenization_spaces=True
 )
 
 all_preds.append(preds if preds else " ")
 all_labels.append(labels_decoded if labels_decoded else " ")
 
 except Exception as e:
 print(f"Error during generation: {e}")
 all_preds.append(" ")
 all_labels.append(" ")
 continue

 all_preds = [p if p.strip() else " " for p in all_preds]
 all_labels = [l if l.strip() else " " for l in all_labels]
 
 rouge_result = rouge.compute(predictions=all_preds, references=all_labels)
 
 return total_loss / len(val_dataloader), rouge_result


epochs = 15
best_val_loss = float('inf')

for epoch in range(epochs):
 print(f"Epoch {epoch + 1}/{epochs}")

 train_loss = train()
 print(f"Training Loss: {train_loss:.4f}")

 val_loss, rouge_result = evaluate()
 print(f"Validation Loss: {val_loss:.4f}")
 print(f"ROUGE Scores: {rouge_result}")

 if val_loss < best_val_loss:
 best_val_loss = val_loss
 model.save_pretrained(f"best_model_epoch_{epoch + 1}")
 tokenizer.save_pretrained(f"best_model_epoch_{epoch + 1}")


Epoch 1/15
Training Loss: 2.3327
Validation Loss: 1.9963
ROUGE Scores: {'rouge1': 0.21808722374319384, 'rouge2': 0.1182736024791169, 'rougeL': 0.19976099496233557, 'rougeLsum': 0.19920689338385827}
Epoch 2/15
Training Loss: 2.1164
Validation Loss: 1.9190
ROUGE Scores: {'rouge1': 0.24314444230564494, 'rouge2': 0.14001878402499457, 'rougeL': 0.2237854024840728, 'rougeLsum': 0.22246462572576908}
Epoch 3/15
Training Loss: 2.0179
Validation Loss: 1.8727
ROUGE Scores: {'rouge1': 0.23564530968156083, 'rouge2': 0.13669895563342216, 'rougeL': 0.21725589526977998, 'rougeLsum': 0.2151015219135301}
Epoch 4/15
Training Loss: 1.9257
Validation Loss: 1.8389
ROUGE Scores: {'rouge1': 0.23937899093803855, 'rouge2': 0.13888041555479988, 'rougeL': 0.21854222551451663, 'rougeLsum': 0.21721511685962552}
Epoch 5/15
Training Loss: 1.8781
Validation Loss: 1.8102
ROUGE Scores: {'rouge1': 0.2412030325505815, 'rouge2': 0.1373245465699872, 'rougeL': 0.22158876960762192, 'rougeLsum': 0.21964406824128718}
Epoch 6/15

In [15]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_id = "tdickson17/Text_Summarization"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)

def generate_summary(
 text,
 model=model,
 tokenizer=tok,
 device=device,
 max_new_tokens=128,
 min_new_tokens=20,
 num_beams=4
):
 # T5 often uses a task prefix; keep if your model expects it
 if not text.lower().startswith("summarize:"):
 text = "summarize: " + text

 inputs = tokenizer(text, return_tensors="pt", truncation=True).to(device)

 with torch.no_grad():
 out_ids = model.generate(
 **inputs,
 max_new_tokens=max_new_tokens, 
 min_new_tokens=min_new_tokens,
 num_beams=num_beams,
 no_repeat_ngram_size=3,
 early_stopping=True
 )

 return tokenizer.decode(out_ids[0], skip_special_tokens=True)

input_text = (
 "At Susquehanna, we approach quantitative finance with a deep commitment to scientific rigor and innovation. Our research leverages vast and diverse datasets, applying cutting-edge machine learning to uncover actionable insights and driving data-informed decisions from predictive modeling to strategic execution. Today, Susquehanna has over 3,000 employees in 17+ global locations. While we have grown in size and expanded our reach, our collaborative culture and love for gaming remains."
)
print("Summary:", generate_summary(input_text))


Summary: quantitative finance is driven by scientific rigor and innovation. Susquehanna has over 3,000 employees.
