{ "cells": [ { "cell_type": "code", "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2025-08-10T15:22:21.391963Z", "start_time": "2025-08-10T15:22:21.389220Z" } }, "source": [ "# import pandas as pd\n", "# import torch\n", "# from transformers import T5Tokenizer\n", "# import pandas as pd\n", "# from torch.utils.data import DataLoader, TensorDataset\n", "# device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "# \n", "# import numpy as np\n", "# from transformers import T5Tokenizer\n" ], "outputs": [], "execution_count": 12 }, { "metadata": {}, "cell_type": "markdown", "source": "", "id": "18d7838a0a2b47f0" }, { "metadata": { "ExecuteTime": { "start_time": "2025-08-10T15:22:21.416790Z" } }, "cell_type": "code", "source": "# df = pd.read_parquet(\"press_releases_all_with_CAP_issues.parquet\")", "id": "3318aa3e574f90cf", "outputs": [], "execution_count": null }, { "metadata": {}, "cell_type": "code", "source": "# df = df[['title', 'text']]", "id": "f3816d3ecce5a8e0", "outputs": [], "execution_count": null }, { "metadata": {}, "cell_type": "code", "source": "# df = df.head(10000)", "id": "2cc68e87814bc931", "outputs": [], "execution_count": null }, { "metadata": {}, "cell_type": "code", "source": "# df['title'].fillna('', inplace=True)", "id": "8f3c1efe99f9dcdf", "outputs": [], "execution_count": null }, { "metadata": {}, "cell_type": "code", "source": "# df['title'] = df['title'].replace('', 'No Title') ", "id": "3d4322138b08d0f5", "outputs": [], "execution_count": null }, { "metadata": {}, "cell_type": "code", "source": "# print(df.isna().sum())", "id": "393b3b45b339c991", "outputs": [], "execution_count": null }, { "metadata": {}, "cell_type": "code", "source": "# df.to_parquet('press_releases_consolidated.parquet', engine='pyarrow')", "id": "4561d51aa9a63bba", "outputs": [], "execution_count": null }, { "metadata": { "ExecuteTime": { "end_time": "2025-08-10T15:39:06.429249Z", "start_time": "2025-08-10T15:39:06.123602Z" } }, "cell_type": "code", "source": [ "import pandas as pd\n", "df = pd.read_parquet('press_releases_consolidated.parquet')" ], "id": "3f9ca20cb8190e2a", "outputs": [], "execution_count": 1 }, { "metadata": { "ExecuteTime": { "end_time": "2025-08-10T15:39:14.393933Z", "start_time": "2025-08-10T15:39:12.502613Z" } }, "cell_type": "code", "source": [ "import torch\n", "from torch.utils.data import Dataset, DataLoader, random_split\n", "import torch\n", "from transformers import T5Tokenizer\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "tokenizer = T5Tokenizer.from_pretrained('t5-small')\n", "\n", "# modify accordingly\n", "MAX_TARGET_LENGTH = 128\n", "MAX_INPUT_LENGTH = 512\n", "\n", "class SummarizationDataset(Dataset):\n", " def __init__(self, dataframe, tokenizer, max_input_length=MAX_INPUT_LENGTH, max_target_length=MAX_TARGET_LENGTH):\n", " self.data = dataframe\n", " self.tokenizer = tokenizer\n", " self.max_input_length = max_input_length\n", " self.max_target_length = max_target_length\n", "\n", " def __len__(self):\n", " return len(self.data)\n", " \n", " def __getitem__(self, idx):\n", " text = self.data.iloc[idx]['text']\n", " title = self.data.iloc[idx]['title']\n", " \n", " \n", " # tokenize\n", " text_to_token = self.tokenizer(text, padding='max_length', truncation=True, max_length=self.max_input_length, return_tensors='pt')\n", " title_to_token = self.tokenizer(title, padding='max_length', truncation=True, max_length=self.max_target_length, return_tensors='pt')\n", " \n", " \n", " input_ids = text_to_token['input_ids'].squeeze(0) \n", " attention_mask = text_to_token['attention_mask'].squeeze(0) \n", " labels = title_to_token['input_ids'].squeeze(0) \n", " labels[labels == self.tokenizer.pad_token_id] = -100 \n", " \n", " return {\n", " 'input_ids': input_ids,\n", " 'attention_mask': attention_mask,\n", " 'labels': labels \n", " }\n", "\n", "dataset = SummarizationDataset(df, tokenizer)\n", "\n", "\n", "train_size = int(0.8 * len(dataset))\n", "val_size = len(dataset) - train_size\n", "train_dataset, val_dataset = random_split(dataset, [train_size, val_size])\n", "\n", "train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)\n", "val_dataloader = DataLoader(val_dataset, batch_size=8)\n", "\n" ], "id": "22604924094a8cd3", "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n" ] } ], "execution_count": 3 }, { "metadata": { "ExecuteTime": { "end_time": "2025-08-10T21:47:41.277658Z", "start_time": "2025-08-10T15:39:15.673627Z" } }, "cell_type": "code", "source": [ "import torch\n", "from transformers import T5ForConditionalGeneration\n", "from torch.optim import Adam\n", "from torch.utils.data import DataLoader\n", "from sklearn.model_selection import train_test_split\n", "import evaluate\n", "\n", "model = T5ForConditionalGeneration.from_pretrained('t5-small')\n", "optimizer = Adam(model.parameters(), lr=5e-5)\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "model.to(device)\n", "\n", "rouge = evaluate.load(\"rouge\")\n", "\n", "def train():\n", " model.train()\n", " total_loss = 0\n", " for batch in train_dataloader:\n", " input_ids = batch['input_ids'].to(device)\n", " attention_mask = batch['attention_mask'].to(device)\n", " labels = batch['labels'].to(device)\n", "\n", " outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)\n", " loss = outputs.loss\n", " total_loss += loss.item()\n", "\n", " loss.backward()\n", " optimizer.step()\n", " optimizer.zero_grad()\n", "\n", " return total_loss / len(train_dataloader)\n", "\n", "def evaluate():\n", " model.eval()\n", " total_loss = 0\n", " all_preds = []\n", " all_labels = []\n", " \n", " with torch.no_grad():\n", " for batch in val_dataloader:\n", " input_ids = batch['input_ids'].to(device)\n", " attention_mask = batch['attention_mask'].to(device)\n", " labels = batch['labels'].to(device)\n", "\n", " outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)\n", " total_loss += outputs.loss.item()\n", " \n", " try:\n", " summary_ids = model.generate(\n", " input_ids=input_ids,\n", " attention_mask=attention_mask,\n", " max_length=MAX_TARGET_LENGTH,\n", " num_beams=8,\n", " early_stopping=True\n", " )\n", " \n", " summary_ids = summary_ids[0] if len(summary_ids) > 0 else torch.tensor([tokenizer.pad_token_id])\n", " \n", " preds = tokenizer.decode(summary_ids.cpu(), skip_special_tokens=True, clean_up_tokenization_spaces=True)\n", " labels_decoded = tokenizer.decode(\n", " labels[0].masked_select(labels[0] != -100).cpu(), \n", " skip_special_tokens=True,\n", " clean_up_tokenization_spaces=True\n", " )\n", " \n", " all_preds.append(preds if preds else \" \")\n", " all_labels.append(labels_decoded if labels_decoded else \" \")\n", " \n", " except Exception as e:\n", " print(f\"Error during generation: {e}\")\n", " all_preds.append(\" \")\n", " all_labels.append(\" \")\n", " continue\n", "\n", " all_preds = [p if p.strip() else \" \" for p in all_preds]\n", " all_labels = [l if l.strip() else \" \" for l in all_labels]\n", " \n", " rouge_result = rouge.compute(predictions=all_preds, references=all_labels)\n", " \n", " return total_loss / len(val_dataloader), rouge_result\n", "\n", "\n", "epochs = 15\n", "best_val_loss = float('inf')\n", "\n", "for epoch in range(epochs):\n", " print(f\"Epoch {epoch + 1}/{epochs}\")\n", "\n", " train_loss = train()\n", " print(f\"Training Loss: {train_loss:.4f}\")\n", "\n", " val_loss, rouge_result = evaluate()\n", " print(f\"Validation Loss: {val_loss:.4f}\")\n", " print(f\"ROUGE Scores: {rouge_result}\")\n", "\n", " if val_loss < best_val_loss:\n", " best_val_loss = val_loss\n", " model.save_pretrained(f\"best_model_epoch_{epoch + 1}\")\n", " tokenizer.save_pretrained(f\"best_model_epoch_{epoch + 1}\")\n" ], "id": "2041549aaa86af9f", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1/15\n", "Training Loss: 2.3327\n", "Validation Loss: 1.9963\n", "ROUGE Scores: {'rouge1': 0.21808722374319384, 'rouge2': 0.1182736024791169, 'rougeL': 0.19976099496233557, 'rougeLsum': 0.19920689338385827}\n", "Epoch 2/15\n", "Training Loss: 2.1164\n", "Validation Loss: 1.9190\n", "ROUGE Scores: {'rouge1': 0.24314444230564494, 'rouge2': 0.14001878402499457, 'rougeL': 0.2237854024840728, 'rougeLsum': 0.22246462572576908}\n", "Epoch 3/15\n", "Training Loss: 2.0179\n", "Validation Loss: 1.8727\n", "ROUGE Scores: {'rouge1': 0.23564530968156083, 'rouge2': 0.13669895563342216, 'rougeL': 0.21725589526977998, 'rougeLsum': 0.2151015219135301}\n", "Epoch 4/15\n", "Training Loss: 1.9257\n", "Validation Loss: 1.8389\n", "ROUGE Scores: {'rouge1': 0.23937899093803855, 'rouge2': 0.13888041555479988, 'rougeL': 0.21854222551451663, 'rougeLsum': 0.21721511685962552}\n", "Epoch 5/15\n", "Training Loss: 1.8781\n", "Validation Loss: 1.8102\n", "ROUGE Scores: {'rouge1': 0.2412030325505815, 'rouge2': 0.1373245465699872, 'rougeL': 0.22158876960762192, 'rougeLsum': 0.21964406824128718}\n", "Epoch 6/15\n", "Training Loss: 1.8266\n", "Validation Loss: 1.8030\n", "ROUGE Scores: {'rouge1': 0.24693945766624123, 'rouge2': 0.13859814431515555, 'rougeL': 0.22609207133571282, 'rougeLsum': 0.22456133662136685}\n", "Epoch 7/15\n", "Training Loss: 1.7831\n", "Validation Loss: 1.7842\n", "ROUGE Scores: {'rouge1': 0.24995693123364204, 'rouge2': 0.13730760003890233, 'rougeL': 0.22966043449504253, 'rougeLsum': 0.22839320529835103}\n", "Epoch 8/15\n", "Training Loss: 1.7398\n", "Validation Loss: 1.7843\n", "ROUGE Scores: {'rouge1': 0.24797510003323764, 'rouge2': 0.13919083038634567, 'rougeL': 0.22646443435896133, 'rougeLsum': 0.22558282591894607}\n", "Epoch 9/15\n", "Training Loss: 1.7068\n", "Validation Loss: 1.7860\n", "ROUGE Scores: {'rouge1': 0.25390876204792084, 'rouge2': 0.13814393342112263, 'rougeL': 0.231234438215985, 'rougeLsum': 0.2311260176829176}\n", "Epoch 10/15\n", "Training Loss: 1.6779\n", "Validation Loss: 1.7854\n", "ROUGE Scores: {'rouge1': 0.25411363403331366, 'rouge2': 0.14468888317851958, 'rougeL': 0.2354872641812709, 'rougeLsum': 0.23342210178892542}\n", "Epoch 11/15\n", "Training Loss: 1.6413\n", "Validation Loss: 1.7642\n", "ROUGE Scores: {'rouge1': 0.2679774072064855, 'rouge2': 0.14667787569965263, 'rougeL': 0.24705660369839066, 'rougeLsum': 0.2454144686019869}\n", "Epoch 12/15\n", "Training Loss: 1.6075\n", "Validation Loss: 1.7712\n", "ROUGE Scores: {'rouge1': 0.268361111086107, 'rouge2': 0.15128550708369404, 'rougeL': 0.24768429614360232, 'rougeLsum': 0.24575241584538624}\n", "Epoch 13/15\n", "Training Loss: 1.5857\n", "Validation Loss: 1.7618\n", "ROUGE Scores: {'rouge1': 0.28096384664011065, 'rouge2': 0.1595810134136424, 'rougeL': 0.2575870112336856, 'rougeLsum': 0.25663783533294626}\n", "Epoch 14/15\n", "Training Loss: 1.5552\n", "Validation Loss: 1.7620\n", "ROUGE Scores: {'rouge1': 0.2833173462582747, 'rouge2': 0.1648174970170761, 'rougeL': 0.2615026211543109, 'rougeLsum': 0.2600381314435784}\n", "Epoch 15/15\n", "Training Loss: 1.5316\n", "Validation Loss: 1.7716\n", "ROUGE Scores: {'rouge1': 0.2782139285308772, 'rouge2': 0.1606118164438922, 'rougeL': 0.2581515139790868, 'rougeLsum': 0.2571149575053421}\n" ] } ], "execution_count": 4 }, { "metadata": {}, "cell_type": "code", "source": "", "id": "c8d5f56240932910", "outputs": [], "execution_count": null }, { "metadata": {}, "cell_type": "code", "source": "", "id": "3cecb16d8154a783", "outputs": [], "execution_count": null }, { "metadata": { "ExecuteTime": { "end_time": "2025-08-11T23:22:29.491880Z", "start_time": "2025-08-11T23:22:28.364057Z" } }, "cell_type": "code", "source": [ "import torch\n", "from transformers import AutoModelForSeq2SeqLM, AutoTokenizer\n", "\n", "model_id = \"tdickson17/Text_Summarization\"\n", "\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "\n", "tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)\n", "model = AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)\n", "\n", "def generate_summary(\n", " text,\n", " model=model,\n", " tokenizer=tok,\n", " device=device,\n", " max_new_tokens=128,\n", " min_new_tokens=20,\n", " num_beams=4\n", "):\n", " # T5 often uses a task prefix; keep if your model expects it\n", " if not text.lower().startswith(\"summarize:\"):\n", " text = \"summarize: \" + text\n", "\n", " inputs = tokenizer(text, return_tensors=\"pt\", truncation=True).to(device)\n", "\n", " with torch.no_grad():\n", " out_ids = model.generate(\n", " **inputs,\n", " max_new_tokens=max_new_tokens, \n", " min_new_tokens=min_new_tokens,\n", " num_beams=num_beams,\n", " no_repeat_ngram_size=3,\n", " early_stopping=True\n", " )\n", "\n", " return tokenizer.decode(out_ids[0], skip_special_tokens=True)\n", "\n", "input_text = (\n", " \"At Susquehanna, we approach quantitative finance with a deep commitment to scientific rigor and innovation. Our research leverages vast and diverse datasets, applying cutting-edge machine learning to uncover actionable insights and driving data-informed decisions from predictive modeling to strategic execution. Today, Susquehanna has over 3,000 employees in 17+ global locations. While we have grown in size and expanded our reach, our collaborative culture and love for gaming remains.\"\n", ")\n", "print(\"Summary:\", generate_summary(input_text))\n" ], "id": "add7d5e5d17e708b", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Summary: quantitative finance is driven by scientific rigor and innovation. Susquehanna has over 3,000 employees.\n" ] } ], "execution_count": 15 }, { "metadata": {}, "cell_type": "code", "outputs": [], "execution_count": null, "source": "", "id": "976fd3465f63b737" } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }