In [1]:
!pip install transformers



In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import math
import time
from tqdm.notebook import trange, tqdm

import torch
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader
from torch import Tensor
from torch.utils.data.dataset import Dataset
import torch.nn.functional as F
from torch.distributions import Categorical
from torch.cuda.amp import autocast, GradScaler

from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Permit TF32 for CUDA matrix multiplications (PyTorch backend flag).
torch.backends.cuda.matmul.allow_tf32 = True
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative dataset and checkpoint paths used in later cells depend on it.
%cd /content/drive/My Drive/fake-news-detection/new-truth-guard

/content/drive/My Drive/fake-news-detection/new-truth-guard


In [5]:
import pandas as pd

# Kaggle Fake and Real News Dataset
# Each CSV is reduced to its title/text columns and tagged with a constant
# veracity score: 0.0 for rows from Fake.csv, 1.0 for rows from True.csv.
fake_df = pd.read_csv("kaggle/Fake.csv")[["title", "text"]].assign(veracity=0.0)
real_df = pd.read_csv("kaggle/True.csv")[["title", "text"]].assign(veracity=1.0)

# Fake rows come first, then real rows, under a fresh 0..n-1 index.
kaggle_df = pd.concat([fake_df, real_df], ignore_index=True)

In [6]:
kaggle_df

Unnamed: 0,title,text,veracity
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,0.0
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,0.0
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",0.0
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",0.0
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,0.0
...,...,...,...
44893,'Fully committed' NATO backs new U.S. approach...,BRUSSELS (Reuters) - NATO allies on Tuesday we...,1.0
44894,LexisNexis withdrew two products from Chinese ...,"LONDON (Reuters) - LexisNexis, a provider of l...",1.0
44895,Minsk cultural hub becomes haven from authorities,MINSK (Reuters) - In the shadow of disused Sov...,1.0
44896,Vatican upbeat on possibility of Pope Francis ...,MOSCOW (Reuters) - Vatican Secretary of State ...,1.0


In [7]:
# Liar2 Dataset
# Keep only the statement text and its integer label from each CSV split.
train_df = pd.read_csv("liar2/train.csv")[["statement", "label"]]
val_df = pd.read_csv("liar2/valid.csv")[["statement", "label"]]

# Merge the train and validation splits, rescale the label by 1/5 into a
# "veracity" score, expose the statement as "text", and drop the source columns.
train_liar2 = (
    pd.concat([train_df, val_df], ignore_index=True)
    .assign(
        veracity=lambda frame: frame["label"] / 5,
        text=lambda frame: frame["statement"],
    )
    .drop(columns=["statement", "label"])
)

# The test split goes through the identical column transformation.
test_liar2 = (
    pd.read_csv("liar2/test.csv")[["statement", "label"]]
    .assign(
        veracity=lambda frame: frame["label"] / 5,
        text=lambda frame: frame["statement"],
    )
    .drop(columns=["statement", "label"])
)

In [8]:
train_liar2

Unnamed: 0,veracity,text
0,1.0,"90 percent of Americans ""support universal bac..."
1,0.2,Last year was one of the deadliest years ever ...
2,0.0,"Bernie Sanders's plan is ""to raise your taxes ..."
3,0.8,Voter ID is supported by an overwhelming major...
4,0.4,"Says Barack Obama ""robbed Medicare (of) $716 b..."
...,...,...
20661,0.8,"Worldwide credit card transactions, the credit..."
20662,0.6,"Rick Perry ""advocated to liquidate"" the state'..."
20663,0.2,AirPods are essentially microwaving your brain.
20664,0.6,Emails released to the public show Hillary Cli...


In [9]:
from sklearn.model_selection import train_test_split

# Pool every Liar2 row (train+valid and test) with the Kaggle articles before
# drawing a fresh split. The Liar2 frames carry no "title" column, so those
# rows end up with NaN titles after the concat.
all_df = pd.concat([train_liar2, test_liar2, kaggle_df], ignore_index=True)
# NOTE: `train_df` is rebound here; it previously held the raw Liar2 train CSV.
# 10% of the rows are held out, stratified on the binarised score
# (veracity > 0.5); random_state fixes the shuffle for reproducibility.
train_df, test_df = train_test_split(
    all_df,
    test_size=0.1,
    stratify=(all_df["veracity"] > 0.5),
    random_state=42
)

In [10]:
train_df

Unnamed: 0,veracity,text,title
29891,0.0,The Brady Campaign to Prevent Gun Violence rel...,WATCH: Anti-Gun Group Absolutely DESTROYS NRA...
34071,0.0,Just when everyone thought the Crazy Fat Kid ...,“Crazy Fat Kid” Kim Jong Un Has HILARIOUS Rule...
31563,0.0,Thinking about going to the movies? You might ...,Oops: Ammosexual Playing With Gun During Beng...
54421,1.0,BOGOTA (Reuters) - Colombia’s peace deal with ...,U.S Treasury’s Lew says Colombia peace deal to...
27197,0.0,Not only is Donald Trump an admitted serial gr...,BREAKING: Donald Trump Walked Into Dressing R...
...,...,...,...
45540,0.0,Tune in to the Alternate Current Radio Network...,BOILER ROOM – EP #55 – Roasting the Wretched H...
14550,0.2,"Says William Barr tweeted, ""BREAKING NEWS Sena...",
36614,0.0,Nothing to see here. No conflict of interest. ...,FLASHBACK: HILLARY Received $500K In Jewelry F...
21159,0.4,"Two million federal workers negotiate zippo, z...",


In [11]:
test_df

Unnamed: 0,veracity,text,title
22284,0.0,"Around the world, people who exercise their ""h...",
380,0.6,The sanctions that we put on (Russia) for the ...,
67133,1.0,WASHINGTON (Reuters) - U.S. President Donald T...,Trump says hopes to avoid use of military acti...
15460,0.4,The price of soybeans has fallen 50% since 5 y...,
20949,0.2,"Human trafficking and drugs"" at the Mexico bor...",
...,...,...,...
17801,0.8,One in six Texans don't have health care. We'r...,
61245,1.0,"CHILPANCINGO, Mexico (Reuters) - A protest by ...",Protest over odor from rotting corpses shuts M...
4258,0.4,"The Trump budget ""cuts $845 billion, almost a ...",
44634,0.0,Ultimate gun control is the end game of Barack...,ABOVE THE LAW: Obama Goes Around Congress (Aga...


In [12]:
def compute_token_stats(df, tokenizer, add_special_tokens=True):
  """Measure the untruncated token length of every row in df["text"].

  Args:
    df: frame with a "text" column.
    tokenizer: callable returning a mapping with an "input_ids" entry.
    add_special_tokens: forwarded to the tokenizer call.

  Returns:
    (max_len, avg_len, lengths): the longest length as int, the mean length
    as float, and the per-row lengths as returned by progress_apply.
  """
  # Registers `progress_apply` on pandas objects so the pass shows a bar.
  tqdm.pandas()

  def _token_count(txt):
    # No truncation or padding: the raw length is what is being measured.
    encoded = tokenizer(
        txt,
        add_special_tokens=add_special_tokens,
        truncation=False,
        padding=False,
    )
    return len(encoded["input_ids"])

  lengths = df["text"].progress_apply(_token_count)

  return int(lengths.max()), float(lengths.mean()), lengths

In [13]:
# Load the pretrained tokenizer of the distilbert-base-uncased checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [14]:
# Token-length statistics per split, measured without truncation.
train_max, train_avg, train_lengths = compute_token_stats(train_df, tokenizer)
print(f"Train: longest = {train_max} tokens, avg = {train_avg:.2f} tokens")

test_max, test_avg, test_lengths = compute_token_stats(test_df, tokenizer)
# Label fixed: the original printed "TestL:".
print(f"Test: longest = {test_max} tokens, avg = {test_avg:.2f} tokens")

  0%|          | 0/61074 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1404 > 512). Running this sequence through the model will result in indexing errors


Train: longest = 10292 tokens, avg = 345.82 tokens


  0%|          | 0/6786 [00:00<?, ?it/s]

TestL: longest = 14285 tokens, avg = 350.74 tokens


In [15]:
# Fixed sequence length used for padding/truncation in later cells. 512 is the
# model maximum quoted by the tokenizer warning emitted while measuring lengths.
MAX_LEN = 512
tokenizer.vocab_size

30522

In [17]:
# Show the ids of the special tokens (expected values in the trailing comments).
print("[PAD] token id:", tokenizer.pad_token_id) # 0
print("[CLS] token id:", tokenizer.cls_token_id) # 101
print("[SEP] token id:", tokenizer.sep_token_id) # 102

[PAD] token id: 0
[CLS] token id: 101
[SEP] token id: 102


In [18]:
class FakeNewsDataset(Dataset):
  """Map-style dataset that tokenizes one text per item on demand.

  Args:
    df: frame with a "text" column (model input) and a "veracity" column
      (float target).
    tokenizer: optional callable used to encode each text. When omitted, the
      notebook-level `tokenizer` is looked up each time an item is fetched.
    max_len: optional fixed sequence length for padding/truncation. When
      omitted, the notebook-level `MAX_LEN` is looked up each time an item
      is fetched.
  """

  def __init__(self, df, tokenizer=None, max_len=None):
    self.texts = df["text"].tolist()
    self.labels = df["veracity"].tolist()
    # Optional overrides; None means "use the notebook globals".
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, i):
    """Return input_ids, attention_mask and the float label of item `i`."""
    # Inside this method the bare names refer to the notebook globals.
    encode = self.tokenizer if self.tokenizer is not None else tokenizer
    max_length = self.max_len if self.max_len is not None else MAX_LEN

    enc = encode(
        self.texts[i],
        add_special_tokens=True,
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )

    return {
        # return_tensors="pt" yields a leading batch axis of size 1; drop it
        # so the DataLoader can stack items into (batch, max_length).
        "input_ids": enc["input_ids"].squeeze(0),
        "attention_mask": enc["attention_mask"].squeeze(0),
        "labels": torch.tensor(self.labels[i], dtype=torch.float)
    }

In [21]:
batch_size = 16
# os.cpu_count() may return None when the count cannot be determined, which
# DataLoader does not accept; fall back to 0 (load in the main process).
num_workers = os.cpu_count() or 0

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(
    FakeNewsDataset(train_df),
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True
)

# Evaluation keeps the dataset order fixed.
test_dataloader = DataLoader(
    FakeNewsDataset(test_df),
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True
)

len(train_dataloader), len(test_dataloader)

(3818, 425)

In [22]:
# Pull a single batch to confirm the tensor shapes produced by the loader.
batch = next(iter(train_dataloader))
print(batch["input_ids"].shape, batch["attention_mask"].shape, batch["labels"].shape)

torch.Size([16, 512]) torch.Size([16, 512]) torch.Size([16])


In [33]:
# num_labels=1 requests a single output logit per input; later cells squeeze
# that logit and pair it with a sigmoid / BCE-with-logits objective.
model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=1).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
# Smoke test: push two hand-written headlines through the model before any
# fine-tuning has happened, to check the forward pass end to end.
texts = [
    "Breaking: Scientists discover cure for common cold!",
    "Study finds no link between vaccines and autism."
]

# Same tokenizer settings as the dataset class; the encoding is moved to the
# model's device in one call.
encodings = tokenizer(
    texts,
    add_special_tokens=True,
    padding="max_length",
    truncation=True,
    max_length=MAX_LEN,
    return_tensors="pt"
).to(device)


# squeeze(-1) drops the trailing size-1 logit axis, leaving one value per text.
logits = model(
    input_ids=encodings.input_ids,
    attention_mask=encodings.attention_mask,
).logits.squeeze(-1)

print(logits)

tensor([0.0314, 0.0221], device='cuda:0', grad_fn=<SqueezeBackward1>)


In [35]:
# Count only the parameters that have requires_grad set.
print(f"The model has {(sum(p.numel() for p in model.parameters() if p.requires_grad)):,} trainable parameters")

The model has 66,954,241 trainable parameters


In [36]:
from transformers import get_linear_schedule_with_warmup

lr = 2e-5
epochs = 5
clip = 1  # maximum gradient norm handed to clip_grad_norm_ in train()

optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr, weight_decay=0.01)
# BCE on the raw logit; targets are the float veracity scores.
loss_fn = nn.BCEWithLogitsLoss()
# torch.cuda.amp.GradScaler() is deprecated; torch.amp.GradScaler("cuda") is
# the replacement named by the deprecation warning.
scaler = torch.amp.GradScaler("cuda")

# The schedule is stepped once per batch: linear warm-up over the first 10% of
# all optimisation steps, then linear decay over the remainder.
num_steps = len(train_dataloader) * epochs
num_warmup = int(0.1 * num_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup,
    num_training_steps=num_steps
)

  scaler = GradScaler()


In [37]:
def train(model,
          iterator,
          optimizer,
          loss_fn,
          clip,
          epoch):
  """Run one mixed-precision training epoch and return the mean batch loss.

  Besides its arguments, this function reads the notebook-level `device`,
  `scaler` (gradient scaler) and `scheduler` (stepped once per batch).

  Args:
    model: classifier whose forward output exposes `.logits`.
    iterator: loader yielding dicts with "input_ids", "attention_mask" and
      "labels" tensors.
    optimizer: optimizer updating `model`'s parameters.
    loss_fn: callable `(logits, labels) -> scalar loss tensor`.
    clip: maximum gradient norm used for clipping.
    epoch: zero-based epoch index, used only in the progress-bar label.

  Returns:
    Sum of the per-batch losses divided by the number of batches.
  """
  model.train()
  epoch_loss = 0

  pbar = tqdm(iterator, total=len(iterator), desc=f"Epoch {epoch + 1} Progress", colour="#005500")
  for batch in pbar:
    src = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    optimizer.zero_grad()
    # torch.cuda.amp.autocast() is deprecated in favour of torch.amp.autocast("cuda").
    with torch.amp.autocast("cuda"):
      # Forward pass
      outputs = model(
          input_ids=src,
          attention_mask=mask,
      )
      logits = outputs.logits.squeeze(-1) # shape: (batch_size)

      # Calculate the loss
      loss = loss_fn(logits, labels)

    scaler.scale(loss).backward()
    # Unscale first so the clipping threshold applies to the true gradients.
    scaler.unscale_(optimizer)
    nn.utils.clip_grad_norm_(model.parameters(), clip)

    # scaler.step() silently skips optimizer.step() when the gradients hold
    # inf/NaN, and update() then lowers the scale. Advance the LR schedule only
    # when the optimizer step was really applied, so the two stay in lockstep.
    scale_before = scaler.get_scale()
    scaler.step(optimizer)
    scaler.update()
    if scaler.get_scale() >= scale_before:
      scheduler.step()

    # Read the loss once; each .item() call forces a device sync.
    batch_loss = loss.item()
    epoch_loss += batch_loss

    pbar.set_postfix(loss=batch_loss) # Update the loss on the tqdm progress bar

  return (epoch_loss / len(iterator))

In [38]:
def evaluate(model,
             model_path,
             iterator,
             loss_fn):
  """Load a checkpoint into `model` and return its mean per-batch loss.

  Reads the notebook-level `device`. The weights stored at `model_path`
  overwrite the current weights of `model`.

  Args:
    model: classifier whose forward output exposes `.logits`.
    model_path: path of a saved state dict.
    iterator: loader yielding dicts with "input_ids", "attention_mask" and
      "labels" tensors.
    loss_fn: callable `(logits, labels) -> scalar loss tensor`.

  Returns:
    Sum of the per-batch losses divided by the number of batches.
  """
  state_dict = torch.load(model_path, map_location=device)
  model.load_state_dict(state_dict)
  model.eval()

  batch_losses = []
  with torch.inference_mode():
    for batch in tqdm(iterator):
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      targets = batch["labels"].to(device)

      # One logit per example after dropping the trailing axis.
      scores = model(input_ids=input_ids, attention_mask=attention_mask).logits.squeeze(-1)
      batch_losses.append(loss_fn(scores, targets).item())

  return sum(batch_losses) / len(iterator)

In [39]:
def epoch_time(start_time, end_time):
  """Split the span between two timestamps into whole minutes and seconds.

  Args:
    start_time: start timestamp in seconds.
    end_time: end timestamp in seconds.

  Returns:
    (minutes, seconds) as ints, both truncated toward zero.
  """
  duration = end_time - start_time
  whole_minutes = int(duration / 60)
  leftover_seconds = int(duration - 60 * whole_minutes)
  return whole_minutes, leftover_seconds

In [40]:
# NOTE(review): best_valid_loss is initialised here but no visible cell reads it.
best_valid_loss = float("inf")
model_path = "truth_guard_model.pt"

# Warm-start from an existing checkpoint when one is present in the working dir.
# NOTE(review): the training loop saves to truth_guard_model_epoch_<n>.pt, not
# to this path — confirm where this file is expected to come from.
if os.path.exists(model_path):
  print(f"Loading model from {model_path}...")
  model.load_state_dict(torch.load(model_path, map_location=device))

In [41]:
# Flip to False to skip training and reuse the checkpoints saved by a prior run.
should_train = True

if should_train:
  for epoch in tqdm(range(epochs), desc=f"Training progress", colour="#00ff00"):
    start_time = time.time()

    train_loss = train(model=model,
                      iterator=train_dataloader,
                      optimizer=optimizer,
                      loss_fn=loss_fn,
                      clip=clip,
                      epoch=epoch)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    message = f"Epoch: {epoch + 1} | Time: {epoch_mins}m {epoch_secs}s --> STORED"

    # No validation pass runs here: every epoch's weights are written to their
    # own file so the checkpoints can be compared afterwards.
    torch.save(model.state_dict(), f"truth_guard_model_epoch_{epoch + 1}.pt")

    print(message)
    print(f"Train Loss: {train_loss:.6f}")

Training progress:   0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1 Progress:   0%|          | 0/3818 [00:00<?, ?it/s]

  with autocast():


Epoch: 1 | Time: 2m 18s --> STORED
Train Loss: 0.253842


Epoch 2 Progress:   0%|          | 0/3818 [00:00<?, ?it/s]

Epoch: 2 | Time: 2m 17s --> STORED
Train Loss: 0.206589


Epoch 3 Progress:   0%|          | 0/3818 [00:00<?, ?it/s]

Epoch: 3 | Time: 2m 18s --> STORED
Train Loss: 0.188411


Epoch 4 Progress:   0%|          | 0/3818 [00:00<?, ?it/s]

Epoch: 4 | Time: 2m 18s --> STORED
Train Loss: 0.173705


Epoch 5 Progress:   0%|          | 0/3818 [00:00<?, ?it/s]

Epoch: 5 | Time: 2m 18s --> STORED
Train Loss: 0.165124


In [46]:
# Loss of the epoch-5 checkpoint on the held-out split.
test_loss = evaluate(model=model,
                     model_path="truth_guard_model_epoch_5.pt",
                     iterator=test_dataloader,
                     loss_fn=loss_fn)

print(f"Test Loss: {test_loss:.6f}")

# Recorded results, presumably gathered by re-running this cell once per
# checkpoint path.
# Test Losses:
# Epoch 1 - 0.214233
# Epoch 2 - 0.213548
# Epoch 3 - 0.222628
# Epoch 4 - 0.227762
# Epoch 5 - 0.231639

  0%|          | 0/425 [00:00<?, ?it/s]

Test Loss: 0.231639


In [47]:
def get_accuracy(model,
                 model_path,
                 iterator):
  """Load a checkpoint into `model` and return its thresholded accuracy.

  A prediction counts as correct when the sigmoid of the logit and the label
  fall on the same side of 0.5 (both >= 0.5 or both < 0.5). Reads the
  notebook-level `device`; the weights at `model_path` overwrite the current
  weights of `model`.

  Args:
    model: classifier whose forward output exposes `.logits`.
    model_path: path of a saved state dict.
    iterator: loader yielding dicts with "input_ids", "attention_mask" and
      "labels" tensors.

  Returns:
    Number of matching examples divided by the number of examples seen.
  """
  checkpoint = torch.load(model_path, map_location=device)
  model.load_state_dict(checkpoint)
  model.eval()

  hits, seen = 0, 0

  with torch.inference_mode():
    for batch in tqdm(iterator):
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      targets = batch["labels"].to(device)

      # One logit per example after dropping the trailing axis.
      scores = model(input_ids=input_ids, attention_mask=attention_mask).logits.squeeze(-1)

      # Binarise both the prediction and the soft label at 0.5.
      predicted_side = torch.sigmoid(scores) >= 0.5
      labelled_side = targets >= 0.5

      hits += (predicted_side == labelled_side).sum().item()
      seen += targets.size(0)

  return hits / seen

In [52]:
# Thresholded accuracy of the epoch-5 checkpoint on the held-out split.
test_acc = get_accuracy(model=model,
                        model_path="truth_guard_model_epoch_5.pt",
                        iterator=test_dataloader)

print(f"Test Accuracy: {test_acc:.6f}")

# Recorded results, presumably gathered by re-running this cell once per
# checkpoint path.
# Test Accuracies:
# Epoch 1 - 0.898762
# Epoch 2 - 0.900383
# Epoch 3 - 0.897583
# Epoch 4 - 0.893604
# Epoch 5 - 0.893457

  0%|          | 0/425 [00:00<?, ?it/s]

Test Accuracy: 0.893457


In [53]:
def get_prob_accuracy(model,
                      model_path,
                      iterator,
                      tolerance: float = 0.2):
  """Load a checkpoint and return the share of predictions close to the label.

  A prediction counts as correct when the absolute difference between the
  sigmoid of the logit and the label is at most `tolerance`. Reads the
  notebook-level `device`; the weights at `model_path` overwrite the current
  weights of `model`.

  Args:
    model: classifier whose forward output exposes `.logits`.
    model_path: path of a saved state dict.
    iterator: loader yielding dicts with "input_ids", "attention_mask" and
      "labels" tensors.
    tolerance: largest accepted |probability - label|.

  Returns:
    Number of in-tolerance examples divided by the number of examples seen.
  """
  checkpoint = torch.load(model_path, map_location=device)
  model.load_state_dict(checkpoint)
  model.eval()

  hits, seen = 0, 0

  with torch.inference_mode():
    for batch in tqdm(iterator):
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      targets = batch["labels"].to(device)

      # One logit per example after dropping the trailing axis.
      scores = model(input_ids=input_ids, attention_mask=attention_mask).logits.squeeze(-1)

      # Distance between the predicted probability and the soft label.
      gaps = (torch.sigmoid(scores) - targets).abs()
      hits += (gaps <= tolerance).sum().item()

      seen += targets.size(0)

  return hits / seen

In [58]:
# Share of held-out examples whose predicted probability lies within 0.2 of
# the label, for the epoch-5 checkpoint.
test_prob_acc = get_prob_accuracy(model=model,
                                  model_path="truth_guard_model_epoch_5.pt",
                                  iterator=test_dataloader,
                                  tolerance=0.2)

print(f"Test Prob Accuracy: {test_prob_acc:.6f}")

# Recorded results, presumably gathered by re-running this cell once per
# checkpoint path.
# Test Prob Accuracies:
# Epoch 1 - 0.834365
# Epoch 2 - 0.837165
# Epoch 3 - 0.846743
# Epoch 4 - 0.843943
# Epoch 5 - 0.843207

  0%|          | 0/425 [00:00<?, ?it/s]

Test Prob Accuracy: 0.843207


In [59]:
def get_prediction(text, model, device, max_length: int = MAX_LEN):
  """Return the sigmoid of the model's logit(s) for `text`.

  Uses the notebook-level `tokenizer` to encode the input.

  Args:
    text: input handed straight to the tokenizer.
    model: classifier whose forward output exposes `.logits`.
    device: device the encoded tensors are moved to before the forward pass.
    max_length: truncation length forwarded to the tokenizer.

  Returns:
    Tensor of probabilities with the trailing logit axis squeezed away.
  """
  model.eval()

  batch = tokenizer(
      text,
      padding=True,
      truncation=True,
      max_length=max_length,
      return_tensors="pt"
  )

  with torch.inference_mode():
    outputs = model(
        input_ids=batch.input_ids.to(device),
        attention_mask=batch.attention_mask.to(device),
    )
    scores = outputs.logits.squeeze(-1)

  return torch.sigmoid(scores)

In [60]:
test_df.iloc[0], test_df.iloc[1]

(veracity                                                  0.0
 text        Around the world, people who exercise their "h...
 title                                                     NaN
 Name: 22284, dtype: object,
 veracity                                                  0.6
 text        The sanctions that we put on (Russia) for the ...
 title                                                     NaN
 Name: 380, dtype: object)

In [85]:
test_df.iloc[115]["text"]

'WASHINGTON (Reuters) - President Donald Trump complained on Monday that the United States is shouldering an unfair burden of the cost of the United Nations, but said if the world body reforms how it operates, the investment would be worth it. Trump, who has frequently criticized the cost to the United States of supporting the NATO alliance, took his concerns directly to the ambassadors of the U.N. Security Council, who joined him at the White House for a lunch. “If we do a great job, I care much less about the budget because you’re talking about peanuts compared to the important work you’re doing,” Trump told the 15 council envoys.  The United States is the biggest U.N. contributor, paying 22 percent of the $5.4 billion core budget and 28.5 percent of the $7.9 billion peacekeeping budget. These assessed contributions are agreed by the 193-member U.N. General Assembly. Trump said the U.S. share of those budgets was “unfair.”  He has proposed a 28 percent budget cut for diplomacy and fo

In [86]:
# Spot-check one held-out example against the epoch-3 checkpoint.
test_idx = 115

src_text = test_df.iloc[test_idx]["text"]

# Overwrites the weights currently held by `model`.
model.load_state_dict(torch.load("truth_guard_model_epoch_3.pt", map_location=device))
prob = get_prediction(src_text, model, device)

# 0 - fake
# 1 - real

print(f"Real label: {test_df.iloc[test_idx]['veracity']}")
print(f"Predicted Prob: {prob}")

Real label: 1.0
Predicted Prob: tensor([1.0000], device='cuda:0')


In [71]:
!pip install onnx onnxruntime

Collecting onnx
  Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m110.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m118.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (4

In [72]:
import onnx
import transformers
import transformers.convert_graph_to_onnx as onnx_convert
from pathlib import Path

# Rebuild the architecture and load the epoch-3 weights on the CPU for export.
# from_pretrained() and map_location="cpu" both leave the model on the CPU.
model = DistilBertForSequenceClassification.from_pretrained(
        "distilbert-base-uncased",
        num_labels=1)
model.load_state_dict(torch.load("truth_guard_model_epoch_3.pt", map_location="cpu"))
model.eval()

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

save_dir = "./onnx_model"

# Pin the pipeline to the CPU. Without an explicit device it selected cuda:0
# and moved the model there, which is why a second .to("cpu") was needed.
pipeline = transformers.pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device="cpu",
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


In [75]:
# Export the pipeline's model to truth_guard.onnx using ONNX opset 14; the
# export runs with gradient tracking disabled.
with torch.no_grad():
  onnx_convert.convert_pytorch(pipeline, opset=14, output=Path("truth_guard.onnx"), use_external_format=False)

Using framework PyTorch: 2.6.0+cu124
Found input input_ids with shape: {0: 'batch', 1: 'sequence'}
Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}
Found output output_0 with shape: {0: 'batch'}
Ensuring inputs are in correct order
head_mask is not present in the generated input list.
Generated inputs order: ['input_ids', 'attention_mask']


  inverted_mask = torch.tensor(1.0, dtype=dtype) - expanded_mask


In [76]:
from onnxruntime.quantization import quantize_dynamic, QuantType

# Dynamically quantize the exported model with unsigned 8-bit weights and
# write the result to a second file, leaving truth_guard.onnx in place.
quantize_dynamic("truth_guard.onnx", "truth_guard_int8.onnx", weight_type=QuantType.QUInt8)



In [9]:
# tokenizer.save_pretrained("./tokenizer")

('./tokenizer/tokenizer_config.json',
 './tokenizer/special_tokens_map.json',
 './tokenizer/spiece.model',
 './tokenizer/added_tokens.json')