In [3]:
import pandas as pd
from pathlib import Path as pp
from pandas.core.frame import DataFrame as df
import os

# reading dataset files
right_dir: pp = pp("../data/right")
wrong_dir: pp = pp("../data/wrong")

# dataframes
right_df : df = df()
wrong_df : df = df()

In [4]:
# merging right datafiles into a dataframe
rdf_list = []
for file in os.scandir(right_dir.absolute()):
  if os.path.exists(file):
    rdf_list.append(pd.read_csv(file))
right_df = pd.concat(rdf_list)

In [5]:
# merging left datafiles into a dataframe
ldf_list = []
for file in os.scandir(wrong_dir.absolute()):
  if os.path.exists(file):
    ldf_list.append(pd.read_csv(file))
wrong_df = pd.concat(ldf_list)
wrong_df

Unnamed: 0,index,content
0,0,Feeling annoyed? Follow this thread
1,1,"Thread, why you should vote for BJP. #PhirEKBa..."
2,2,The biggest festival of democracy is here! EC ...
3,3,Slogan competition \nShare your slogan idea ( ...
4,4,"4 जून की करो तैयारी,\n\nआ रहे हैं भगवाधारी...."
...,...,...
92,92,"मैं बता रहा हूँ, बेंगलुरू मामले में भी बहुत ते..."
93,93,ईरान में करीब 1200 भारतीय फंसे हैं। 800 छात्र ...
94,94,वैसे तो TikTok ने PMCares फंड में LAC पर चीन स...
95,95,बंगाल में भीड़ से खचाखच भरी रैलियां हो सकती है...


In [6]:
# add category to the dataframes
right_df['category'] = 'RIGHT'
wrong_df['category'] = 'WRONG'
wrong_df

# cleaning up the columns and adding the binray labels to the dataframes
frame = [right_df, wrong_df]
final_dataset: df = pd.concat(frame)

final_dataset.drop('index', axis=1)
final_dataset
final_dataset['content'] = final_dataset['content'].astype(str)   # all rows in 'content' column must be of type str


In [7]:
.to_pandas()

2024-03-29 19:54:59.568730: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [8]:
# Tokenization of dataset
tokenizer = Tokenizer()
tokenizer.fit_on_texts(final_dataset['content'])

vocab_size = len(tokenizer.word_index) + 1    # our dataset vocab size (space split)
max_length = 200    # max words in a sentence
embedding_dim = 50    # TODO: need to adjust accordingly

X = tokenizer.texts_to_sequences(final_dataset['content'])
X = pad_sequences(X, maxlen=max_length, padding='post')

In [9]:
# Encode the lables
labels = final_dataset['category'].map({'RIGHT': '1', 'WRONG': '0'}).astype('float32').values


In [10]:
# Build the model
model = keras.Sequential([
  keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(max_length,)),
  keras.layers.GlobalAveragePooling1D(),
  keras.layers.Dense(16, activation='relu'),
  keras.layers.Dense(1, activation='sigmoid')
])

  super().__init__(**kwargs)
2024-03-29 19:55:16.544096: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-29 19:55:21.152411: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-03-29 19:55:21.152806: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/AB

In [11]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [18]:
# split the dataset into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42, shuffle=True)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

print(labels.shape)

(3140, 200)
(1347, 200)
(3140,)
(1347,)
(4487,)


In [13]:
# train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/10


I0000 00:00:1711722330.932635   76533 service.cc:145] XLA service 0x7bbdd40053b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1711722330.932689   76533 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce 920MX, Compute Capability 5.0
2024-03-29 19:55:31.370178: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-03-29 19:55:33.092199: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:465] Loaded cuDNN version 8907


[1m17/99[0m [32m━━━[0m[37m━━━━━━━━━━━━━━━━━[0m [1m0s[0m 7ms/step - accuracy: 0.5972 - loss: 0.6806

I0000 00:00:1711722338.226902   76533 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 56ms/step - accuracy: 0.6186 - loss: 0.6631 - val_accuracy: 0.6511 - val_loss: 0.6453
Epoch 2/10
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6242 - loss: 0.6554 - val_accuracy: 0.6511 - val_loss: 0.6393
Epoch 3/10
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6134 - loss: 0.6635 - val_accuracy: 0.6511 - val_loss: 0.6375
Epoch 4/10
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6270 - loss: 0.6466 - val_accuracy: 0.6511 - val_loss: 0.6305
Epoch 5/10
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6063 - loss: 0.6464 - val_accuracy: 0.6548 - val_loss: 0.6168
Epoch 6/10
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6640 - loss: 0.6361 - val_accuracy: 0.6600 - val_loss: 0.5972
Epoch 7/10
[1m99/99[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7bbdc43be710>

In [14]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
accuracy

[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7254 - loss: 0.5047


0.7193763852119446

In [15]:
# save the model
try:
  model.save("../models/right-wrong-BC.keras")
except FileNotFoundError:
  os.mkdir("../models")
  model.save("../models/right-wrong-BC.keras")

In [16]:
# use model
def preprocess_text(text):
  # Tokenize the text
  tokenized_text = tokenizer.texts_to_sequences([text])
  # Pad sequences to the same length as training data
  padded_text = pad_sequences(tokenized_text, maxlen=max_length, padding='post')
  return padded_text

# load model
model = keras.models.load_model("../models/right-wrong-BC.keras")  # Replace "your_model.h5" with the path to your trained model

# Preprocess the custom input text
preprocessed_text = preprocess_text("Modi ji is Moon Pappu on Bangkok honeymoon")

# Make predictions
predictions = model.predict(preprocessed_text)

predictions[0][0]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 347ms/step


0.71704614