# archis99's picture
# Initial Commit
# d587b0b
# text_processing.py
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoModel
# ------------------------
# Text preprocessing
# ------------------------
def clean_text(text):
    """
    Normalize a single text value for downstream processing.

    The value is lowercased, every character that is neither alphanumeric
    nor whitespace is dropped, and runs of whitespace are collapsed to a
    single space (leading/trailing whitespace is removed).

    Args:
        text: The value to clean. Missing scalars (None, NaN, pd.NA, NaT)
            yield an empty string. Other non-string scalars are converted
            with ``str()`` before cleaning instead of raising
            ``AttributeError``.

    Returns:
        str: The cleaned text.
    """
    if not isinstance(text, str):
        # Only non-strings can be "missing"; checking the type first also
        # keeps the common string path free of the pd.isna call.
        if pd.isna(text):
            return ""
        # Mixed-dtype columns may carry numbers or other scalars.
        text = str(text)
    lowered = text.lower()
    kept = "".join(ch for ch in lowered if ch.isalnum() or ch.isspace())
    # split() with no argument splits on any whitespace run, so the join
    # both collapses repeated whitespace and trims the ends.
    return " ".join(kept.split())
def preprocess_text_columns(df, text_columns):
    """
    Fill and clean the given text columns of a DataFrame in place.

    For each listed column, missing entries are replaced with the
    placeholder "No info provided" and every value is then passed
    through ``clean_text``.

    Args:
        df (pd.DataFrame): DataFrame whose columns are overwritten.
        text_columns (list): Names of the columns to process.

    Returns:
        pd.DataFrame: The same ``df`` object, with the columns replaced.
    """
    for column_name in text_columns:
        filled = df[column_name].fillna("No info provided")
        df[column_name] = filled
        cleaned = df[column_name].apply(clean_text)
        df[column_name] = cleaned
    return df
# ------------------------
# Tokenization of textual columns
# ------------------------
def tokenize_text_columns(df, textual_columns, tokenizer, batch_size=50, max_length=256):
    """
    Tokenize several text columns in fixed-size batches for inference.

    Args:
        df (pd.DataFrame): DataFrame containing the textual columns.
        textual_columns (list): Names of the columns to tokenize.
        tokenizer: Callable invoked as ``tokenizer(batch, padding="max_length",
            truncation=True, max_length=max_length, return_tensors="pt")``
            and expected to return a mapping with "input_ids" and
            "attention_mask" tensors (e.g. a HuggingFace tokenizer).
        batch_size (int): Number of rows handed to the tokenizer per call.
            Must be at least 1.
        max_length (int): Forwarded to the tokenizer as ``max_length``.

    Returns:
        dict: Maps each column name to a dict with keys "input_ids" and
        "attention_mask", each the row-wise concatenation of all batches.
        A column with no rows yields empty tensors of shape
        ``(0, max_length)`` instead of raising.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        # A negative step would make range() yield nothing and silently
        # skip every row; zero would fail inside range() with a vague message.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    def tokenize_in_batches(column_texts):
        # Slice a plain list so batching is always positional, regardless
        # of what kind of index the DataFrame carries.
        texts = column_texts.tolist()

        if not texts:
            # torch.cat rejects an empty list, so build the empty result
            # directly. The width is unknown when max_length is None.
            width = max_length if max_length is not None else 0
            return {
                "input_ids": torch.empty((0, width), dtype=torch.long),
                "attention_mask": torch.empty((0, width), dtype=torch.long),
            }

        input_id_chunks = []
        attention_mask_chunks = []
        for start in range(0, len(texts), batch_size):
            encoded = tokenizer(
                texts[start:start + batch_size],
                padding="max_length",
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            # Keep only the two tensors that are returned to the caller.
            input_id_chunks.append(encoded["input_ids"])
            attention_mask_chunks.append(encoded["attention_mask"])

        # Combine batches along the row dimension.
        return {
            "input_ids": torch.cat(input_id_chunks, dim=0),
            "attention_mask": torch.cat(attention_mask_chunks, dim=0),
        }

    return {col: tokenize_in_batches(df[col]) for col in textual_columns}