# text_processing.py
import pandas as pd
import numpy as np
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoModel

# ------------------------
# Text preprocessing
# ------------------------
def clean_text(text):
    if pd.isna(text):  # Handle missing values
        return ""
    text = text.lower()  # Convert to lowercase
    text = ''.join(char for char in text if char.isalnum() or char.isspace())  # Remove special characters
    return ' '.join(text.split())  # Collapse extra whitespace

def preprocess_text_columns(df, text_columns):
    """Fill missing values and clean every column in text_columns."""
    for col in text_columns:
        df[col] = df[col].fillna("No info provided")
        df[col] = df[col].apply(clean_text)
    return df

# ------------------------
# Tokenization of textual columns
# ------------------------
def tokenize_text_columns(df, textual_columns, tokenizer, batch_size=50, max_length=256):
    """
    Tokenizes multiple textual columns in batches for inference.

    Args:
        df (pd.DataFrame): DataFrame containing textual columns.
        textual_columns (list): List of column names to tokenize.
        tokenizer: HuggingFace tokenizer.
        batch_size (int): Number of samples per batch.
        max_length (int): Maximum token length per sequence.

    Returns:
        dict: Dictionary with column names as keys and tokenized tensors as values.
    """
    def tokenize_in_batches(column_texts):
        tokenized_batches = []
        for i in range(0, len(column_texts), batch_size):
            batch = column_texts[i:i + batch_size].tolist()
            tokenized_batch = tokenizer(
                batch,
                padding="max_length",
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            )
            tokenized_batches.append(tokenized_batch)
        # Combine per-batch tensors into single tensors covering the whole column
        return {
            "input_ids": torch.cat([batch["input_ids"] for batch in tokenized_batches], dim=0),
            "attention_mask": torch.cat([batch["attention_mask"] for batch in tokenized_batches], dim=0)
        }

    tokenized_data = {}
    for col in textual_columns:
        tokenized_data[col] = tokenize_in_batches(df[col])
    return tokenized_data
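
# ------------------------
# Usage example (illustrative)
# ------------------------
# A minimal sketch of how the helpers above fit together. The tokenizer name
# ("bert-base-uncased") and the sample DataFrame are illustrative assumptions,
# not part of the original pipeline; any HuggingFace tokenizer that returns
# "input_ids" and "attention_mask" tensors should work.
if __name__ == "__main__":
    from transformers import AutoTokenizer

    # Hypothetical input: two textual columns with missing values and noisy text
    sample_df = pd.DataFrame({
        "title": ["Hello, World!", None, "  Mixed   CASE text?? "],
        "description": ["First entry.", "Second entry.", None],
    })
    text_cols = ["title", "description"]

    # Step 1: fill missing values and normalize the text
    sample_df = preprocess_text_columns(sample_df, text_cols)

    # Step 2: tokenize each column in batches (small sizes for demonstration)
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    tokenized = tokenize_text_columns(
        sample_df, text_cols, tokenizer, batch_size=2, max_length=32
    )

    # Each column maps to tensors of shape (num_rows, max_length)
    for col, tensors in tokenized.items():
        print(col, tensors["input_ids"].shape, tensors["attention_mask"].shape)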