Spaces:

archis99
/

clinical-trial-prediction

Running

App Files Files Community

clinical-trial-prediction / backend /preprocessing /text_processing.py

archis99

Initial Commit

d587b0b 3 months ago

raw

history blame contribute delete

2.34 kB

	# text_processing.py
	import pandas as pd
	import numpy as np
	import joblib
	from sklearn.preprocessing import LabelEncoder, StandardScaler
	import torch
	from torch.utils.data import TensorDataset, DataLoader
	from transformers import AutoModel

	# ------------------------
	# Text preprocessing
	# ------------------------

	def clean_text(text):
	    """Normalize a single text value.

	    The value is lower-cased, every character that is neither alphanumeric
	    nor whitespace is dropped, and runs of whitespace are collapsed into a
	    single space (leading and trailing whitespace is removed).

	    Args:
	        text: Raw cell value. Missing scalars (``None``, ``NaN``, ``pd.NA``,
	            ``NaT``) yield an empty string. Any other non-string value is
	            converted with ``str()`` before cleaning instead of raising.

	    Returns:
	        str: The cleaned text, or ``""`` for a missing value.
	    """
	    if not isinstance(text, str):
	        # pd.isna returns an array for list-likes, which cannot be used in
	        # a boolean test, so only consult it for scalar values.
	        if pd.api.types.is_scalar(text) and pd.isna(text):
	            return ""
	        text = str(text)

	    lowered = text.lower()
	    # Keep letters, digits and whitespace; drop punctuation and symbols.
	    kept = ''.join(char for char in lowered if char.isalnum() or char.isspace())
	    # split() with no argument discards empty pieces, collapsing whitespace.
	    return ' '.join(kept.split())

	def preprocess_text_columns(df, text_columns):
	    """Clean the listed text columns of ``df`` in place and return ``df``.

	    For each column, missing entries are first replaced with the placeholder
	    "No info provided", then every value is passed through ``clean_text``.

	    Args:
	        df: DataFrame holding the columns; it is modified in place.
	        text_columns: Iterable of column names to clean.

	    Returns:
	        The same ``df`` object, with the listed columns overwritten.
	    """
	    for name in text_columns:
	        filled = df[name].fillna("No info provided")
	        df[name] = filled.apply(clean_text)
	    return df

	# ------------------------
	# Tokenization of textual Columns
	# ------------------------

	def tokenize_text_columns(df, textual_columns, tokenizer, batch_size=50, max_length=256):
	    """Tokenize several text columns in fixed-size batches for inference.

	    Args:
	        df (pd.DataFrame): DataFrame containing the textual columns.
	        textual_columns (list): Names of the columns to tokenize.
	        tokenizer: Callable invoked as ``tokenizer(batch, padding="max_length",
	            truncation=True, max_length=max_length, return_tensors="pt")``.
	            Its result must provide ``"input_ids"`` and ``"attention_mask"``
	            tensors (e.g. a HuggingFace tokenizer).
	        batch_size (int): Number of rows handed to the tokenizer per call.
	        max_length (int): Maximum token length per sequence.

	    Returns:
	        dict: Maps each column name to a dict with two tensors,
	        ``"input_ids"`` and ``"attention_mask"``, each the row-wise
	        concatenation of all batches in the original row order. For a
	        column with no rows, both tensors are empty with shape
	        ``(0, max_length)``.

	    Raises:
	        ValueError: If ``batch_size`` is smaller than 1.
	    """
	    if batch_size < 1:
	        # A negative step would silently produce no batches at all.
	        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

	    def tokenize_in_batches(column_texts):
	        # Convert once; each batch is then a plain positional list slice.
	        texts = column_texts.tolist()
	        if not texts:
	            # torch.cat rejects an empty list, so build the empty result here.
	            width = max_length if max_length is not None else 0
	            return {
	                "input_ids": torch.empty((0, width), dtype=torch.long),
	                "attention_mask": torch.empty((0, width), dtype=torch.long),
	            }

	        tokenized_batches = [
	            tokenizer(
	                texts[start:start + batch_size],
	                padding="max_length",
	                truncation=True,
	                max_length=max_length,
	                return_tensors="pt",
	            )
	            for start in range(0, len(texts), batch_size)
	        ]
	        # Stack the per-batch tensors along the row dimension.
	        return {
	            "input_ids": torch.cat([batch["input_ids"] for batch in tokenized_batches], dim=0),
	            "attention_mask": torch.cat([batch["attention_mask"] for batch in tokenized_batches], dim=0),
	        }

	    return {col: tokenize_in_batches(df[col]) for col in textual_columns}