Spaces:

georgeek
/

HF-LLM-Intent-Detection

Running

App Files Files Community

HF-LLM-Intent-Detection / src /A_Preprocess.py

georgeek

Transfer

5ecde30 9 days ago

raw

history blame contribute delete

2.04 kB

	import os
	import re

	import pandas as pd



	def load_pdf_data(file_path):
	    """Read the CSV file at *file_path* into a DataFrame.

	    Despite the function name, the file is parsed with ``pd.read_csv``; it is
	    decoded as ISO-8859-1.

	    Args:
	        file_path: Location of the CSV file to read.

	    Returns:
	        The DataFrame produced by ``pd.read_csv``.
	    """
	    return pd.read_csv(file_path, encoding="ISO-8859-1")

	def clean_text(text):
	    """Normalize an utterance to lowercase, digit-free, space-separated words.

	    Args:
	        text: Raw utterance. ``None`` and float NaN (what pandas yields for
	            an empty CSV cell) are treated as empty text; any other
	            non-string value is converted with ``str()`` first.

	    Returns:
	        The lowercased text with all digits removed, every run of non-word
	        characters collapsed to a single space, and surrounding whitespace
	        stripped. Underscores count as word characters and are kept.
	    """
	    # NaN is the only float that is not equal to itself.
	    if text is None or (isinstance(text, float) and text != text):
	        return ''
	    text = str(text).lower()
	    # Strip digits BEFORE collapsing separators; doing it afterwards leaves a
	    # double space wherever a standalone number used to be ("a 12 b" -> "a  b").
	    text = re.sub(r'\d+', '', text)
	    text = re.sub(r'\W+', ' ', text)
	    return text.strip()

	def preprocess_data(data):
	    """Run ``clean_text`` over the 'utterance' column and return the frame.

	    The column is overwritten on the DataFrame that was passed in, so the
	    caller's object is modified; the same object is returned.

	    Args:
	        data: DataFrame containing an 'utterance' column.

	    Returns:
	        The same DataFrame, with 'utterance' replaced by its cleaned values.
	    """
	    cleaned_utterances = data['utterance'].apply(clean_text)
	    data['utterance'] = cleaned_utterances
	    return data

	def save_filter_intents(data, file_path, *, samples_per_intent=15, random_state=None):
	    """Save a per-intent down-sampled copy of *data* as a CSV file.

	    For every distinct value in the 'intent' column, at most
	    ``samples_per_intent`` rows are drawn at random; intents with fewer rows
	    are kept whole. The result is written to ``Pager_filtered_Intents.csv``
	    inside *file_path*, without the index.

	    Args:
	        data: DataFrame containing an 'intent' column.
	        file_path: Directory in which the CSV file is created.
	        samples_per_intent: Maximum number of rows kept per intent.
	        random_state: Optional seed forwarded to ``DataFrame.sample`` to
	            make the selection reproducible.
	    """
	    # Iterating the groups (rather than groupby.apply) always hands over the
	    # full rows, 'intent' column included; apply() on the grouping column is
	    # deprecated in recent pandas releases.
	    sampled_groups = [
	        group.sample(n=min(len(group), samples_per_intent), random_state=random_state)
	        for _, group in data.groupby('intent')
	    ]
	    # pd.concat rejects an empty list, so fall back to an empty frame that
	    # still carries the original columns.
	    filtered_df = pd.concat(sampled_groups) if sampled_groups else data.iloc[0:0]
	    # os.path.join replaces a hand-written '\P...' suffix, which was both an
	    # invalid string escape and a Windows-only path separator.
	    filtered_df.to_csv(os.path.join(file_path, 'Pager_filtered_Intents.csv'), index=False)

	def save_cleaned_data(data, file_path):
	    """Write *data* to ``Pager_Intents_cleaned.csv`` inside *file_path*.

	    Args:
	        data: DataFrame to save; the index is not written.
	        file_path: Directory in which the CSV file is created.
	    """
	    # os.path.join replaces a hand-written '\P...' suffix, which was both an
	    # invalid string escape and a Windows-only path separator.
	    data.to_csv(os.path.join(file_path, 'Pager_Intents_cleaned.csv'), index=False)

	# data = load_pdf_data(r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\data\Pager_Intents.csv')
	# cleaned_data = preprocess_data(data)
	# save_cleaned_data(data, r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\data')
	# save_filter_intents(cleaned_data, r'C:\Users\serban.tica\Documents\tobi_llm_intent_recognition\data')

	# split train test
	def split_train_test(data, test_size=0.3, random_state=42):
	    """Randomly partition *data* into a training and a testing DataFrame.

	    Args:
	        data: DataFrame to split.
	        test_size: Fraction of the rows assigned to the test set.
	        random_state: Seed for the random selection of training rows.

	    Returns:
	        A ``(train_data, test_data)`` tuple. The training rows come back in
	        sampled order; the test rows keep their original relative order.
	    """
	    n_rows = len(data)
	    # Sample row POSITIONS rather than index labels: dropping by label would
	    # also discard every other row sharing a label with a training row, which
	    # shrinks the test set when the index contains duplicates.
	    train_positions = (
	        pd.Series(range(n_rows))
	        .sample(frac=1 - test_size, random_state=random_state)
	        .to_numpy()
	    )
	    train_data = data.iloc[train_positions]
	    chosen = set(train_positions.tolist())
	    test_positions = [pos for pos in range(n_rows) if pos not in chosen]
	    test_data = data.iloc[test_positions]
	    print(f"Train data shape: {train_data.shape}")

	    return train_data, test_data

	def save_train_test_data(train_data, test_data, file_path):
	    """Write the two splits to ``train_data.csv`` and ``test_data.csv``.

	    Args:
	        train_data: DataFrame holding the training rows.
	        test_data: DataFrame holding the testing rows.
	        file_path: Directory in which both CSV files are created.
	    """
	    # os.path.join picks the separator of the running platform; a hard-coded
	    # backslash would create a file literally named 'dir\train_data.csv' on
	    # POSIX systems.
	    train_data.to_csv(os.path.join(file_path, 'train_data.csv'), index=False)
	    test_data.to_csv(os.path.join(file_path, 'test_data.csv'), index=False)


	# train_data, test_data = split_train_test(cleaned_data)
	# save_train_test_data(train_data, test_data, 'C:\\Users\\serban.tica\\Documents\\tobi_llm_intent_recognition\\data')