|
import os
import re

import pandas as pd
|
|
|
|
|
|
|
def load_pdf_data(file_path, encoding="ISO-8859-1"):
    """Load a CSV file into a DataFrame.

    Despite the function's name, the file is parsed with ``pandas.read_csv``.

    Args:
        file_path: Location of the CSV file (anything ``pandas.read_csv``
            accepts as its first argument).
        encoding: Text encoding used to decode the file. Defaults to
            ISO-8859-1, the value that used to be hard-coded.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    return pd.read_csv(file_path, encoding=encoding)
|
|
|
def clean_text(text):
    """Normalise a piece of text for intent processing.

    The text is lower-cased, digits are removed, every run of non-word
    characters is collapsed to a single space, and surrounding whitespace is
    stripped.

    Args:
        text: The value to clean. Missing values (``None``, NaN, ``pd.NA``)
            yield an empty string; other non-string values are converted with
            ``str`` first.

    Returns:
        The cleaned string.
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as NaN/None and have no .lower().
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ""
        text = str(text)

    text = text.lower()
    # Remove digits *before* collapsing separators; doing it afterwards
    # leaves doubled spaces where a number stood between two words.
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\W+', ' ', text)
    return text.strip()
|
|
|
def preprocess_data(data):
    """Clean the ``utterance`` column of a DataFrame in place.

    Each value of ``data['utterance']`` is passed through ``clean_text`` and
    the column is overwritten with the results.

    Args:
        data: DataFrame containing an ``utterance`` column. It is modified
            in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    cleaned_utterances = data['utterance'].apply(clean_text)
    data['utterance'] = cleaned_utterances
    return data
|
|
|
def save_filter_intents(data, file_path, max_per_intent=15, random_state=None):
    """Write a per-intent random sample of the data to a CSV file.

    At most ``max_per_intent`` rows are kept for every distinct value of the
    ``intent`` column; intents with fewer rows are kept whole. The result is
    written to ``Pager_filtered_Intents.csv`` inside ``file_path`` without the
    index.

    Args:
        data: DataFrame with an ``intent`` column.
        file_path: Directory in which the CSV file is created.
        max_per_intent: Maximum number of rows kept per intent.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the selection reproducible.
    """
    # Shuffle once and take the first rows of each group. This avoids
    # ``groupby.apply`` operating on the grouping column, which newer pandas
    # deprecates and then drops from the result.
    shuffled = data.sample(frac=1, random_state=random_state)
    capped = shuffled.groupby('intent').head(max_per_intent)
    # Stable sort keeps rows of one intent together, in sampled order.
    filtered_df = capped.sort_values('intent', kind='stable')

    # os.path.join replaces the Windows-only '\P...' concatenation, which
    # also contained an invalid escape sequence.
    target = os.path.join(file_path, 'Pager_filtered_Intents.csv')
    filtered_df.to_csv(target, index=False)
|
|
|
def save_cleaned_data(data, file_path):
    """Write the DataFrame to ``Pager_Intents_cleaned.csv``.

    Args:
        data: DataFrame to save.
        file_path: Directory in which the CSV file is created.
    """
    # os.path.join replaces the Windows-only '\P...' concatenation, which
    # also contained an invalid escape sequence.
    target = os.path.join(file_path, 'Pager_Intents_cleaned.csv')
    data.to_csv(target, index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def split_train_test(data, test_size=0.3, random_state=42):
    """Randomly split a DataFrame into train and test parts.

    Prints the shape of the train part.

    Args:
        data: DataFrame to split.
        test_size: Fraction of rows assigned to the test part.
        random_state: Seed for the random selection. The default of 42 is
            the value that used to be hard-coded.

    Returns:
        A ``(train_data, test_data)`` tuple. Train rows are in sampled order;
        test rows keep their original order.
    """
    # Sample row *positions* rather than dropping by index label: dropping
    # by label removes every row sharing that label, which loses test rows
    # whenever the index is not unique.
    positions = pd.Series(range(len(data)))
    train_positions = positions.sample(
        frac=1 - test_size, random_state=random_state
    ).to_numpy()
    is_test = ~positions.isin(train_positions)

    train_data = data.iloc[train_positions]
    test_data = data.iloc[is_test.to_numpy()]
    print(f"Train data shape: {train_data.shape}")

    return train_data, test_data
|
|
|
def save_train_test_data(train_data, test_data, file_path):
    """Write the train and test DataFrames to CSV files.

    The files are named ``train_data.csv`` and ``test_data.csv`` and are
    written without the index.

    Args:
        train_data: DataFrame saved as ``train_data.csv``.
        test_data: DataFrame saved as ``test_data.csv``.
        file_path: Directory in which both files are created.
    """
    # os.path.join picks the platform's separator instead of a hard-coded
    # backslash, which only works as a separator on Windows.
    train_data.to_csv(os.path.join(file_path, 'train_data.csv'), index=False)
    test_data.to_csv(os.path.join(file_path, 'test_data.csv'), index=False)
|
|
|
|
|
|
|
|
|
|