from tqdm import tqdm

import os
import pandas as pd
import json
import re
import threading
import traceback
from concurrent.futures import ThreadPoolExecutor

from transformers import AutoTokenizer

import warnings
warnings.filterwarnings("ignore")

os.environ['HF_TOKEN'] = "YOUR HUGGINGFACE ACCESS TOKEN"
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "YOUR HUGGINGFACE ACCESS TOKEN"

''' directory tree
.root
├── Training
│   ├── cs
│   ├── dasan
│   ├── finance
│   └── health
└── Validation
    ├── cs
    ├── dasan
    ├── finance
    └── health
'''
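# Expected layout: the raw .json files live inside each category folder; this
# script writes '<category>-preprocessed' JSONs and a 'temp_tsvs' folder next to them.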

root_dir = 'SET YOUR DATA DIR PATH TO root'
is_train = "Training"  # set to "Validation" to preprocess the validation split

category = ['dasan', 'finance', 'health', 'cs']

# Gather every .json annotation file under <root>/<split>/<category>.
filedirpaths = []
for c in category:
    filedirpaths.append(os.path.join(root_dir, is_train, c))

filepaths = []
for fdp in filedirpaths:
    filenames = os.listdir(fdp)
    for filename in filenames:
        filepaths.append(os.path.join(fdp, filename))
filepaths = [p for p in filepaths if p.endswith('.json')]
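# filepaths now looks like ['<root>/Training/dasan/<file>.json', ...]; the exact
# file names depend on your copy of the dataset.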


ddf_columns = ['category', 'seq_id', 'dialogue']
dialogue_df = pd.DataFrame(
    data=[['' for _ in ddf_columns]],
    columns=ddf_columns
)
dialogue_df_lock = threading.Lock()
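# dialogue_df starts with one dummy sentinel row (stripped later via iloc[1:]);
# the lock serializes appends from the ThreadPoolExecutor workers below.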


def data_preprocessing(filepath):
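    """Preprocess one raw annotation JSON file into chat-formatted rows.

    Flattens the JSON to a cached .tsv, rebuilds each call into alternating
    role/content turns, masks private tokens, drops short filler turns, and
    appends the chat-template-formatted dialogues to the shared dialogue_df.
    """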
    ddf_columns = ['category', 'seq_id', 'dialogue']
    # short_sentences.json lists phrases whose short occurrences are filtered out below.
    with open('SET PATH of short_sentences.json', 'r', encoding='utf-8') as f:
        shorts_to_filter = json.load(f)

    def set_private_token(text):
        # Mask character runs (xx.., OO.., 00.., --, ㅇㅇ..) that appear to be
        # redacted personal information in the source transcripts.
        pattern = r"(x{2,}|X{2,}|o{2,}|O{2,}|0{2,}|\-{2,}|ㅇ{2,})"
        return re.sub(pattern, '<|private|>', text)
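    # Example: set_private_token("card no. xxxx-oooo") -> "card no. <|private|>-<|private|>"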

    def merge_continuous_role(conversation):
        # Collapse consecutive turns that share a role into one turn.
        merged_conversation = []
        current_entry = None

        for entry in conversation:
            if current_entry is None:
                current_entry = entry
            elif current_entry['role'] == entry['role']:
                current_entry['content'] += ' ' + entry['content']
            else:
                merged_conversation.append(current_entry)
                current_entry = entry

        if current_entry is not None:
            merged_conversation.append(current_entry)
        return merged_conversation
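    # e.g. [{'role': 'user', 'content': 'a'}, {'role': 'user', 'content': 'b'},
    #       {'role': 'assistant', 'content': 'c'}]
    #   -> [{'role': 'user', 'content': 'a b'}, {'role': 'assistant', 'content': 'c'}]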

    try:
        global dialogue_df

        tokenizer = AutoTokenizer.from_pretrained("song9/CC-KuLLM3-LoRA")
        savepath = f"{filepath.split('.json')[0]}.tsv"
        columns = ['domain', 'category', 'id', 'speaker', 'sequence', 'cust_intent',
                   'couns_intent', 'QA', 'cust_q', 'couns_q', 'cust_a', 'couns_a',
                   'entities', 'dictionary', 'knowledge_base']

        # Flatten the raw annotation JSON into a TSV once; later runs reuse it.
        if os.path.exists(savepath):
            df = pd.read_csv(savepath, sep='\t')
        else:
            df = pd.DataFrame(data=[['0' for _ in range(len(columns))]], columns=columns)
            with open(filepath, 'r', encoding='utf-8') as f:
                data = json.load(f)
            for d in tqdm(data):
                df = pd.concat([df, pd.DataFrame(data=[list(d.values())], columns=columns)], axis=0)
            df = df.iloc[1:]  # drop the sentinel row
            df.reset_index(inplace=True, drop=True)
            df.to_csv(savepath, sep='\t', index=False)
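        # NOTE: building df by concatenating one row at a time (above) is O(n^2);
        # collecting the rows in a list and calling pd.DataFrame once would be faster.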

        df['sequence'] = df['sequence'].astype(int)

        # Per-file accumulator, again seeded with a sentinel row.
        temp_dialogue_df = pd.DataFrame(
            data=[['' for _ in ddf_columns]],
            columns=ddf_columns
        )

        for seq_id in tqdm(df['id'].unique()):
            sequence_df = df[df['id'] == seq_id]
            # Skip calls with fewer than 9 utterance rows.
            if len(sequence_df) < 9:
                continue

            seq_category = os.path.basename(os.path.dirname(filepath))
            category_dirpath = os.path.join(root_dir, is_train, f'{seq_category}-preprocessed')
            # makedirs with exist_ok avoids a race between worker threads.
            os.makedirs(category_dirpath, exist_ok=True)
            savepath_processed_json = os.path.join(category_dirpath, f'{seq_category}_{seq_id}.json')

            sequence_df = sequence_df.sort_values('sequence', ascending=True)
            sequence_df.reset_index(inplace=True, drop=True)
            dialogue = []
            past_role = ''

            for i, row in sequence_df.iterrows():
                # '상담사' (counselor) rows become assistant turns; all others are user turns.
                if row['speaker'] == '상담사':
                    role = 'assistant'
                else:
                    role = 'user'

                # Concatenate whichever of the four utterance fields are filled in this row.
                c = ''.join(filter(lambda x: not pd.isna(x),
                                   row[['cust_q', 'cust_a', 'couns_q', 'couns_a']].tolist())).strip()
                c = set_private_token(c)
                if past_role == role:
                    # Same speaker as the previous turn: extend it in place.
                    dialogue[-1]['content'] = dialogue[-1]['content'] + ' ' + c
                    past_role = role
                    continue
                past_role = role
                dialogue.append({'role': role, 'content': c})

            # Force the dialogue to start with a user turn carrying the
            # <|startofcall|> marker ...
            if dialogue[0].get('role') == 'assistant':
                dialogue.insert(0, {'role': 'user', 'content': '<|startofcall|>'})
            else:
                dialogue[0]['content'] = '<|startofcall|>' + dialogue[0]['content']

            # ... and to end with an assistant turn.
            if dialogue[-1].get('role') == 'user':
                dialogue = dialogue[:-1]
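            # (Assumption: downstream training computes loss on assistant turns,
            # so a trailing unanswered user turn would add no signal.)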

            # Remove short filler turns: any turn (other than the first two and the
            # second-to-last) that contains a phrase from short_sentences.json and is
            # shorter than 9 characters is dropped; when that leaves two same-role
            # turns adjacent, they are merged.
            new_dialogue = []
            dialogue_copy = dialogue.copy()
            for i, turn in enumerate(dialogue_copy):
                if i < 2 or i == len(dialogue_copy) - 2:
                    # Always keep the opening turns and the second-to-last turn.
                    new_dialogue.append(turn)
                    continue
                content = turn['content']
                if any(x in content for x in shorts_to_filter) and len(content) < 9:
                    continue  # drop the short filler turn
                if turn['role'] != new_dialogue[-1]['role']:
                    new_dialogue.append(turn)
                else:
                    new_dialogue[-1]['content'] += " " + content

            # The second-to-last turn is kept without a role check, so collapse any
            # remaining same-role pairs.
            new_dialogue = merge_continuous_role(new_dialogue)

            dialogue = [
                {
                    'dialogue': new_dialogue
                }
            ]

            # Persist the cleaned dialogue for this call as its own JSON file.
            with open(savepath_processed_json, 'w', encoding='utf-8') as f:
                json.dump(dialogue, f, ensure_ascii=False)
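            # With tokenize=False, apply_chat_template returns the formatted
            # prompt string rather than token ids.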

            chat_template_formatted_dialogue = tokenizer.apply_chat_template(dialogue[0]['dialogue'], tokenize=False)

            temp_dialogue_df = pd.concat([temp_dialogue_df, pd.DataFrame(
                data=[[seq_category, seq_id, chat_template_formatted_dialogue]],
                columns=ddf_columns
            )])

        # Drop the sentinel row, write this file's dialogues to temp_tsvs, then
        # merge them into the shared DataFrame under the lock.
        temp_dialogue_df = temp_dialogue_df.iloc[1:]
        os.makedirs(os.path.join(root_dir, is_train, 'temp_tsvs'), exist_ok=True)
        tddf_savepath = os.path.join(root_dir, is_train, 'temp_tsvs', os.path.basename(filepath).replace('.json', '.tsv'))
        temp_dialogue_df.to_csv(tddf_savepath, sep='\t', index=False)

        with dialogue_df_lock:
            dialogue_df = pd.concat([dialogue_df, temp_dialogue_df], axis=0)
    except Exception as e:
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {e}")
        traceback.print_exc()


with ThreadPoolExecutor(max_workers=15) as executor:
    executor.map(data_preprocessing, filepaths)
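# NOTE: executor.map only re-raises worker exceptions when its results are
# iterated; since they are not, data_preprocessing catches and prints errors itself.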

dialogue_df = dialogue_df.iloc[1:]  # drop the initial sentinel row
savepath_dialogue_df = os.path.join(root_dir, is_train, 'train_no_short.tsv')
dialogue_df.to_csv(savepath_dialogue_df, sep='\t', index=False)
print('✅ dialogue_df saved')