Mishal23 commited on
Commit
ed2b381
·
verified ·
1 Parent(s): 9c3ff37

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -0
app.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import gradio as gr
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.utils import resample
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForSeq2Seq

# Step 1: Load the dataset from Hugging Face (Customer Support dataset)
dataset = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")

# Step 2: Sample a subset (20% of the dataset for testing).
# select() accepts any iterable of indices, so pass range() directly
# instead of materializing a throwaway list.
sampled_data = dataset["train"].shuffle(seed=42).select(range(int(len(dataset["train"]) * 0.2)))

# Convert to DataFrame and keep only the two columns used for training.
sampled_data_df = pd.DataFrame(sampled_data)
df_limited = sampled_data_df[['instruction', 'response']]

# Step 3: Handle class imbalance using oversampling.
# Compute the modal response once (the original evaluated mode() twice).
most_common_response = df_limited['response'].mode()[0]
df_majority = df_limited[df_limited['response'] == most_common_response]
df_minority = df_limited[df_limited['response'] != most_common_response]
# Upsample every non-modal row with replacement until the two groups match.
df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)
df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Step 4: Load the pre-trained DialoGPT model and tokenizer
model_name = "microsoft/DialoGPT-medium"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# DialoGPT ships without a dedicated pad token; fall back to the EOS token
# so that padded batches can be built during preprocessing.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 5: Preprocess the data for training
def preprocess_data_for_training(df, max_length=512):
    """Tokenize instruction/response pairs into model-ready tensors.

    Args:
        df: DataFrame with 'instruction' and 'response' string columns.
        max_length: hard cap on tokenized sequence length.

    Returns:
        Dict with 'input_ids', 'attention_mask' and 'labels' tensors whose
        sequence dimensions always match.
    """
    inputs = tokenizer(df['instruction'].tolist(), padding=True, truncation=True,
                       max_length=max_length, return_tensors="pt")
    targets = tokenizer(df['response'].tolist(), padding=True, truncation=True,
                        max_length=max_length, return_tensors="pt")
    input_ids = inputs['input_ids']
    target_ids = targets['input_ids']
    in_len = input_ids.shape[1]
    tgt_len = target_ids.shape[1]
    if tgt_len > in_len:
        # Targets longer than inputs: truncate (as the original did).
        target_ids = target_ids[:, :in_len]
    elif tgt_len < in_len:
        # BUG FIX: the original only truncated, so when the responses
        # tokenized SHORTER than the instructions the shapes still
        # mismatched and training would fail. Right-pad with pad_token_id.
        pad = target_ids.new_full((target_ids.shape[0], in_len - tgt_len),
                                  tokenizer.pad_token_id)
        target_ids = torch.cat([target_ids, pad], dim=1)
    # Shift right by one and start with a pad token (original behavior kept).
    # NOTE(review): pad positions in labels use pad_token_id, not -100, so
    # the loss is also computed on padding — confirm this is intended.
    target_ids = target_ids.roll(1, dims=1)
    target_ids[:, 0] = tokenizer.pad_token_id
    return {'input_ids': input_ids, 'attention_mask': inputs['attention_mask'], 'labels': target_ids}

# Tokenize the balanced frame once, up front; the result is a dict of
# padded tensors ('input_ids', 'attention_mask', 'labels') reused below.
preprocessed_data = preprocess_data_for_training(df_balanced)

# Step 6: Create a custom dataset class for fine-tuning
class ChatbotDataset(Dataset):
    """Dataset over already-tokenized batches.

    `inputs` supplies 'input_ids' and 'attention_mask'; `targets` supplies
    'input_ids', which become the 'labels' of each example.
    """

    def __init__(self, inputs, targets):
        self.inputs = inputs
        self.targets = targets

    def __len__(self):
        # One example per tokenized input row.
        return len(self.inputs['input_ids'])

    def __getitem__(self, idx):
        example = {
            key: self.inputs[key][idx]
            for key in ('input_ids', 'attention_mask')
        }
        example['labels'] = self.targets['input_ids'][idx]
        return example

# BUG FIX: the original passed preprocessed_data as *targets* too, so
# ChatbotDataset.__getitem__ read targets['input_ids'] and the computed
# 'labels' tensor was silently discarded (labels became the input ids).
# Feed the real labels through the targets dict instead.
train_dataset = ChatbotDataset(preprocessed_data,
                               {'input_ids': preprocessed_data['labels']})

# Step 7: Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,               # checkpoint every 10k optimizer steps
    save_total_limit=2,              # keep only the two newest checkpoints
    logging_dir='./logs',
    logging_steps=500,
)

# Step 8: Initialize Trainer
# NOTE(review): DataCollatorForSeq2Seq is normally paired with
# encoder-decoder models; for a causal LM like DialoGPT,
# DataCollatorForLanguageModeling(tokenizer, mlm=False) is the usual
# choice — confirm this collator pads 'labels' as intended here.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Step 9: Fine-tune the model (blocking; writes checkpoints to ./results).
trainer.train()

# Save the trained model and tokenizer side by side so the directory can be
# reloaded later with from_pretrained("./trained_model").
model.save_pretrained("./trained_model")
tokenizer.save_pretrained("./trained_model")

# Optional: Test the chatbot after training
def generate_response(input_text):
    """Generate a reply for *input_text* with the fine-tuned model.

    Note: the full output sequence is decoded, so the returned string
    includes the prompt tokens as well as the generated continuation.
    """
    encoded = tokenizer(input_text, return_tensors="pt")
    # Pass the attention mask explicitly: pad_token == eos_token here, which
    # is exactly the case where generate() cannot infer the mask and warns.
    outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=50,  # total length (prompt + reply), not reply length
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Gradio Interface
def chatbot_interface(input_text):
    """Gradio-facing wrapper: delegate straight to generate_response."""
    reply = generate_response(input_text)
    return reply

# Build and launch a simple text-in/text-out Gradio app. live=True re-runs
# the model as the user types, which can be slow for a generate() call.
iface = gr.Interface(fn=chatbot_interface, inputs="text", outputs="text", live=True)
iface.launch()