Create model.py
model.py
ADDED
@@ -0,0 +1,246 @@
import numpy as np
import pandas as pd
import requests
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import speech_recognition as sr
import pyttsx3
from googlesearch import search
from bs4 import BeautifulSoup
import urllib.request
from urllib.parse import quote


class HybridChatBot:
    def __init__(self, dataset_url=None):
        self.dataset_url = dataset_url
        self.qa_pairs = {}
        self.vectorizer = TfidfVectorizer()
        self.X = None
        self.recognizer = sr.Recognizer()
        self.engine = pyttsx3.init()

        # Voice engine settings
        voices = self.engine.getProperty('voices')
        self.engine.setProperty('voice', voices[0].id)
        self.engine.setProperty('rate', 150)

        if dataset_url:
            self.load_dataset()
            self.train()

    def load_dataset(self):
        """Load dataset from web resource"""
        try:
            response = requests.get(self.dataset_url)
            response.raise_for_status()

            if self.dataset_url.endswith('.csv'):
                data = pd.read_csv(StringIO(response.text))
            elif self.dataset_url.endswith('.json'):
                data = pd.read_json(StringIO(response.text))
            else:
                print("File format not supported")
                return

            for _, row in data.iterrows():
                self.qa_pairs[row["question"].lower()] = row["answer"]

            print(f"Loaded {len(self.qa_pairs)} question-answer pairs")

        except Exception as e:
            print(f"Error loading dataset: {e}")

    def train(self):
        """Train the model on loaded data"""
        if not self.qa_pairs:
            print("No data available for training!")
            return

        questions = list(self.qa_pairs.keys())
        self.X = self.vectorizer.fit_transform(questions)
        print("Model trained on loaded data")

    def add_qa_pair(self, question, answer):
        """Add new question-answer pair"""
        self.qa_pairs[question.lower()] = answer
        self.train()

    def web_search(self, query, num_results=3):
        """Perform web search and extract information"""
        try:
            print(f"\nSearching the web: {query}")
            search_results = []

            # Perform Google search
            for url in search(query, num_results=num_results, lang='en'):
                try:
                    # Get page content
                    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
                    with urllib.request.urlopen(req, timeout=5) as response:
                        html = response.read()

                    # Parse HTML
                    soup = BeautifulSoup(html, 'html.parser')

                    # Remove unwanted elements (scripts, styles, etc.)
                    for script in soup(["script", "style", "iframe", "nav", "footer"]):
                        script.extract()

                    # Get page text
                    text = soup.get_text(separator=' ', strip=True)
                    text = ' '.join(text.split()[:200])  # Take first 200 words

                    search_results.append({
                        'url': url,
                        'content': text
                    })

                except Exception as e:
                    print(f"Error processing {url}: {e}")
                    continue

            return search_results

        except Exception as e:
            print(f"Search error: {e}")
            return None

    def get_response(self, user_input):
        """Get response to user input"""
        if not self.qa_pairs:
            return "I'm not trained yet. Please add questions and answers."

        # Check if user wants to perform web search
        if "search the web for" in user_input.lower() or "find online" in user_input.lower():
            query = user_input.replace("search the web for", "").replace("find online", "").strip()
            search_results = self.web_search(query)
            if search_results:
                response = "Here's what I found online:\n"
                for i, result in enumerate(search_results, 1):
                    response += f"\n{i}. {result['content']}\n(Source: {result['url']})\n"
                return response[:2000]  # Limit response length
            else:
                return "Couldn't find any information online."

        # Regular question-answer search
        user_vec = self.vectorizer.transform([user_input.lower()])
        similarities = cosine_similarity(user_vec, self.X)
        best_match_idx = np.argmax(similarities)
        best_match_score = similarities[0, best_match_idx]

        if best_match_score > 0.5:
            best_question = list(self.qa_pairs.keys())[best_match_idx]
            return self.qa_pairs[best_question]
        else:
            return "I don't know the answer to this question. Would you like me to search online? (Say 'search the web for...')"

    def text_to_speech(self, text):
        """Convert text to speech"""
        self.engine.say(text)
        self.engine.runAndWait()

    def speech_to_text(self):
        """Convert speech from microphone to text"""
        with sr.Microphone() as source:
            print("\nSpeak now...")
            self.recognizer.adjust_for_ambient_noise(source)
            try:
                audio = self.recognizer.listen(source, timeout=5)
                text = self.recognizer.recognize_google(audio, language="en-US")
                print(f"Recognized: {text}")
                return text
            except sr.UnknownValueError:
                print("Speech not recognized")
                return None
            except sr.RequestError:
                print("Recognition service error")
                return None
            except sr.WaitTimeoutError:
                print("Timeout expired")
                return None

    def run(self):
        """Improved interaction interface"""
        print("\n" + "=" * 50)
        print("WELCOME TO INTELLIGENT CHATBOT".center(50))
        print("=" * 50)

        current_mode = "text"
        while True:
            print("\n" + "-" * 50)
            print(f"Current input mode: {current_mode.upper()}")
            print("[1] Send text message")
            print("[2] Speak to the bot")
            print("[3] Switch input mode")
            print("[4] Teach the bot a new answer")
            print("[5] Web search")
            print("[6] Exit")

            try:
                choice = input("Choose action (1-6): ").strip()

                if choice == "1":
                    user_input = input("\nYour message: ")
                    if user_input.lower() in ["exit", "stop"]:
                        break

                    response = self.get_response(user_input)
                    if response:
                        print(f"\nBot: {response}")
                        self.text_to_speech(response)
                    else:
                        print("\nBot: I don't know what to say. Would you like to teach me?")

                elif choice == "2":
                    user_input = self.speech_to_text()
                    if user_input:
                        if user_input.lower() in ["exit", "stop"]:
                            break

                        response = self.get_response(user_input)
                        if response:
                            print(f"\nBot: {response}")
                            self.text_to_speech(response)
                        else:
                            print("\nBot: I don't know how to respond to that.")
                            self.text_to_speech("I don't know how to respond to that")

                elif choice == "3":
                    current_mode = "voice" if current_mode == "text" else "text"
                    print(f"\nMode changed to: {current_mode.upper()}")

                elif choice == "4":
                    print("\nTeaching the bot:")
                    question = input("Enter question: ")
                    answer = input("Enter answer: ")
                    self.add_qa_pair(question, answer)
                    print("Bot successfully trained!")

                elif choice == "5":
                    query = input("\nEnter search query: ")
                    search_results = self.web_search(query)
                    if search_results:
                        print("\nSearch results:")
                        for i, result in enumerate(search_results, 1):
                            print(f"\n{i}. {result['content']}\n(Source: {result['url']})\n")
                    else:
                        print("\nNothing found.")

                elif choice == "6":
                    print("\nShutting down...")
                    break

                else:
                    print("\nPlease choose an option between 1 and 6")

            except KeyboardInterrupt:
                print("\nShutting down...")
                break


if __name__ == "__main__":

    DATASET_URL = "https://raw.githubusercontent.com/user/repo/main/qa_data.csv"

    bot = HybridChatBot(DATASET_URL)
    bot.run()
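Usage note (not part of the committed file): load_dataset expects the remote CSV or JSON to expose "question" and "answer" columns, and the class can also be used programmatically without a dataset URL by teaching it pairs at runtime. The sketch below assumes model.py is importable from the working directory and that pyttsx3 can initialise a local TTS engine; it exercises only the TF-IDF matching path, not the voice or web-search features.

    from model import HybridChatBot

    # Start untrained (no dataset URL), then teach a couple of pairs by hand.
    bot = HybridChatBot()
    bot.add_qa_pair("What is your name?", "I'm a hybrid chatbot.")
    bot.add_qa_pair("What can you do?", "I answer questions, search the web and speak.")

    # Matched against the stored questions via TF-IDF cosine similarity.
    print(bot.get_response("what is your name"))

    # Shares no terms with the stored questions, so the score falls below the
    # 0.5 threshold and the fallback reply is returned.
    print(bot.get_response("tell me about quantum physics"))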