IProject-10 committed on
Commit 0872833 · verified · 1 Parent(s): 4866971

Update app.py

Files changed (1)
  app.py +125 -189
app.py CHANGED
@@ -1,210 +1,146 @@
  # app.py
 
- import os
- import uuid
- import nltk
- import trafilatura
- import chromadb
- import tiktoken
- import gradio as gr
-
- from langchain_core.prompts import ChatPromptTemplate
- from langchain_core.runnables import RunnableLambda, RunnablePassthrough
- from langchain_core.output_parsers import StrOutputParser
- from langchain_together import ChatTogether
- from langchain_community.vectorstores import Chroma
- from sentence_transformers import SentenceTransformer
- from nltk.tokenize import sent_tokenize
- from langchain_huggingface import HuggingFaceEmbeddings
-
-
- # Download NLTK resources
- nltk.download('punkt')
- nltk.download('punkt_tab')
-
- # Initialize tokenizer
- tokenizer = tiktoken.get_encoding("cl100k_base")
-
- # Initialize embedding model
- embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
- embedding_function = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
-
- # Initialize ChromaDB
- chroma_client = chromadb.PersistentClient(path="./chroma_store")
- collection = chroma_client.get_or_create_collection(name="imageonline_chunks")
-
- # Sectioned URL list
- url_dict = {
-     "Website Designing": [
-         "https://www.imageonline.co.in/website-designing-mumbai.html",
-         "https://www.imageonline.co.in/domain-hosting-services-india.html",
-         "https://www.imageonline.co.in/best-seo-company-mumbai.html",
-         "https://www.imageonline.co.in/wordpress-blog-designing-india.html",
-         "https://www.imageonline.co.in/social-media-marketing-company-mumbai.html",
-         "https://www.imageonline.co.in/website-template-customization-india.html",
-         "https://www.imageonline.co.in/regular-website-maintanence-services.html",
-         "https://www.imageonline.co.in/mobile-app-designing-mumbai.html",
-         "https://www.imageonline.co.in/web-application-screen-designing.html"
-     ],
-     "Website Development": [
-         "https://www.imageonline.co.in/website-development-mumbai.html",
-         "https://www.imageonline.co.in/open-source-customization.html",
-         "https://www.imageonline.co.in/ecommerce-development-company-mumbai.html",
-         "https://www.imageonline.co.in/website-with-content-management-system.html",
-         "https://www.imageonline.co.in/web-application-development-india.html"
-     ],
-     "Mobile App Development": [
-         "https://www.imageonline.co.in/mobile-app-development-company-mumbai.html"
-     ],
-     "About Us": [
-         "https://www.imageonline.co.in/about-us.html",
-         "https://www.imageonline.co.in/vision.html",
-         "https://www.imageonline.co.in/team.html"
-     ],
-     "Testimonials": [
-         "https://www.imageonline.co.in/testimonial.html"
-     ]
- }
-
- # Helper functions
- def extract_clean_text(url):
-     try:
-         print(f"🔗 Fetching URL: {url}")
-         downloaded = trafilatura.fetch_url(url)
-         if downloaded:
-             content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
-             print(f"✅ Extracted text from {url}")
-             return content
-         else:
-             print(f"⚠️ Failed to fetch content from {url}")
-     except Exception as e:
-         print(f"❌ Error fetching {url}: {e}")
-     return None
-
- def chunk_text(text, max_tokens=400):
-     sentences = sent_tokenize(text)
-     chunks = []
-     current_chunk = []
-
-     for sentence in sentences:
-         current_chunk.append(sentence)
-         tokens = tokenizer.encode(" ".join(current_chunk))
-         if len(tokens) > max_tokens:
-             current_chunk.pop()
-             chunks.append(" ".join(current_chunk).strip())
-             current_chunk = [sentence]
-
-     if current_chunk:
-         chunks.append(" ".join(current_chunk).strip())
-
-     print(f"📄 Text split into {len(chunks)} chunks.")
-     return chunks
-
- # Check refresh override
- force_refresh = os.getenv("FORCE_REFRESH", "false").lower() == "true"
-
- # Load data into ChromaDB
- if collection.count() == 0 or force_refresh:
-     print("🔄 Loading documents into ChromaDB...")
-     for section, urls in url_dict.items():
-         for url in urls:
-             text = extract_clean_text(url)
-             if not text:
-                 continue
-             chunks = chunk_text(text)
-             embeddings = embedding_model.encode(chunks, convert_to_numpy=True)
-             metadatas = [{"source": url, "section": section} for _ in chunks]
-             ids = [str(uuid.uuid4()) for _ in chunks]
-
-             collection.add(
-                 documents=chunks,
-                 embeddings=embeddings.tolist(),
-                 metadatas=metadatas,
-                 ids=ids
-             )
-     print("✅ Document loading complete.")
- else:
-     print("✅ Using existing ChromaDB collection.")
-
- # Vectorstore & Retriever
- vectorstore = Chroma(
-     client=chroma_client,
-     collection_name="imageonline_chunks",
-     embedding_function=embedding_function
- )
-
- retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
-
- # Together.ai LLM
- llm = ChatTogether(
      model="meta-llama/Llama-3-8b-chat-hf",
      temperature=0.3,
      max_tokens=1024,
-     top_p=0.7,
-     together_api_key=os.getenv("TOGETHER_API_KEY")
  )
 
- # Prompt template (refined)
- prompt = ChatPromptTemplate.from_template("""
- You are a helpful assistant for ImageOnline Web Solutions.
-
- Use ONLY the information provided in the context to answer the user's query.
-
- Context:
- {context}
-
- Question:
- {question}
-
- If the answer is not found in the context, say "I'm sorry, I don't have enough information to answer that."
- """)
-
- # Context retrieval
- def retrieve_and_format(query):
-     docs = retriever.get_relevant_documents(query)
-     context_strings = []
-     for doc in docs:
-         content = doc.page_content
-         metadata = doc.metadata
-         source = metadata.get("source", "")
-         section = metadata.get("section", "")
-         context_strings.append(f"[{section}] {content}\n(Source: {source})")
-     return "\n\n".join(context_strings)
-
- # RAG chain
- rag_chain = (
-     {"context": RunnableLambda(retrieve_and_format), "question": RunnablePassthrough()}
-     | prompt
-     | llm
-     | StrOutputParser()
  )
 
- # Gradio Interface
  def chat_interface(message, history):
      history = history or []
-     history.append(("🧑 You: " + message, "⏳ Generating response..."))
      try:
-         answer = rag_chain.invoke(message)
-         history[-1] = ("🧑 You: " + message, "🤖 Bot: " + answer)
      except Exception as e:
-         error_msg = f"⚠️ Error: {str(e)}"
-         history[-1] = ("🧑 You: " + message, f"🤖 Bot: {error_msg}")
-     return history, history
 
  def launch_gradio():
-     with gr.Blocks() as demo:
          gr.Markdown("# 💬 ImageOnline RAG Chatbot")
-         gr.Markdown("Ask about Website Designing, App Development, SEO, Hosting, etc.")
 
          chatbot = gr.Chatbot()
          state = gr.State([])
 
-         with gr.Row():
-             msg = gr.Textbox(placeholder="Ask your question here...", show_label=False, scale=8)
-             send_btn = gr.Button("📨 Send", scale=1)
 
-         msg.submit(chat_interface, inputs=[msg, state], outputs=[chatbot, state])
-         send_btn.click(chat_interface, inputs=[msg, state], outputs=[chatbot, state])
 
          with gr.Row():
              clear_btn = gr.Button("🧹 Clear Chat")
@@ -212,6 +148,6 @@ def launch_gradio():
 
      return demo
 
- if __name__ == "__main__":
-     demo = launch_gradio()
-     demo.launch()
 
  # app.py
 
+ from llama_index.core import VectorStoreIndex, StorageContext, ServiceContext, Document
+ from llama_index.vector_stores.qdrant import QdrantVectorStore
+ from llama_index.embeddings.huggingface import HuggingFaceEmbedding
+ from llama_index.llms.together import TogetherLLM
+ from llama_index.core import Settings
+ from qdrant_client import QdrantClient
+
+ # === Qdrant Config ===
+ QDRANT_API_KEY = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhY2Nlc3MiOiJtIn0.9Pj8v4ACpX3m5U3SZUrG_jzrjGF-T41J5icZ6EPMxnc"
+ QDRANT_URL = "https://d36718f0-be68-4040-b276-f1f39bc1aeb9.us-east4-0.gcp.cloud.qdrant.io"
+ COLLECTION_NAME = "demo-chatbot"
+
+ # === Embedding & LLM Setup ===
+ embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")
+
+ llm = TogetherLLM(
      model="meta-llama/Llama-3-8b-chat-hf",
+     api_key="a36246d65d8290f43667350b364c5b6bb8562eb50a4b947eec5bd7e79f2dffc6",
      temperature=0.3,
      max_tokens=1024,
+     top_p=0.7
  )
 
+ Settings.llm = llm
+ Settings.embed_model = embed_model
+
+ # === Qdrant Integration ===
+ qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
+
+ vector_store = QdrantVectorStore(
+     client=qdrant_client,
+     collection_name=COLLECTION_NAME
  )
 
+ # === Build Index ===
+ index = VectorStoreIndex.from_vector_store(vector_store)
+ query_engine = index.as_query_engine(similarity_top_k=5)
+
+ # === Enhanced RAG Chain with References ===
+ def rag_chain(query: str, include_sources: bool = True) -> str:
+     response = query_engine.query(query)
+     response_text = str(response)
+
+     if include_sources:
+         references = get_clickable_references_from_response(response)
+         if references:
+             response_text += "\n\n🔗 **Sources:**\n" + "\n".join(references)
+
+     return response_text
+
+ # === Clickable Reference Links (top-2 from response nodes) ===
+ def get_clickable_references_from_response(response, max_refs: int = 2):
+     seen = set()
+     links = []
+     for node in response.source_nodes:
+         metadata = node.node.metadata
+         section = metadata.get("section", "Unknown")
+         source = metadata.get("source", "Unknown")
+         key = (section, source)
+         if key not in seen:
+             seen.add(key)
+             if source.startswith("http"):
+                 links.append(f"- [{section}]({source})")
+             else:
+                 links.append(f"- {section}: {source}")
+             if len(links) >= max_refs:
+                 break
+     return links
+
+ from datetime import datetime
+ import time
+ import gradio as gr
+
+ # Chat handler
  def chat_interface(message, history):
      history = history or []
+
+     message = message.strip()
+     if not message:
+         raise ValueError("Please enter a valid question.")
+
+     timestamp_user = datetime.now().strftime("%H:%M:%S")
+     user_msg = f"🧑 **You**\n{message}\n\n⏱️ {timestamp_user}"
+     bot_msg = "⏳ _Bot is typing..._"
+     history.append((user_msg, bot_msg))
+
      try:
+         time.sleep(0.5)
+         answer = rag_chain(message)  # already includes references
+         full_response = answer.strip()
+
+         timestamp_bot = datetime.now().strftime("%H:%M:%S")
+         bot_msg = f"🤖 **Bot**\n{full_response}\n\n⏱️ {timestamp_bot}"
+         history[-1] = (user_msg, bot_msg)
+
      except Exception as e:
+         timestamp_bot = datetime.now().strftime("%H:%M:%S")
+         error_msg = f"🤖 **Bot**\n⚠️ {str(e)}\n\n⏱️ {timestamp_bot}"
+         history[-1] = (user_msg, error_msg)
+
+     return history, history, ""
 
+ # Gradio UI
  def launch_gradio():
+     with gr.Blocks(css="""
+         .gr-button {
+             background-color: orange !important;
+             color: white !important;
+             font-weight: bold;
+             border-radius: 6px !important;
+             border: 1px solid darkorange !important;
+         }
+
+         .gr-button:hover {
+             background-color: darkorange !important;
+         }
+
+         .gr-textbox textarea {
+             border: 2px solid orange !important;
+             border-radius: 6px !important;
+             padding: 0.75rem !important;
+             font-size: 1rem;
+         }
+     """) as demo:
+
          gr.Markdown("# 💬 ImageOnline RAG Chatbot")
+         gr.Markdown("Welcome! Ask about Website Designing, Web Development, App Development, About Us, Digital Marketing etc.")
 
          chatbot = gr.Chatbot()
          state = gr.State([])
 
+         with gr.Row(equal_height=True):
+             msg = gr.Textbox(
+                 placeholder="Ask your question here...",
+                 show_label=False,
+                 scale=9
+             )
+             send_btn = gr.Button("🚀 Send", scale=1)
 
+         msg.submit(chat_interface, inputs=[msg, state], outputs=[chatbot, state, msg])
+         send_btn.click(chat_interface, inputs=[msg, state], outputs=[chatbot, state, msg])
 
          with gr.Row():
              clear_btn = gr.Button("🧹 Clear Chat")
 
      return demo
 
+ # Launch
+ demo = launch_gradio()
+ demo.launch()
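Note: the rewritten app.py builds its index with VectorStoreIndex.from_vector_store(vector_store), so it assumes the "demo-chatbot" Qdrant collection is already populated with chunks carrying the "source" and "section" metadata that get_clickable_references_from_response reads; the ingestion loop from the Chroma version is no longer part of this file. Below is a minimal, hedged sketch of a one-off ingestion script for that collection. The script name (ingest.py), the abbreviated url_dict, and the placeholder credentials are illustrative assumptions, not part of this commit; the StorageContext / from_documents calls are the standard LlamaIndex ingestion path rather than anything shown in the diff.

# ingest.py — hypothetical helper, not part of this commit.
# Populates the "demo-chatbot" Qdrant collection with the same
# source/section metadata that app.py's reference links expect.
import trafilatura
from qdrant_client import QdrantClient
from llama_index.core import Document, StorageContext, VectorStoreIndex, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.qdrant import QdrantVectorStore

QDRANT_URL = "https://<your-cluster>.cloud.qdrant.io"   # assumption: same cluster as app.py
QDRANT_API_KEY = "<your-qdrant-api-key>"                # assumption: supplied separately
COLLECTION_NAME = "demo-chatbot"

# Same sectioned URL list as the old Chroma-based app.py (abbreviated here)
url_dict = {
    "Website Designing": ["https://www.imageonline.co.in/website-designing-mumbai.html"],
    "About Us": ["https://www.imageonline.co.in/about-us.html"],
}

# Same embedding model as app.py, so stored vectors match query-time vectors
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-base-en-v1.5")

documents = []
for section, urls in url_dict.items():
    for url in urls:
        downloaded = trafilatura.fetch_url(url)
        text = trafilatura.extract(downloaded, include_comments=False, include_tables=False) if downloaded else None
        if text:
            # Metadata keys must match what get_clickable_references_from_response reads
            documents.append(Document(text=text, metadata={"source": url, "section": section}))

client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
vector_store = QdrantVectorStore(client=client, collection_name=COLLECTION_NAME)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Chunking and embedding use LlamaIndex defaults; vectors are written into Qdrant
index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)
print(f"Ingested {len(documents)} documents into '{COLLECTION_NAME}'")

Once the collection is filled this way, app.py's rag_chain("What services does ImageOnline offer?") would return an answer followed by the clickable source links.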