Spaces:

teapotai
/

teapotllm_discord_bot

Running

App Files Files Community

zakerytclarke committed on Mar 26

Commit

6ebe85c

verified ·

1 Parent(s): bc74d1a

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -6

app.py CHANGED Viewed

@@ -56,6 +56,49 @@ async def brave_search(query, count=1):
 # pipeline_lock = asyncio.Lock()
 @traceable
@@ -72,19 +115,32 @@ async def query_teapot(prompt, context, user_input):
 @log_time
 async def handle_chat(user_input):
-    search_start_time = time.time()
-    results = await brave_search(user_input)
-    search_end_time = time.time()
-    documents = [desc.replace('<strong>', '').replace('</strong>', '') for _, desc, _ in results]
-    context = "\n".join(documents)
     prompt = """You are Teapot, an open-source AI assistant optimized for low-end devices, providing short, accurate responses without hallucinating while excelling at information extraction and text summarization."""
     # Custom prompt shims
     if "translate" in user_input:
         context=""
         prompt=""
     generation_start_time = time.time()
     response = await query_teapot(prompt, context, user_input)

+import re
+import urllib.request
+import html  # For decoding HTML escape codes
+# Function to extract the first URL from the text and remove others
def extract_first_url(query):
    """Split a user message into its text and the first URL it mentions.

    Every ``http://`` / ``https://`` URL is removed from the text; only the
    first one is returned, so callers can fetch it as context.

    Args:
        query: Raw user message.

    Returns:
        A ``(text, url)`` tuple. When the message contains no URL, ``text`` is
        ``query`` unchanged and ``url`` is ``None``. Otherwise ``text`` is the
        message with all URLs removed (and the whitespace around them
        collapsed) and ``url`` is the first URL found.
    """
    urls = re.findall(r'https?://\S+', query)
    if not urls:
        return query, None

    # Remove each run of URLs together with its surrounding whitespace so the
    # remaining text does not keep double or trailing spaces.
    text = re.sub(r'(?:\s*https?://\S+)+\s*', ' ', query).strip()

    # ``\S+`` also swallows sentence punctuation that directly follows a link
    # ("see https://example.com."), which would make the fetch fail.
    first_url = urls[0].rstrip('.,;:!?\'"')
    # Drop a closing parenthesis only when it is unbalanced, so URLs such as
    # ".../wiki/Tea_(drink)" stay intact while "(see https://x.com)" is fixed.
    while first_url.endswith(')') and first_url.count(')') > first_url.count('('):
        first_url = first_url[:-1]
    return text, first_url
def extract_text_from_html(url, max_words=250, max_chars=2000, timeout=10):
    """Fetch *url* and return a short plain-text excerpt of its paragraphs.

    The page is downloaded, the contents of its ``<p>`` elements are
    extracted with regular expressions, nested tags are stripped, HTML
    entities are decoded, and the result is truncated.

    Args:
        url: Address to fetch with ``urllib.request.urlopen``.
        max_words: Maximum number of whitespace-separated words to keep.
        max_chars: Maximum number of characters in the returned string
            (applied after the word limit).
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The excerpt as a single string; empty when the page has no ``<p>``
        elements.

    Raises:
        urllib.error.URLError: If the URL cannot be opened (includes
            ``HTTPError`` for error status codes).
        ValueError: If *url* is not a valid URL.
    """
    # NOTE: this is a blocking call; async callers may want to run it in an
    # executor. The URL is fetched as given, so callers should only pass
    # URLs they are willing to request on behalf of the user.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or 'utf-8'

    # Pages are not always UTF-8, and a declared charset can be bogus; never
    # let a decoding problem abort the whole request.
    try:
        html_content = raw.decode(charset, errors='replace')
    except LookupError:
        html_content = raw.decode('utf-8', errors='replace')

    # ``<p\b[^>]*>`` accepts attributes (<p class="...">) and, with
    # IGNORECASE, upper-case tags, while ``\b`` keeps <pre>/<param> out.
    paragraphs = re.findall(
        r'<p\b[^>]*>(.*?)</p\s*>', html_content, re.DOTALL | re.IGNORECASE
    )

    # Strip nested tags (``[^>]*`` also covers tags that span lines), then
    # decode entities such as &amp; -> &.
    full_text = ' '.join(
        html.unescape(re.sub(r'<[^>]*>', '', paragraph))
        for paragraph in paragraphs
    )

    # Keep at most ``max_words`` words, then cap at ``max_chars`` characters.
    excerpt = ' '.join(full_text.split()[:max_words])
    return excerpt[:max_chars]
 # pipeline_lock = asyncio.Lock()
 @traceable
 @log_time
 async def handle_chat(user_input):
+    ### Handle logic for scraping, search or translation
     prompt = """You are Teapot, an open-source AI assistant optimized for low-end devices, providing short, accurate responses without hallucinating while excelling at information extraction and text summarization."""
+    # Check if there's a URL and process the input
+    processed_query, url = extract_first_url(user_query)
+    # If there's a URL, fetch the context
+    if url:
+        context = extract_text_from_html(url)
+        user_input = processed_query
+    else:
     # Custom prompt shims
     if "translate" in user_input:
         context=""
         prompt=""
+    else: # Search task
+        search_start_time = time.time()
+        results = await brave_search(user_input)
+        search_end_time = time.time()
+        documents = [desc.replace('<strong>', '').replace('</strong>', '') for _, desc, _ in results]
+        context = "\n".join(documents)
     generation_start_time = time.time()
     response = await query_teapot(prompt, context, user_input)