Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -56,6 +56,49 @@ async def brave_search(query, count=1):
|
|
56 |
|
57 |
|
58 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
# pipeline_lock = asyncio.Lock()
|
60 |
|
61 |
@traceable
|
@@ -72,19 +115,32 @@ async def query_teapot(prompt, context, user_input):
|
|
72 |
|
73 |
@log_time
|
74 |
async def handle_chat(user_input):
|
75 |
-
search_start_time = time.time()
|
76 |
-
results = await brave_search(user_input)
|
77 |
-
search_end_time = time.time()
|
78 |
-
|
79 |
-
documents = [desc.replace('<strong>', '').replace('</strong>', '') for _, desc, _ in results]
|
80 |
|
81 |
-
|
|
|
82 |
prompt = """You are Teapot, an open-source AI assistant optimized for low-end devices, providing short, accurate responses without hallucinating while excelling at information extraction and text summarization."""
|
|
|
|
|
|
|
83 |
|
|
|
|
|
|
|
|
|
|
|
84 |
# Custom prompt shims
|
85 |
if "translate" in user_input:
|
86 |
context=""
|
87 |
prompt=""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
|
89 |
generation_start_time = time.time()
|
90 |
response = await query_teapot(prompt, context, user_input)
|
|
|
56 |
|
57 |
|
58 |
|
59 |
+
import re
|
60 |
+
import urllib.request
|
61 |
+
import html # For decoding HTML escape codes
|
62 |
+
|
63 |
+
# Function to extract the first URL from the text and remove others
|
64 |
+
def extract_first_url(query):
    """Split a user query into its free text and the first URL it mentions.

    Every ``http://`` / ``https://`` token is removed from the query text, but
    only the first one is returned; any further URLs are discarded.

    Args:
        query: Raw user input that may contain zero or more URLs.

    Returns:
        A ``(text, url)`` tuple. ``text`` is ``query`` with all URL tokens
        removed (the surrounding whitespace is left untouched). ``url`` is
        the first URL found, with trailing sentence punctuation trimmed, or
        ``None`` when the query contains no URL, in which case ``text`` is
        ``query`` unchanged.
    """
    url_pattern = r'https?://\S+'
    urls = re.findall(url_pattern, query)
    if not urls:
        return query, None

    first_url = urls[0]
    # ``\S+`` also swallows punctuation glued to the end of the URL, as in
    # "see https://example.com/page." or "(https://example.com)". Trim it so
    # the URL can actually be fetched, but keep a closing parenthesis that
    # balances an opening one inside the URL (e.g. Wikipedia article titles).
    while True:
        trimmed = first_url.rstrip('.,;:!?"\'')
        if trimmed.endswith(')') and trimmed.count(')') > trimmed.count('('):
            trimmed = trimmed[:-1]
        if trimmed == first_url:
            break
        first_url = trimmed

    return re.sub(url_pattern, '', query), first_url
|
72 |
+
|
73 |
+
def extract_text_from_html(url, max_words=250, max_chars=2000, timeout=10):
    """Fetch a page and return the leading text of its ``<p>`` elements.

    The paragraph contents are stripped of nested tags, HTML entities are
    decoded, and the paragraphs are joined with single spaces before the
    result is truncated.

    Args:
        url: Address to fetch. It is passed to ``urllib.request.urlopen``
            as-is, so callers should only hand in URLs they are willing to
            have this process request.
        max_words: Maximum number of whitespace-separated words to keep.
        max_chars: Maximum length of the returned string; applied after the
            word limit.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The truncated paragraph text, or an empty string when the page has no
        ``<p>`` elements.

    Raises:
        urllib.error.URLError: If the page cannot be fetched (includes
            ``HTTPError`` for non-success status codes).
    """
    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        raw_body = response.read()
        # Honour the charset the server declared; fall back to UTF-8.
        charset = response.headers.get_content_charset() or 'utf-8'

    try:
        html_content = raw_body.decode(charset, errors='replace')
    except LookupError:
        # The server advertised a charset label Python does not know.
        html_content = raw_body.decode('utf-8', errors='replace')

    # Match <p> with or without attributes, in any letter case; the word
    # boundary keeps tags such as <pre> or <param> from matching.
    paragraphs = re.findall(
        r'<p\b[^>]*>(.*?)</p>', html_content, re.DOTALL | re.IGNORECASE
    )

    # Strip nested tags first and decode entities second, so that escaped
    # markup such as "&lt;b&gt;" survives as literal text.
    decoded_text = [html.unescape(re.sub(r'<.*?>', '', p)) for p in paragraphs]

    full_text = ' '.join(decoded_text)

    # Apply the word limit, then the character limit.
    first_words = ' '.join(full_text.split()[:max_words])
    return first_words[:max_chars]
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
# pipeline_lock = asyncio.Lock()
|
103 |
|
104 |
@traceable
|
|
|
115 |
|
116 |
@log_time
|
117 |
async def handle_chat(user_input):
|
|
|
|
|
|
|
|
|
|
|
118 |
|
119 |
+
|
120 |
+
### Handle logic for scraping, search or translation
|
121 |
prompt = """You are Teapot, an open-source AI assistant optimized for low-end devices, providing short, accurate responses without hallucinating while excelling at information extraction and text summarization."""
|
122 |
+
|
123 |
+
# Check if there's a URL and process the input
|
124 |
+
processed_query, url = extract_first_url(user_query)
|
125 |
|
126 |
+
# If there's a URL, fetch the context
|
127 |
+
if url:
|
128 |
+
context = extract_text_from_html(url)
|
129 |
+
user_input = processed_query
|
130 |
+
else:
|
131 |
# Custom prompt shims
|
132 |
if "translate" in user_input:
|
133 |
context=""
|
134 |
prompt=""
|
135 |
+
|
136 |
+
else: # Search task
|
137 |
+
search_start_time = time.time()
|
138 |
+
results = await brave_search(user_input)
|
139 |
+
search_end_time = time.time()
|
140 |
+
|
141 |
+
documents = [desc.replace('<strong>', '').replace('</strong>', '') for _, desc, _ in results]
|
142 |
+
|
143 |
+
context = "\n".join(documents)
|
144 |
|
145 |
generation_start_time = time.time()
|
146 |
response = await query_teapot(prompt, context, user_input)
|