zakerytclarke committed on
Commit
6ebe85c
·
verified ·
1 Parent(s): bc74d1a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -6
app.py CHANGED
@@ -56,6 +56,49 @@ async def brave_search(query, count=1):
56
 
57
 
58
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  # pipeline_lock = asyncio.Lock()
60
 
61
  @traceable
@@ -72,19 +115,32 @@ async def query_teapot(prompt, context, user_input):
72
 
73
  @log_time
74
  async def handle_chat(user_input):
75
- search_start_time = time.time()
76
- results = await brave_search(user_input)
77
- search_end_time = time.time()
78
-
79
- documents = [desc.replace('<strong>', '').replace('</strong>', '') for _, desc, _ in results]
80
 
81
- context = "\n".join(documents)
 
82
  prompt = """You are Teapot, an open-source AI assistant optimized for low-end devices, providing short, accurate responses without hallucinating while excelling at information extraction and text summarization."""
 
 
 
83
 
 
 
 
 
 
84
  # Custom prompt shims
85
  if "translate" in user_input:
86
  context=""
87
  prompt=""
 
 
 
 
 
 
 
 
 
88
 
89
  generation_start_time = time.time()
90
  response = await query_teapot(prompt, context, user_input)
 
56
 
57
 
58
 
59
+ import re
60
+ import urllib.request
61
+ import html # For decoding HTML escape codes
62
+
63
+ # Function to extract the first URL from the text and remove others
64
def extract_first_url(query):
    """Separate the first URL in *query* from the surrounding text.

    Every ``http://`` / ``https://`` URL is removed from the text; only the
    first one found is returned alongside it.

    Args:
        query: Raw user text that may contain zero or more URLs.

    Returns:
        A ``(text, url)`` tuple. ``text`` is *query* with all URLs removed
        and the leftover spacing tidied; ``url`` is the first URL found, or
        ``None`` when *query* contains no URL (in which case ``text`` is
        *query* unchanged).
    """
    url_pattern = r'https?://\S+'
    urls = re.findall(url_pattern, query)
    if not urls:
        return query, None

    remaining = re.sub(url_pattern, '', query)
    # Removing a URL from the middle of a sentence leaves doubled or
    # trailing spaces behind; collapse them but keep any newlines intact.
    remaining = re.sub(r'[ \t]{2,}', ' ', remaining).strip()

    # `\S+` is greedy, so sentence punctuation that directly follows a URL
    # ("see https://example.com.") is captured with it. Such trailing
    # characters are not part of the address and would break the fetch.
    first_url = urls[0].rstrip('.,;:!?\'"')
    return remaining, first_url
72
+
73
def extract_text_from_html(url, max_words=250, max_chars=2000, timeout=10):
    """Fetch *url* and return the leading plain text of its ``<p>`` elements.

    The page is downloaded, the contents of every paragraph element are
    collected, nested tags are stripped, HTML entities are decoded, and the
    result is truncated first by word count and then by character count.

    Args:
        url: Address of the page to download.
        max_words: Maximum number of whitespace-separated words to keep.
        max_chars: Maximum length of the returned string, applied after the
            word limit.
        timeout: Seconds to wait on the network before giving up.

    Returns:
        The extracted text, or an empty string when the page has no
        paragraph elements.

    Raises:
        urllib.error.URLError: If the page cannot be fetched (this includes
            ``urllib.error.HTTPError`` for non-success status codes).

    Note:
        This is a synchronous, blocking call.
    """
    # NOTE(review): `url` comes from user input and is fetched as-is, so this
    # can be pointed at internal addresses (SSRF) — consider an allow-list.
    # The context manager guarantees the connection is released.
    with urllib.request.urlopen(url, timeout=timeout) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or 'utf-8'

    # Honour the charset the server declared; never fail on a stray byte.
    try:
        html_content = raw.decode(charset, errors='replace')
    except LookupError:
        # The server advertised an encoding name Python does not know.
        html_content = raw.decode('utf-8', errors='replace')

    # Match paragraph tags with or without attributes, in any letter case.
    # Requiring whitespace after "p" keeps <pre>, <param>, etc. out.
    paragraphs = re.findall(
        r'<p(?:\s[^>]*)?>(.*?)</p\s*>',
        html_content,
        re.DOTALL | re.IGNORECASE,
    )

    # Strip nested markup ([^>] also covers tags that span lines), then
    # decode HTML entities such as &amp; or &lt;.
    cleaned = [html.unescape(re.sub(r'<[^>]*>', '', p)) for p in paragraphs]

    # Normalise whitespace while applying the word limit.
    words = ' '.join(cleaned).split()
    text = ' '.join(words[:max_words])

    # Slicing is a no-op when the text is already short enough.
    return text[:max_chars]
99
+
100
+
101
+
102
  # pipeline_lock = asyncio.Lock()
103
 
104
  @traceable
 
115
 
116
  @log_time
117
  async def handle_chat(user_input):
 
 
 
 
 
118
 
119
+
120
+ ### Handle logic for scraping, search or translation
121
  prompt = """You are Teapot, an open-source AI assistant optimized for low-end devices, providing short, accurate responses without hallucinating while excelling at information extraction and text summarization."""
122
+
123
+ # Check if there's a URL and process the input
124
+ processed_query, url = extract_first_url(user_query)
125
 
126
+ # If there's a URL, fetch the context
127
+ if url:
128
+ context = extract_text_from_html(url)
129
+ user_input = processed_query
130
+ else:
131
  # Custom prompt shims
132
  if "translate" in user_input:
133
  context=""
134
  prompt=""
135
+
136
+ else: # Search task
137
+ search_start_time = time.time()
138
+ results = await brave_search(user_input)
139
+ search_end_time = time.time()
140
+
141
+ documents = [desc.replace('<strong>', '').replace('</strong>', '') for _, desc, _ in results]
142
+
143
+ context = "\n".join(documents)
144
 
145
  generation_start_time = time.time()
146
  response = await query_teapot(prompt, context, user_input)