|
import re |
|
import requests |
|
from markdownify import markdownify |
|
from requests.exceptions import RequestException |
|
import gradio as gr |
|
|
|
from utils.preprocessor import Preprocessor |
|
|
|
def _truncate_middle(text, max_length):
    """Keep the start and end of *text*, replacing the middle with a notice.

    Text whose length does not exceed ``max_length`` is returned unchanged.
    Otherwise the first ``max_length // 2`` characters and the remaining
    ``max_length - max_length // 2`` trailing characters are kept, joined by
    a truncation notice (the notice itself is not counted in the budget).
    """
    if len(text) <= max_length:
        return text

    # Clamp so a zero/negative limit keeps nothing instead of producing
    # surprising slices (e.g. ``text[-0:]`` is the *whole* string).
    budget = max(max_length, 0)
    head_len = budget // 2
    tail_len = budget - head_len

    return (
        text[:head_len]
        + f"\n..._This content has been truncated to stay below {max_length} characters_...\n"
        # Index from the front: ``text[len(text):]`` is "" when tail_len is 0.
        + text[len(text) - tail_len :]
    )


def visit_webpage(url: str, max_output_length: int = 40000) -> str:
    """Fetch a webpage and report the directory and file paths found in it.

    The page is downloaded, converted to Markdown, collapsed so that runs of
    three or more newlines become a single blank line, and truncated in the
    middle if it is longer than ``max_output_length``. The result is passed
    to ``Preprocessor.extract_section`` and then
    ``Preprocessor.extract_dirs_from_text``.

    Args:
        url: Address of the page to fetch.
        max_output_length: Maximum number of Markdown characters kept before
            extraction (the truncation notice is added on top of this).

    Returns:
        A string of the form ``"paths: <dir_paths>\\n\\nfiles: <files>"`` on
        success. On failure no exception is raised; a human-readable error
        message is returned instead.
    """
    try:
        # The timeout keeps the UI from hanging on an unresponsive host.
        response = requests.get(url, timeout=20)
        response.raise_for_status()

        markdown_content = markdownify(response.text).strip()
        markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
        # NOTE(review): truncation happens before extraction, so the notice
        # text becomes part of what Preprocessor sees - confirm this is
        # intended.
        markdown_content = _truncate_middle(markdown_content, max_output_length)

        section = Preprocessor.extract_section(markdown_content)
        dir_paths, files = Preprocessor.extract_dirs_from_text(section)

        return f"paths: {dir_paths}\n\nfiles: {files}"
    except requests.exceptions.Timeout:
        # Must precede RequestException, of which Timeout is a subclass.
        return "The request timed out. Please try again later or check the URL."
    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        # UI boundary: report the failure as text rather than crashing the app.
        return f"An unexpected error occurred: {str(e)}"
|
|
|
# Gradio front end: a single URL text box in, a single text box out,
# with visit_webpage doing the work in between.
demo = gr.Interface(
    title="Webpage Section and Path Extractor",
    fn=visit_webpage,
    inputs=gr.Textbox(label="Website URL"),
    outputs=gr.Textbox(label="Extracted Section, Directory Paths, and File Paths"),
)


# Launch the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()