Spaces:

frascuchon
/

rag-mcp-server

Running

rag-mcp-server / scrape.py

frascuchon HF Staff

prevent unhandled error when getting URL content

504b4eb 5 days ago

1.01 kB

	import requests
	from bs4 import BeautifulSoup


	def get_url_content(url: str) -> str:
	    """
	    Retrieve the text content of a URL.

	    Sends an HTTP GET request with a browser-like ``User-Agent`` header,
	    parses the response body with BeautifulSoup's ``html.parser`` and returns
	    the extracted text with leading/trailing whitespace removed. Failures are
	    reported with ``print`` and are never raised to the caller.

	    :param url: The URL to retrieve content from.
	    :return: The text of the page, or an empty string when the response
	        status code is not 200 or when any error occurs (including a
	        request timeout).
	    """
	    # requests applies no timeout unless one is given, so an unresponsive
	    # host would block the caller forever. A timeout surfaces as an exception
	    # handled by the ``except`` below.
	    timeout_seconds = 15

	    try:
	        response = requests.get(
	            url,
	            headers={
	                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
	            },
	            timeout=timeout_seconds,
	        )

	        if response.status_code != 200:
	            print(f"Failed to retrieve content from {url}. Status code: {response.status_code} - {response.reason}")
	            return ""

	        # parse the html content using BeautifulSoup
	        parser = BeautifulSoup(response.text, 'html.parser')

	        # Read the text once: each access to ``.text`` re-extracts it from
	        # the parsed tree.
	        page_text = parser.text
	        return page_text.strip() if page_text else ""

	    except Exception as e:
	        # Deliberate best-effort boundary: any failure (network, timeout,
	        # parsing) is reported and turned into an empty result.
	        print(f"An error occurred while retrieving content from {url}: {e}")
	        return ""