| """Beautiful Soup Web scraper.""" | |
| import logging | |
| from typing import Any, Callable, Dict, List, Optional, Tuple | |
| from urllib.parse import urljoin | |
| from llama_index.core.bridge.pydantic import PrivateAttr | |
| from llama_index.core.readers.base import BasePydanticReader | |
| from llama_index.core.schema import Document | |
| logger = logging.getLogger(__name__) | |


def _substack_reader(soup: Any, **kwargs) -> Tuple[str, Dict[str, Any]]:
    """Extract text from a Substack blog post."""
    extra_info = {
        "Title of this Substack post": soup.select_one("h1.post-title").getText(),
        "Subtitle": soup.select_one("h3.subtitle").getText(),
        "Author": soup.select_one("span.byline-names").getText(),
    }
    text = soup.select_one("div.available-content").getText()
    return text, extra_info


def _readthedocs_reader(soup: Any, url: str, **kwargs) -> Tuple[str, Dict[str, Any]]:
    """Extract text from a ReadTheDocs documentation site."""
    import requests
    from bs4 import BeautifulSoup

    # Collect the internal documentation links and resolve relative URLs.
    links = soup.find_all("a", {"class": "reference internal"})
    rtd_links = []
    for link in links:
        rtd_links.append(link["href"])
    for i in range(len(rtd_links)):
        if not rtd_links[i].startswith("http"):
            rtd_links[i] = urljoin(url, rtd_links[i])

    texts = []
    for doc_link in rtd_links:
        page_link = requests.get(doc_link)
        soup = BeautifulSoup(page_link.text, "html.parser")
        try:
            text = soup.find(attrs={"role": "main"}).get_text()
        except AttributeError:
            # find() returns None when the page has no main content region.
            text = None
        if text:
            texts.append("\n".join([t for t in text.split("\n") if t]))
    return "\n".join(texts), {}


def _readmedocs_reader(
    soup: Any, url: str, include_url_in_text: bool = True
) -> Tuple[str, Dict[str, Any]]:
    """Extract text from a ReadMe documentation site."""
    import requests
    from bs4 import BeautifulSoup

    # Collect unique /docs/ links and resolve relative URLs against the root page.
    links = soup.find_all("a")
    docs_links = [link["href"] for link in links if "/docs/" in link.get("href", "")]
    docs_links = list(set(docs_links))
    for i in range(len(docs_links)):
        if not docs_links[i].startswith("http"):
            docs_links[i] = urljoin(url, docs_links[i])

    texts = []
    for doc_link in docs_links:
        page_link = requests.get(doc_link)
        soup = BeautifulSoup(page_link.text, "html.parser")
        try:
            text = ""
            for element in soup.find_all("article", {"id": "content"}):
                for child in element.descendants:
                    if child.name == "a" and child.has_attr("href"):
                        if include_url_in_text:
                            href = child.get("href")
                            if href is not None and "edit" in href:
                                text += child.text
                            else:
                                text += (
                                    f"{child.text} (Reference url: {doc_link}{href}) "
                                )
                    elif child.string and child.string.strip():
                        text += child.string.strip() + " "
        except Exception:
            logger.error(f"Could not extract text from {doc_link}")
            continue
        texts.append("\n".join([t for t in text.split("\n") if t]))
    return "\n".join(texts), {}


def _gitbook_reader(
    soup: Any, url: str, include_url_in_text: bool = True
) -> Tuple[str, Dict[str, Any]]:
    """Extract text from a GitBook documentation site."""
    import requests
    from bs4 import BeautifulSoup

    # Collect unique /docs/ links and resolve relative URLs against the root page.
    links = soup.find_all("a")
    docs_links = [link["href"] for link in links if "/docs/" in link.get("href", "")]
    docs_links = list(set(docs_links))
    for i in range(len(docs_links)):
        if not docs_links[i].startswith("http"):
            docs_links[i] = urljoin(url, docs_links[i])

    texts = []
    for doc_link in docs_links:
        page_link = requests.get(doc_link)
        soup = BeautifulSoup(page_link.text, "html.parser")
        try:
            # find() returns None when the page has no <main> element, which makes
            # the iteration below fail.
            main = soup.find("main")
            clean_text = ", ".join([tag.get_text() for tag in main])
        except (TypeError, AttributeError):
            logger.error(f"Could not extract text from {doc_link}")
            continue
        texts.append(clean_text)
    return "\n".join(texts), {}


DEFAULT_WEBSITE_EXTRACTOR: Dict[
    str, Callable[[Any, str], Tuple[str, Dict[str, Any]]]
] = {
    "substack.com": _substack_reader,
    "readthedocs.io": _readthedocs_reader,
    "readme.com": _readmedocs_reader,
    "gitbook.io": _gitbook_reader,
}
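
# A custom extractor is just a callable with the same shape as the readers above:
# it receives the parsed page as ``soup`` (plus ``url`` and any extra keyword
# arguments) and returns a ``(text, extra_info)`` tuple. Illustrative sketch only,
# not part of the shipped module; the "example.com" hostname and the CSS selectors
# below are assumptions.
#
#     def _example_reader(soup: Any, url: str, **kwargs) -> Tuple[str, Dict[str, Any]]:
#         title = soup.select_one("h1").getText()
#         body = soup.select_one("article").getText()
#         return body, {"Title": title}
#
#     custom_extractors = {**DEFAULT_WEBSITE_EXTRACTOR, "example.com": _example_reader}
#     reader = BeautifulSoupWebReader(website_extractor=custom_extractors)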


class BeautifulSoupWebReader(BasePydanticReader):
    """BeautifulSoup web page reader.

    Reads pages from the web.
    Requires the `bs4` and `urllib` packages.

    Args:
        website_extractor (Optional[Dict[str, Callable]]): A mapping of website
            hostname (e.g. google.com) to a function that specifies how to
            extract text from the BeautifulSoup obj. See DEFAULT_WEBSITE_EXTRACTOR.
    """

    is_remote: bool = True
    _website_extractor: Dict[str, Callable] = PrivateAttr()

    def __init__(self, website_extractor: Optional[Dict[str, Callable]] = None) -> None:
        # Initialize the pydantic model before assigning the private attribute.
        super().__init__()
        self._website_extractor = website_extractor or DEFAULT_WEBSITE_EXTRACTOR

    @classmethod
    def class_name(cls) -> str:
        """Get the name identifier of the class."""
        return "BeautifulSoupWebReader"

    def load_data(
        self,
        urls: List[str],
        custom_hostname: Optional[str] = None,
        include_url_in_text: Optional[bool] = True,
    ) -> List[Document]:
        """Load data from the urls.

        Args:
            urls (List[str]): List of URLs to scrape.
            custom_hostname (Optional[str]): Force a certain hostname in the case
                a website is displayed under custom URLs (e.g. Substack blogs).
            include_url_in_text (Optional[bool]): Include the reference url in the
                text of the document.

        Returns:
            List[Document]: List of documents.
        """
        from urllib.parse import urlparse

        import requests
        from bs4 import BeautifulSoup

        documents = []
        for url in urls:
            try:
                page = requests.get(url)
            except Exception:
                raise ValueError(f"One of the inputs is not a valid url: {url}")

            hostname = custom_hostname or urlparse(url).hostname or ""

            soup = BeautifulSoup(page.content, "html.parser")

            data = ""
            extra_info = {"URL": url}
            if hostname in self._website_extractor:
                data, metadata = self._website_extractor[hostname](
                    soup=soup, url=url, include_url_in_text=include_url_in_text
                )
                extra_info.update(metadata)
            else:
                data = soup.getText()

            documents.append(Document(text=data, id_=url, extra_info=extra_info))

        return documents
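

# Usage sketch (illustrative, not part of the shipped module): the reader fetches
# each URL, routes it to a hostname-specific extractor when one is registered, and
# otherwise falls back to plain ``soup.getText()``. The URLs below are placeholders.
#
#     reader = BeautifulSoupWebReader()
#     docs = reader.load_data(
#         urls=["https://example.com", "https://someauthor.substack.com/p/some-post"]
#     )
#     for doc in docs:
#         print(doc.metadata["URL"], len(doc.text))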