Spaces:

vip11017
/

auro_chatbot_backend

Running

File size: 19,412 Bytes

abd032e

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "from urllib.parse import urlparse\n",
    "import time\n",
    "import os\n",
    "\n",
    "session = requests.Session()\n",
    "session.headers.update({\"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36\"})\n",
    "\n",
    "data_path = \"../data\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def faq_crawl(url):\n",
    "    \"\"\"Fetch an FAQ page and save its headings and Q/A pairs to a text file.\n",
    "\n",
    "    The file is written under ``data_path + '/faqs'`` and named after the URL\n",
    "    path with slashes turned into underscores (``home`` when the path is empty).\n",
    "    If the response status is not 200, a message is printed and no file is written.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the FAQ page to crawl.\n",
    "    \"\"\"\n",
    "    data_path_faq = data_path + '/faqs'\n",
    "    os.makedirs(data_path_faq, exist_ok=True)\n",
    "\n",
    "    print(f\"Crawling: {url}\")\n",
    "\n",
    "    # Fixed pause before the request (presumably to go easy on the server).\n",
    "    time.sleep(10)\n",
    "    response = session.get(url)\n",
    "    if response.status_code != 200:\n",
    "        print(f\"Failed to fetch the website, {url}. Status Code: {response.status_code}\")\n",
    "        return\n",
    "\n",
    "    filename = urlparse(url).path.strip(\"/\").replace(\"/\", \"_\") or \"home\"\n",
    "    filepath = f\"{data_path_faq}/{filename}.txt\"\n",
    "\n",
    "    soup = BeautifulSoup(response.content, \"html.parser\")\n",
    "\n",
    "    general = soup.find_all(['h1'])\n",
    "    faq_question = soup.find_all('span', class_=['uagb-question'])\n",
    "    faq_content = soup.find_all('div', class_=['uagb-faq-content'])\n",
    "\n",
    "    with open(filepath, \"w\", encoding=\"utf-8\") as f:\n",
    "        f.write(f\"Source URL: {url}\\n\\n\")\n",
    "        for gen in general:\n",
    "            text = gen.get_text(strip=True)\n",
    "            f.write(f'{text}\\n')\n",
    "\n",
    "        # Questions go first and answers second. The reversed order used before\n",
    "        # wrote the answer text under \"Q:\" and the question text under \"A:\".\n",
    "        for question, answer in zip(faq_question, faq_content):\n",
    "            q_text = question.get_text(strip=True)\n",
    "            a_text = answer.get_text(strip=True)\n",
    "            if q_text and a_text:\n",
    "                f.write(f\"Q: {q_text}\\n\")\n",
    "                f.write(f\"A: {a_text}\\n\")\n",
    "            # The separator is written for every pair, including skipped ones.\n",
    "            f.write(\"---\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Crawling: https://aurowellness.com/faqs/\n"
     ]
    }
   ],
   "source": [
    "faq_crawl(\"https://aurowellness.com/faqs/\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_blog_urls_from_sitemap(sitemap_url):\n",
    "    \"\"\"Return the page URLs listed in an XML sitemap.\n",
    "\n",
    "    Every ``<url>`` element is inspected and the stripped text of its first\n",
    "    ``<loc>`` child is collected, in document order.\n",
    "\n",
    "    Args:\n",
    "        sitemap_url: Address of the sitemap to download.\n",
    "\n",
    "    Returns:\n",
    "        A list of URL strings; empty when the request does not return 200\n",
    "        or the document has no ``<url>``/``<loc>`` entries.\n",
    "    \"\"\"\n",
    "    response = session.get(sitemap_url)\n",
    "    if response.status_code != 200:\n",
    "        print(f\"Failed to fetch the website, {sitemap_url}. Status Code: {response.status_code}\")\n",
    "        return []\n",
    "\n",
    "    # A sitemap is XML, so parse it with the XML parser. The HTML parser\n",
    "    # raises XMLParsedAsHTMLWarning and is less reliable on this input.\n",
    "    soup = BeautifulSoup(response.content, features='xml')\n",
    "\n",
    "    urls = []\n",
    "    for url_tag in soup.find_all('url'):\n",
    "        loc_tag = url_tag.find('loc')\n",
    "        if loc_tag and loc_tag.text:\n",
    "            urls.append(loc_tag.text.strip())\n",
    "    return urls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/z9/c66hmjx12w3g41xmv5xvlyqw0000gn/T/ipykernel_20457/3923953040.py:3: XMLParsedAsHTMLWarning: It looks like you're using an HTML parser to parse an XML document.\n",
      "\n",
      "Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features=\"xml\"` into the BeautifulSoup constructor.\n",
      "\n",
      "If you want or need to use an HTML parser on this document, you can make this warning go away by filtering it. To do that, run this code before calling the BeautifulSoup constructor:\n",
      "\n",
      "    from bs4 import XMLParsedAsHTMLWarning\n",
      "    import warnings\n",
      "\n",
      "    warnings.filterwarnings(\"ignore\", category=XMLParsedAsHTMLWarning)\n",
      "\n",
      "  soup = BeautifulSoup(response.content, features='lxml')\n"
     ]
    }
   ],
   "source": [
    "blog_urls = get_blog_urls_from_sitemap('https://aurowellness.com/post-sitemap.xml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def crawl_blogs(url):\n",
    "    \"\"\"Download one blog post and dump its headings and paragraphs to disk.\n",
    "\n",
    "    Text from the ``h1``, ``h2`` and ``p`` tags inside the first\n",
    "    ``div.entry-content`` is written, one tag per line, to a ``.txt`` file under\n",
    "    ``data_path + '/blogs'``. A non-200 response or a page without that div\n",
    "    prints a message and writes nothing.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the blog post to crawl.\n",
    "    \"\"\"\n",
    "    out_dir = data_path + '/blogs'\n",
    "    os.makedirs(out_dir, exist_ok=True)\n",
    "\n",
    "    print(f\"Crawling: {url}\")\n",
    "\n",
    "    page = session.get(url)\n",
    "    if page.status_code != 200:\n",
    "        print(f\"Failed to fetch the website, {url}. Status Code: {page.status_code}\")\n",
    "        return\n",
    "\n",
    "    # Flatten the URL path into one file name; an empty path becomes \"home\".\n",
    "    slug = urlparse(url).path.strip(\"/\").replace(\"/\", \"_\") or \"home\"\n",
    "    out_file = f\"{out_dir}/{slug}.txt\"\n",
    "\n",
    "    article = BeautifulSoup(page.content, 'html.parser').find('div', class_='entry-content')\n",
    "    if not article:\n",
    "        print(f\"No entry-content found in {url}\")\n",
    "        return\n",
    "\n",
    "    pieces = (node.get_text(strip=False) for node in article.find_all(['h1','h2', 'p']))\n",
    "    with open(out_file, \"w\", encoding=\"utf-8\") as handle:\n",
    "        handle.write(f\"Source URL: {url}\\n\\n\")\n",
    "        # Tags whose text is empty are skipped.\n",
    "        handle.writelines(f'{piece}\\n' for piece in pieces if piece)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Crawling: https://aurowellness.com/blog/\n",
      "No entry-content found in https://aurowellness.com/blog/\n",
      "Crawling: https://aurowellness.com/blog/tocotrienols/\n",
      "Crawling: https://aurowellness.com/blog/prevent-forehead-wrinkles/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-lotion/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-supplements/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-for-skin-2/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-vitamin-c/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-shots/\n",
      "Crawling: https://aurowellness.com/blog/oxidized-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/best-clean-skincare-for-rosacea/\n",
      "Crawling: https://aurowellness.com/blog/nano-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-for-skin/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-cosmetics/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-cream/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-and-cancer/\n",
      "Crawling: https://aurowellness.com/blog/best-way-to-absorb-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/acne-scar-treatment/\n",
      "Crawling: https://aurowellness.com/blog/anti-aging-skincare-for-men/\n",
      "Crawling: https://aurowellness.com/blog/benefits-of-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/best-anti-aging-skincare-products-after-30/\n",
      "Crawling: https://aurowellness.com/blog/forehead-wrinkles/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-covid/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-ivs/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-skincare/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-spray/\n",
      "Crawling: https://aurowellness.com/blog/liposomal-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/best-absorbed-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/best-skincare-for-sensitive-acne-prone-skin/\n",
      "Crawling: https://aurowellness.com/blog/buy-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/forehead-wrinkles-causes/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-antioxidant/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-injections/\n",
      "Crawling: https://aurowellness.com/blog/help-with-age-spots/\n",
      "Crawling: https://aurowellness.com/blog/how-to-prevent-wrinkles/\n",
      "Crawling: https://aurowellness.com/blog/magnesium-for-muscle-pain-2/\n",
      "Crawling: https://aurowellness.com/blog/natural-alternatives-to-botox/\n",
      "Crawling: https://aurowellness.com/blog/natural-antiaging-skincare/\n",
      "Crawling: https://aurowellness.com/blog/natural-skincare-for-acne/\n",
      "Crawling: https://aurowellness.com/blog/natural-ways-to-increase-glutathione-levels/\n",
      "Crawling: https://aurowellness.com/blog/skincare-for-40s/\n",
      "Crawling: https://aurowellness.com/blog/skincare-for-acne-and-antiaging/\n",
      "Crawling: https://aurowellness.com/blog/acetaminophen-toxicity/\n",
      "Crawling: https://aurowellness.com/blog/alcoholism/\n",
      "Crawling: https://aurowellness.com/blog/alzheimers-disease/\n",
      "Crawling: https://aurowellness.com/blog/autism/\n",
      "Crawling: https://aurowellness.com/blog/cancer-chemotherapy-2/\n",
      "Crawling: https://aurowellness.com/blog/chronic-fatigue-syndrome/\n",
      "Crawling: https://aurowellness.com/blog/customer-spotlight-melody-guy/\n",
      "Crawling: https://aurowellness.com/blog/cystic-fibrosis/\n",
      "Crawling: https://aurowellness.com/blog/diabetes/\n",
      "Crawling: https://aurowellness.com/blog/gastrointestinal/\n",
      "Crawling: https://aurowellness.com/blog/heart-disease-stroke-2/\n",
      "Crawling: https://aurowellness.com/blog/heavy-metal-toxicity/\n",
      "Crawling: https://aurowellness.com/blog/hiv/\n",
      "Crawling: https://aurowellness.com/blog/how-to-boost-energy-fight-disease-and-slow-aging-with-glutathione-the-fit-mess-podcast/\n",
      "Crawling: https://aurowellness.com/blog/influenza/\n",
      "Crawling: https://aurowellness.com/blog/lung-disorders/\n",
      "Crawling: https://aurowellness.com/blog/mgb-health-the-link-between-glutathione-oxidative-stress-mental-health/\n",
      "Crawling: https://aurowellness.com/blog/mold-mycotoxins/\n",
      "Crawling: https://aurowellness.com/blog/oxidative-stress-aging-2/\n",
      "Crawling: https://aurowellness.com/blog/parkinsons-disease/\n",
      "Crawling: https://aurowellness.com/blog/rhinovirus/\n",
      "Crawling: https://aurowellness.com/blog/the-benefit-of-glutathione-for-athletes/\n",
      "Crawling: https://aurowellness.com/blog/the-glutathione-revolution-by-nayan-patel-pharm-d-beauty-news-nyc/\n",
      "Crawling: https://aurowellness.com/blog/the-hidden-benefits-of-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/top-expert-antioxidant-glutathione-crucial-to-preventing-disease-read-newsmax-top-expert-antioxidant-glutathione-crucial-to-preventing-disease-newsmax-com-urgent-your-heart-attack-risk-determine/\n",
      "Crawling: https://aurowellness.com/blog/glutathione/\n",
      "Crawling: https://aurowellness.com/blog/acid-and-enzymes-for-indigestion/\n",
      "Crawling: https://aurowellness.com/blog/the-results-are-in-and-the-pictures-dont-lie-auro-skincares-citrine-vitamin-c-radiance-complex-is-transforming-skin/\n",
      "Crawling: https://aurowellness.com/blog/riserevivedeepdive/\n",
      "Crawling: https://aurowellness.com/blog/ali-landry-review/\n",
      "Crawling: https://aurowellness.com/blog/does-glutathione-helps-with-sleep/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-uv-damage/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-2/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-energy-boost-natural-energy-glutaryl-energy-producing-low-energy/\n",
      "Crawling: https://aurowellness.com/blog/newsmax-health-suggests-glutathione-for-healthy-aging/\n",
      "Crawling: https://aurowellness.com/blog/resveratrol-glutathione-and-cycloastragenol-for-skincare/\n",
      "Crawling: https://aurowellness.com/blog/oxidativestress/\n",
      "Crawling: https://aurowellness.com/blog/glutathioneandlongevity/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-study/\n",
      "Crawling: https://aurowellness.com/blog/glutathioneyourtravelbuddy/\n",
      "Crawling: https://aurowellness.com/blog/why-do-athletes-take-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-and-aging/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-cleansing-the-body/\n",
      "Crawling: https://aurowellness.com/blog/depletion-of-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/understanding-glutathiones-role-in-cleansing-the-body/\n",
      "Crawling: https://aurowellness.com/blog/the-ultimate-immunity-kit-for-the-holidays/\n",
      "Crawling: https://aurowellness.com/blog/customer-spotlight-rory-carruthers/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-help-liver-health/\n",
      "Crawling: https://aurowellness.com/blog/glutaryl-stands-out-as-the-preferred-approach-for-glutathione-intake/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-acts-as-a-shield-in-oxidative-stress-and-exercise/\n",
      "Crawling: https://aurowellness.com/blog/can-glutathione-guard-nerves-to-avoid-peripheral-neuropathy/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-acne/\n",
      "Crawling: https://aurowellness.com/blog/detox-live-with-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/thyroid-health-and-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-and-alcohol-safeguarding-the-liver/\n",
      "Crawling: https://aurowellness.com/blog/glutathiones-impact-on-aging-cells/\n",
      "Crawling: https://aurowellness.com/blog/nac-vs-glutathione/\n",
      "Crawling: https://aurowellness.com/blog/cellular-vitality/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-ingredient-in-skincare-routine/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-improves-brain-health-and-prevent-cognitive-decl/\n",
      "Crawling: https://aurowellness.com/blog/the-role-of-glutathione-in-reducing-inflammation-a-key-factor-in-chronic-disease-prevention/\n",
      "Crawling: https://aurowellness.com/blog/how-stress-affects-glutathione-levels/\n",
      "Crawling: https://aurowellness.com/blog/5-waysto-increase-glutathione-levels/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-and-anxiety-a-natural-approach/\n",
      "Crawling: https://aurowellness.com/blog/glutathione-benefits-your-essential-antioxidant/\n",
      "Crawling: https://aurowellness.com/blog/benefits-side-effects-of-glutathione/\n"
     ]
    }
   ],
   "source": [
    "for blog_url in blog_urls:\n",
    "    crawl_blogs(blog_url)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def technology_crawl(url):\n",
    "    \"\"\"Save the headings and paragraphs of the technology page as plain text.\n",
    "\n",
    "    Text from the ``h1``, ``h2``, ``h3`` and ``p`` tags inside the first\n",
    "    ``div.entry-content`` is written, one tag per line, to a ``.txt`` file under\n",
    "    ``data_path + '/technology'``. A non-200 response or a page without that\n",
    "    div prints a message and writes nothing.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the page to crawl.\n",
    "    \"\"\"\n",
    "    target_dir = data_path + '/technology'\n",
    "    os.makedirs(target_dir, exist_ok=True)\n",
    "\n",
    "    print(f\"Crawling: {url}\")\n",
    "\n",
    "    resp = session.get(url)\n",
    "    if resp.status_code != 200:\n",
    "        print(f\"Failed to fetch the website, {url}. Status Code: {resp.status_code}\")\n",
    "        return\n",
    "\n",
    "    # File name is the URL path with slashes replaced; \"home\" for an empty path.\n",
    "    stem = urlparse(url).path.strip(\"/\").replace(\"/\", \"_\") or \"home\"\n",
    "    destination = f\"{target_dir}/{stem}.txt\"\n",
    "\n",
    "    parsed = BeautifulSoup(resp.content, 'html.parser')\n",
    "    container = parsed.find('div', class_='entry-content')\n",
    "    if not container:\n",
    "        print(f\"No entry-content found in {url}\")\n",
    "        return\n",
    "\n",
    "    # Header line first, then one line per tag that has text.\n",
    "    lines = [f\"Source URL: {url}\\n\\n\"]\n",
    "    for element in container.find_all(['h1', 'h2', 'h3', 'p']):\n",
    "        content = element.get_text(strip=False)\n",
    "        if content:\n",
    "            lines.append(f'{content}\\n')\n",
    "\n",
    "    with open(destination, \"w\", encoding=\"utf-8\") as out:\n",
    "        out.writelines(lines)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Crawling: https://aurowellness.com/technology/\n"
     ]
    }
   ],
   "source": [
    "technology_crawl('https://aurowellness.com/technology/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "def revolution_crawl(url):\n",
    "    \"\"\"Save the main text of the glutathione-revolution page as plain text.\n",
    "\n",
    "    Only the direct child ``div`` elements of the first ``div.entry-content``\n",
    "    are considered, and those carrying any of the excluded classes are dropped.\n",
    "    From the remaining divs, text of ``h1``, ``h2``, ``p`` tags and of\n",
    "    ``div.uagb-tm__desc`` elements is written, one per line, to a ``.txt`` file\n",
    "    under ``data_path + '/revolution'``. A non-200 response or a page without\n",
    "    ``entry-content`` prints a message and writes nothing.\n",
    "\n",
    "    Args:\n",
    "        url: Address of the page to crawl.\n",
    "    \"\"\"\n",
    "    data_path_rev = data_path + '/revolution'\n",
    "    os.makedirs(data_path_rev, exist_ok=True)\n",
    "\n",
    "    print(f\"Crawling: {url}\")\n",
    "\n",
    "    response = session.get(url)\n",
    "    if response.status_code != 200:\n",
    "        print(f\"Failed to fetch the website, {url}. Status Code: {response.status_code}\")\n",
    "        return\n",
    "\n",
    "    filename = urlparse(url).path.strip(\"/\").replace(\"/\", \"_\") or \"home\"\n",
    "    filepath = f\"{data_path_rev}/{filename}.txt\"\n",
    "\n",
    "    soup = BeautifulSoup(response.content, 'html.parser')\n",
    "    rev_content = soup.find('div', class_='entry-content')\n",
    "\n",
    "    # Check for a missing container before touching it; calling find_all on\n",
    "    # None would raise AttributeError instead of reporting the problem.\n",
    "    if not rev_content:\n",
    "        print(f\"No entry-content found in {url}\")\n",
    "        return\n",
    "\n",
    "    excluded_classes = [\n",
    "        \"custom-related-products-title\",\n",
    "        \"custom-related-products\",\n",
    "        \"has-white-color\"\n",
    "    ]\n",
    "\n",
    "    # Keep only top-level divs that carry none of the excluded classes.\n",
    "    content_divs = [\n",
    "        child for child in rev_content.find_all('div', recursive=False)\n",
    "        if not any(cls in child.get('class', []) for cls in excluded_classes)\n",
    "    ]\n",
    "\n",
    "    with open(filepath, \"w\", encoding=\"utf-8\") as f:\n",
    "        f.write(f\"Source URL: {url}\\n\\n\")\n",
    "        for div in content_divs:\n",
    "            tags = div.find_all(lambda tag: (\n",
    "                tag.name in ['h1', 'h2', 'p'] or\n",
    "                (tag.name == 'div' and 'uagb-tm__desc' in tag.get('class', []))\n",
    "                ))\n",
    "            for tag in tags:\n",
    "                text = tag.get_text(strip=False)\n",
    "                if text:\n",
    "                    f.write(f'{text}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Crawling: https://aurowellness.com/glutathione-revolution/\n"
     ]
    }
   ],
   "source": [
    "revolution_crawl('https://aurowellness.com/glutathione-revolution/')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}