{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import requests\n", "from bs4 import BeautifulSoup\n", "from urllib.parse import urlparse\n", "import time\n", "import os\n", "\n", "session = requests.Session()\n", "session.headers.update({\"User-Agent\": \"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36\"})\n", "\n", "data_path = \"../data\"" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "def faq_crawl(url):\n", " data_path_faq = data_path + '/faqs'\n", " os.makedirs(data_path_faq, exist_ok=True)\n", "\n", " print(f\"Crawling: {url}\")\n", "\n", " time.sleep(10)\n", " response = session.get(url)\n", " if response.status_code != 200:\n", " print(f\"Failed to fetch the website, {url}. Status Code: {response.status_code}\")\n", " return\n", " \n", " filename = urlparse(url).path.strip(\"/\").replace(\"/\", \"_\") or \"home\"\n", " filepath = f\"{data_path_faq}/{filename}.txt\"\n", "\n", "\n", " soup = BeautifulSoup(response.content, \"html.parser\")\n", " \n", " general = soup.find_all(['h1'])\n", " faq_question = soup.find_all('span', class_=['uagb-question'])\n", " faq_content = soup.find_all('div', class_=['uagb-faq-content'])\n", "\n", " with open(filepath, \"w\", encoding=\"utf-8\") as f:\n", " f.write(f\"Source URL: {url}\\n\\n\")\n", " for gen in general:\n", " text = gen.get_text(strip=True)\n", " f.write(f'{text}\\n')\n", " \n", " for question,answer in zip(faq_content,faq_question):\n", " q_text = question.get_text(strip=True)\n", " a_text = answer.get_text(strip=True)\n", " if q_text and a_text:\n", " f.write(f\"Q: {q_text}\\n\")\n", " f.write(f\"A: {a_text}\\n\")\n", " f.write(\"---\\n\")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Crawling: https://aurowellness.com/faqs/\n" ] } ], "source": [ "faq_crawl(\"https://aurowellness.com/faqs/\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def get_blog_urls_from_sitemap(sitemap_url):\n", " response = session.get(sitemap_url)\n", " soup = BeautifulSoup(response.content, features='lxml')\n", " \n", " urls = []\n", " for url_tag in soup.find_all('url'):\n", " loc_tag = url_tag.find('loc')\n", " if loc_tag and loc_tag.text:\n", " urls.append(loc_tag.text.strip())\n", " return urls" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/z9/c66hmjx12w3g41xmv5xvlyqw0000gn/T/ipykernel_20457/3923953040.py:3: XMLParsedAsHTMLWarning: It looks like you're using an HTML parser to parse an XML document.\n", "\n", "Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features=\"xml\"` into the BeautifulSoup constructor.\n", "\n", "If you want or need to use an HTML parser on this document, you can make this warning go away by filtering it. To do that, run this code before calling the BeautifulSoup constructor:\n", "\n", " from bs4 import XMLParsedAsHTMLWarning\n", " import warnings\n", "\n", " warnings.filterwarnings(\"ignore\", category=XMLParsedAsHTMLWarning)\n", "\n", " soup = BeautifulSoup(response.content, features='lxml')\n" ] } ], "source": [ "blog_urls = get_blog_urls_from_sitemap('https://aurowellness.com/post-sitemap.xml')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def crawl_blogs(url):\n", " data_path_blogs = data_path + '/blogs'\n", " os.makedirs(data_path_blogs, exist_ok=True)\n", "\n", " print(f\"Crawling: {url}\")\n", "\n", " response = session.get(url)\n", " if response.status_code != 200:\n", " print(f\"Failed to fetch the website, {url}. Status Code: {response.status_code}\")\n", " return\n", " \n", " filename = urlparse(url).path.strip(\"/\").replace(\"/\", \"_\") or \"home\"\n", " filepath = f\"{data_path_blogs}/{filename}.txt\"\n", "\n", " soup = BeautifulSoup(response.content, 'html.parser')\n", "\n", " blog_content = soup.find('div', class_='entry-content')\n", " if not blog_content:\n", " print(f\"No entry-content found in {url}\")\n", " return\n", "\n", " tags = blog_content.find_all(['h1','h2', 'p'])\n", " with open(filepath, \"w\", encoding=\"utf-8\") as f:\n", " f.write(f\"Source URL: {url}\\n\\n\")\n", " for tag in tags:\n", " text = tag.get_text(strip=False)\n", " if text:\n", " f.write(f'{text}\\n')" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Crawling: https://aurowellness.com/blog/\n", "No entry-content found in https://aurowellness.com/blog/\n", "Crawling: https://aurowellness.com/blog/tocotrienols/\n", "Crawling: https://aurowellness.com/blog/prevent-forehead-wrinkles/\n", "Crawling: https://aurowellness.com/blog/glutathione-lotion/\n", "Crawling: https://aurowellness.com/blog/glutathione-supplements/\n", "Crawling: https://aurowellness.com/blog/glutathione-for-skin-2/\n", "Crawling: https://aurowellness.com/blog/glutathione-vitamin-c/\n", "Crawling: https://aurowellness.com/blog/glutathione-shots/\n", "Crawling: https://aurowellness.com/blog/oxidized-glutathione/\n", "Crawling: https://aurowellness.com/blog/best-clean-skincare-for-rosacea/\n", "Crawling: https://aurowellness.com/blog/nano-glutathione/\n", "Crawling: https://aurowellness.com/blog/glutathione-for-skin/\n", "Crawling: https://aurowellness.com/blog/glutathione-cosmetics/\n", "Crawling: https://aurowellness.com/blog/glutathione-cream/\n", "Crawling: https://aurowellness.com/blog/glutathione-and-cancer/\n", "Crawling: https://aurowellness.com/blog/best-way-to-absorb-glutathione/\n", "Crawling: https://aurowellness.com/blog/acne-scar-treatment/\n", "Crawling: https://aurowellness.com/blog/anti-aging-skincare-for-men/\n", "Crawling: https://aurowellness.com/blog/benefits-of-glutathione/\n", "Crawling: https://aurowellness.com/blog/best-anti-aging-skincare-products-after-30/\n", "Crawling: https://aurowellness.com/blog/forehead-wrinkles/\n", "Crawling: https://aurowellness.com/blog/glutathione-covid/\n", "Crawling: https://aurowellness.com/blog/glutathione-ivs/\n", "Crawling: https://aurowellness.com/blog/glutathione-skincare/\n", "Crawling: https://aurowellness.com/blog/glutathione-spray/\n", "Crawling: https://aurowellness.com/blog/liposomal-glutathione/\n", "Crawling: https://aurowellness.com/blog/best-absorbed-glutathione/\n", "Crawling: https://aurowellness.com/blog/best-skincare-for-sensitive-acne-prone-skin/\n", "Crawling: https://aurowellness.com/blog/buy-glutathione/\n", "Crawling: https://aurowellness.com/blog/forehead-wrinkles-causes/\n", "Crawling: https://aurowellness.com/blog/glutathione-antioxidant/\n", "Crawling: https://aurowellness.com/blog/glutathione-injections/\n", "Crawling: https://aurowellness.com/blog/help-with-age-spots/\n", "Crawling: https://aurowellness.com/blog/how-to-prevent-wrinkles/\n", "Crawling: https://aurowellness.com/blog/magnesium-for-muscle-pain-2/\n", "Crawling: https://aurowellness.com/blog/natural-alternatives-to-botox/\n", "Crawling: https://aurowellness.com/blog/natural-antiaging-skincare/\n", "Crawling: https://aurowellness.com/blog/natural-skincare-for-acne/\n", "Crawling: https://aurowellness.com/blog/natural-ways-to-increase-glutathione-levels/\n", "Crawling: https://aurowellness.com/blog/skincare-for-40s/\n", "Crawling: https://aurowellness.com/blog/skincare-for-acne-and-antiaging/\n", "Crawling: https://aurowellness.com/blog/acetaminophen-toxicity/\n", "Crawling: https://aurowellness.com/blog/alcoholism/\n", "Crawling: https://aurowellness.com/blog/alzheimers-disease/\n", "Crawling: https://aurowellness.com/blog/autism/\n", "Crawling: https://aurowellness.com/blog/cancer-chemotherapy-2/\n", "Crawling: https://aurowellness.com/blog/chronic-fatigue-syndrome/\n", "Crawling: https://aurowellness.com/blog/customer-spotlight-melody-guy/\n", "Crawling: https://aurowellness.com/blog/cystic-fibrosis/\n", "Crawling: https://aurowellness.com/blog/diabetes/\n", "Crawling: https://aurowellness.com/blog/gastrointestinal/\n", "Crawling: https://aurowellness.com/blog/heart-disease-stroke-2/\n", "Crawling: https://aurowellness.com/blog/heavy-metal-toxicity/\n", "Crawling: https://aurowellness.com/blog/hiv/\n", "Crawling: https://aurowellness.com/blog/how-to-boost-energy-fight-disease-and-slow-aging-with-glutathione-the-fit-mess-podcast/\n", "Crawling: https://aurowellness.com/blog/influenza/\n", "Crawling: https://aurowellness.com/blog/lung-disorders/\n", "Crawling: https://aurowellness.com/blog/mgb-health-the-link-between-glutathione-oxidative-stress-mental-health/\n", "Crawling: https://aurowellness.com/blog/mold-mycotoxins/\n", "Crawling: https://aurowellness.com/blog/oxidative-stress-aging-2/\n", "Crawling: https://aurowellness.com/blog/parkinsons-disease/\n", "Crawling: https://aurowellness.com/blog/rhinovirus/\n", "Crawling: https://aurowellness.com/blog/the-benefit-of-glutathione-for-athletes/\n", "Crawling: https://aurowellness.com/blog/the-glutathione-revolution-by-nayan-patel-pharm-d-beauty-news-nyc/\n", "Crawling: https://aurowellness.com/blog/the-hidden-benefits-of-glutathione/\n", "Crawling: https://aurowellness.com/blog/top-expert-antioxidant-glutathione-crucial-to-preventing-disease-read-newsmax-top-expert-antioxidant-glutathione-crucial-to-preventing-disease-newsmax-com-urgent-your-heart-attack-risk-determine/\n", "Crawling: https://aurowellness.com/blog/glutathione/\n", "Crawling: https://aurowellness.com/blog/acid-and-enzymes-for-indigestion/\n", "Crawling: https://aurowellness.com/blog/the-results-are-in-and-the-pictures-dont-lie-auro-skincares-citrine-vitamin-c-radiance-complex-is-transforming-skin/\n", "Crawling: https://aurowellness.com/blog/riserevivedeepdive/\n", "Crawling: https://aurowellness.com/blog/ali-landry-review/\n", "Crawling: https://aurowellness.com/blog/does-glutathione-helps-with-sleep/\n", "Crawling: https://aurowellness.com/blog/glutathione-uv-damage/\n", "Crawling: https://aurowellness.com/blog/glutathione-2/\n", "Crawling: https://aurowellness.com/blog/glutathione-energy-boost-natural-energy-glutaryl-energy-producing-low-energy/\n", "Crawling: https://aurowellness.com/blog/newsmax-health-suggests-glutathione-for-healthy-aging/\n", "Crawling: https://aurowellness.com/blog/resveratrol-glutathione-and-cycloastragenol-for-skincare/\n", "Crawling: https://aurowellness.com/blog/oxidativestress/\n", "Crawling: https://aurowellness.com/blog/glutathioneandlongevity/\n", "Crawling: https://aurowellness.com/blog/glutathione-study/\n", "Crawling: https://aurowellness.com/blog/glutathioneyourtravelbuddy/\n", "Crawling: https://aurowellness.com/blog/why-do-athletes-take-glutathione/\n", "Crawling: https://aurowellness.com/blog/glutathione-and-aging/\n", "Crawling: https://aurowellness.com/blog/glutathione-cleansing-the-body/\n", "Crawling: https://aurowellness.com/blog/depletion-of-glutathione/\n", "Crawling: https://aurowellness.com/blog/understanding-glutathiones-role-in-cleansing-the-body/\n", "Crawling: https://aurowellness.com/blog/the-ultimate-immunity-kit-for-the-holidays/\n", "Crawling: https://aurowellness.com/blog/customer-spotlight-rory-carruthers/\n", "Crawling: https://aurowellness.com/blog/glutathione-help-liver-health/\n", "Crawling: https://aurowellness.com/blog/glutaryl-stands-out-as-the-preferred-approach-for-glutathione-intake/\n", "Crawling: https://aurowellness.com/blog/glutathione-acts-as-a-shield-in-oxidative-stress-and-exercise/\n", "Crawling: https://aurowellness.com/blog/can-glutathione-guard-nerves-to-avoid-peripheral-neuropathy/\n", "Crawling: https://aurowellness.com/blog/glutathione-acne/\n", "Crawling: https://aurowellness.com/blog/detox-live-with-glutathione/\n", "Crawling: https://aurowellness.com/blog/thyroid-health-and-glutathione/\n", "Crawling: https://aurowellness.com/blog/glutathione-and-alcohol-safeguarding-the-liver/\n", "Crawling: https://aurowellness.com/blog/glutathiones-impact-on-aging-cells/\n", "Crawling: https://aurowellness.com/blog/nac-vs-glutathione/\n", "Crawling: https://aurowellness.com/blog/cellular-vitality/\n", "Crawling: https://aurowellness.com/blog/glutathione-ingredient-in-skincare-routine/\n", "Crawling: https://aurowellness.com/blog/glutathione-improves-brain-health-and-prevent-cognitive-decl/\n", "Crawling: https://aurowellness.com/blog/the-role-of-glutathione-in-reducing-inflammation-a-key-factor-in-chronic-disease-prevention/\n", "Crawling: https://aurowellness.com/blog/how-stress-affects-glutathione-levels/\n", "Crawling: https://aurowellness.com/blog/5-waysto-increase-glutathione-levels/\n", "Crawling: https://aurowellness.com/blog/glutathione-and-anxiety-a-natural-approach/\n", "Crawling: https://aurowellness.com/blog/glutathione-benefits-your-essential-antioxidant/\n", "Crawling: https://aurowellness.com/blog/benefits-side-effects-of-glutathione/\n" ] } ], "source": [ "for blog_url in blog_urls:\n", " crawl_blogs(blog_url)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def technology_crawl(url):\n", " data_path_tech = data_path + '/technology'\n", " os.makedirs(data_path_tech, exist_ok=True)\n", "\n", " print(f\"Crawling: {url}\")\n", " \n", " response = session.get(url)\n", " if response.status_code != 200:\n", " print(f\"Failed to fetch the website, {url}. Status Code: {response.status_code}\")\n", " return\n", " \n", " filename = urlparse(url).path.strip(\"/\").replace(\"/\", \"_\") or \"home\"\n", " filepath = f\"{data_path_tech}/{filename}.txt\"\n", "\n", " soup = BeautifulSoup(response.content, 'html.parser')\n", "\n", " technology_content = soup.find('div', class_='entry-content')\n", " if not technology_content:\n", " print(f\"No entry-content found in {url}\")\n", " return\n", " \n", " tags = technology_content.find_all(['h1', 'h2', 'h3', 'p'])\n", " with open(filepath, \"w\", encoding=\"utf-8\") as f:\n", " f.write(f\"Source URL: {url}\\n\\n\")\n", " for tag in tags:\n", " text = tag.get_text(strip=False)\n", " if text:\n", " f.write(f'{text}\\n')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Crawling: https://aurowellness.com/technology/\n" ] } ], "source": [ "technology_crawl('https://aurowellness.com/technology/')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "def revolution_crawl(url):\n", " data_path_rev = data_path + '/revolution'\n", " os.makedirs(data_path_rev, exist_ok=True)\n", "\n", " print(f\"Crawling: {url}\")\n", " \n", " response = session.get(url)\n", " if response.status_code != 200:\n", " print(f\"Failed to fetch the website, {url}. Status Code: {response.status_code}\")\n", " return\n", " \n", " filename = urlparse(url).path.strip(\"/\").replace(\"/\", \"_\") or \"home\"\n", " filepath = f\"{data_path_rev}/{filename}.txt\"\n", "\n", " soup = BeautifulSoup(response.content, 'html.parser')\n", " rev_content = soup.find('div', class_='entry-content')\n", "\n", " excluded_classes = [\n", " \"custom-related-products-title\",\n", " \"custom-related-products\",\n", " \"has-white-color\"\n", " ]\n", "\n", " content_divs = [\n", " child for child in rev_content.find_all('div', recursive=False)\n", " if not any(cls in child.get('class', []) for cls in excluded_classes)\n", " ]\n", "\n", "\n", " if not rev_content:\n", " print(f\"No entry-content found in {url}\")\n", " return\n", " \n", " with open(filepath, \"w\", encoding=\"utf-8\") as f:\n", " f.write(f\"Source URL: {url}\\n\\n\")\n", " for div in content_divs:\n", " tags = div.find_all(lambda tag: (\n", " tag.name in ['h1', 'h2', 'p'] or\n", " (tag.name == 'div' and 'uagb-tm__desc' in tag.get('class', []))\n", " ))\n", " for tag in tags:\n", " text = tag.get_text(strip=False)\n", " if text:\n", " f.write(f'{text}\\n')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Crawling: https://aurowellness.com/glutathione-revolution/\n" ] } ], "source": [ "revolution_crawl('https://aurowellness.com/glutathione-revolution/')" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.13.5" } }, "nbformat": 4, "nbformat_minor": 2 }