Space: https://huggingface.co/spaces/Omnibus/crawl (currently showing a build error)
```python
import gradio as gr
import requests
import bs4


def link_find(url):
    # Fetch a page and collect its <a> tags as {title, href, text, children}.
    out = []
    source = requests.get(url)
    if source.status_code == 200:
        # 'html.parser' is built in; the lxml parser would need an extra dependency
        # soup = bs4.BeautifulSoup(source.content, 'lxml')
        soup = bs4.BeautifulSoup(source.content, 'html.parser')
        rawp = f'RAW TEXT RETURNED: {soup.text}'
        cnt = 0
        cnt += len(rawp)  # rawp and cnt are leftovers; nothing reads them afterwards
        # out.append(rawp)
        # out.append("HTML fragments: ")
        q = ("a", "p", "span", "content", "article")  # also unused: only <a> tags are scraped below
        for p in soup.find_all("a"):
            out.append({"LINK TITLE": p.get('title'),
                        "URL": p.get('href'),
                        "STRING": p.string,
                        "TREE": []})
    return out
# https://huggingface.co/spaces/Omnibus/crawl
```
```python
def sitemap(url, level):
    # Crawl `url`; for level 2/3, also crawl the pages it links to.
    link1 = []  # was undefined when url was empty, which raised UnboundLocalError on return
    if url != "" and url is not None:
        # base = scheme + host of the starting URL, used to resolve relative links
        uri = f'{url.split("//")[0]}//{url.split("//")[1].split("/")[0]}'
        link1 = link_find(url)
        if level >= 2:
            for i, ea in enumerate(link1):
                print(ea)
                try:
                    # prepend the base only to relative links; absolute links
                    # previously got a stale base glued onto them
                    if ea['URL'].startswith("http"):
                        target = ea['URL']
                    else:
                        target = f"{uri}{ea['URL']}"
                    link1[i]['TREE'] = link_find(target)
                    if level >= 3:
                        for n, na in enumerate(link1[i]['TREE']):
                            print(na)
                            try:
                                if na['URL'].startswith("http"):
                                    target1 = na['URL']
                                else:
                                    target1 = f"{uri}{na['URL']}"
                                link1[i]['TREE'][n]['TREE'] = link_find(target1)
                            except Exception as e:
                                print(e)
                except Exception as e:
                    print(e)
    return link1
```
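The level-2 and level-3 branches repeat the same resolve-and-crawl logic, so the depth is capped at 3. A recursive helper would handle any depth in one place. This is a sketch, not part of the original Space, and `crawl_tree` is a hypothetical name:

```python
def crawl_tree(url, level, base=None):
    # Recursively fill each link's "TREE" until `level` pages deep.
    if base is None:
        base = f'{url.split("//")[0]}//{url.split("//")[1].split("/")[0]}'
    links = link_find(url)
    if level > 1:
        for item in links:
            href = item["URL"]
            if not href:
                continue  # some <a> tags carry no href
            target = href if href.startswith("http") else f"{base}{href}"
            try:
                item["TREE"] = crawl_tree(target, level - 1, base)
            except Exception as e:
                print(e)
    return links
```

Wiring it in would be a one-line change: `btn.click(crawl_tree, [inp, level], outp)`.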
```python
with gr.Blocks() as app:
    inp = gr.Textbox(label="URL")
    level = gr.Slider(label="Depth", minimum=1, maximum=3, step=1, value=2)
    btn = gr.Button("Crawl")
    outp = gr.JSON()
    btn.click(sitemap, [inp, level], outp)
app.launch()
```
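On the build error itself: that badge usually means the Space failed while installing dependencies. Assuming the Space is missing a `requirements.txt` (its files aren't shown here), one listing the two third-party imports should fix the build; gradio itself is installed automatically on Gradio Spaces:

```
requests
beautifulsoup4
```

Note that `bs4` on PyPI is only a shim; `beautifulsoup4` is the actual package behind `import bs4`.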