Spaces:
Build error
Build error
| import gradio as gr | |
| import requests | |
| import bs4 | |
def _base_digits(value, base):
    """Return how many base-``base`` digits ``value`` occupies (0 when ``value`` < 1)."""
    digits = 0
    while value >= 1:
        digits += 1
        value //= base
    return digits


def _encode_key(index, width, alphabet):
    """Write ``index`` with the symbols of ``alphabet``, left-padded to ``width`` symbols."""
    base = len(alphabet)
    symbols = []
    for _ in range(width):
        index, remainder = divmod(index, base)
        symbols.append(alphabet[remainder])
    # Digits were produced least-significant first.
    return "".join(reversed(symbols))


def sort_doc(in_list, steps_in=8, control=None):
    """Assign every item of ``in_list`` a fixed-width base-62 key.

    Keys are the item's position written with the alphabet ``0-9a-zA-Z`` and
    left-padded with ``"0"``, so the first item of a default call is stored
    under ``"00000000"``, the second under ``"00000001"``, and so on.

    Args:
        in_list: Sequence of items to index; it must support ``len()``.
        steps_in: Requested key width. A falsy value (``0``/``None``) derives
            the width from the number of items instead.
        control: Accepted for backward compatibility; it does not influence
            the result.

    Returns:
        A dict mapping each generated key to the corresponding item, in the
        order of ``in_list``.
    """
    alphabet = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    base = len(alphabet)
    key_cnt = len(in_list)

    # Either the caller's width or the number of digits of the item count.
    steps = steps_in if steps_in else _base_digits(key_cnt, base)

    # A width too small for the largest index would make keys wrap around and
    # overwrite earlier entries, so widen it just enough to keep keys unique.
    steps = max(steps, _base_digits(key_cnt - 1, base), 1)

    # Encoding the index directly (instead of stepping a per-digit counter)
    # guarantees that consecutive items always get consecutive keys.
    return {
        _encode_key(position, steps, alphabet): item
        for position, item in enumerate(in_list)
    }
# Module-level accumulator: link_find() appends every resolved link it sees.
link_box = []


def link_find(url):
    """Fetch ``url`` and describe the page together with its outgoing links.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(node1, node2)`` tuple of dicts.

        ``node1`` is the detailed node: ``URL``, ``TITLE`` (result of
        ``soup.title``), ``STRING`` (result of ``soup.description``), ``TEXT``
        (the page text), ``LINKS`` (resolved link URLs) and ``TREE`` (one stub
        dict per link with ``URL``/``TITLE``/``STRING``/``TEXT``/``LINKS``/``TREE``).

        ``node2`` is the compact node: ``URL``, ``LINK_KEY`` (left empty here),
        ``LINKS`` and ``TREE`` (stubs holding only ``URL``/``LINKS``/``TREE``).

    Raises:
        requests.exceptions.HTTPError: If the response status is not 200.
        Any exception raised by ``requests.get`` itself is propagated.

    Side effects:
        Every resolved link is also appended to the module-level ``link_box``.
    """
    # Function-scope import so the block does not depend on the file header.
    from urllib.parse import urljoin

    source = requests.get(url)
    if source.status_code != 200:
        # Fail with a message naming the page instead of falling through to
        # the return statement with the result nodes never created.
        raise requests.exceptions.HTTPError(
            f"{url} returned HTTP status {source.status_code}",
            response=source,
        )

    soup = bs4.BeautifulSoup(source.content, 'html.parser')
    node1 = {
        "URL": url,
        "TITLE": soup.title,
        "STRING": soup.description,
        "TEXT": soup.text,
        "LINKS": [],
        "TREE": [],
    }
    node2 = {"URL": url, "LINK_KEY": [], "LINKS": [], "TREE": []}

    for anchor in soup.find_all("a"):
        href = anchor.get('href')
        if href is None:
            # An <a> without an href attribute has no target to record.
            continue

        # urljoin resolves protocol-relative ("//host/x"), root-relative
        # ("/x") and document-relative ("x.html") links against the page URL
        # and leaves absolute URLs unchanged.
        uri = urljoin(url, href)

        node1['LINKS'].append(uri)
        node1['TREE'].append({
            "URL": uri,
            "TITLE": anchor.get('title'),
            "STRING": anchor.string,
            "TEXT": "",
            "LINKS": [],
            "TREE": [],
        })
        node2['TREE'].append({"URL": uri, "LINKS": [], "TREE": []})
        node2['LINKS'].append(uri)
        link_box.append(uri)

    return node1, node2
| #https://huggingface.co/spaces/Omnibus/crawl | |
def sitemap(url, level):
    """Crawl ``url`` and return its link tree plus a key map of all links.

    Args:
        url: Start page. An empty string or ``None`` yields empty results.
        level: Crawl depth. ``1`` fetches only the start page, ``>= 2`` also
            fetches every link found on it, ``>= 3`` additionally fetches the
            links found on those pages.

    Returns:
        A ``(link1, link2, uri_key)`` tuple: the detailed tree and the compact
        tree produced by ``link_find`` (with fetched pages replacing their
        stub entries in ``TREE``), and the dict ``sort_doc`` builds from the
        links collected in ``link_box``. All three are ``{}`` when no URL is
        given.

    Raises:
        Whatever ``link_find`` raises for the start page. Failures on child
        pages are printed and skipped so one bad link does not abort the crawl.
    """
    if not url:
        # Nothing to crawl; return empty results instead of failing later on
        # variables that were never assigned.
        return {}, {}, {}

    # link_find() appends to the shared module-level list. Reset it so the
    # key map describes this crawl only, not every crawl since start-up.
    link_box.clear()

    link1, link2 = link_find(url)

    if level >= 2:
        for i, child in enumerate(link1['TREE']):
            try:
                child_full, child_compact = link_find(child['URL'])
                link1['TREE'][i] = child_full
                link2['TREE'][i] = child_compact
                if level >= 3:
                    for n, grandchild in enumerate(child_full['TREE']):
                        try:
                            sub_full, sub_compact = link_find(grandchild['URL'])
                            child_full['TREE'][n] = sub_full
                            child_compact['TREE'][n] = sub_compact
                        except Exception as e:
                            # Best effort: keep the stub entry and move on.
                            print(e)
            except Exception as e:
                # Best effort: keep the stub entry and move on.
                print(e)

    uri_key = sort_doc(link_box)
    return link1, link2, uri_key
def sitemap_OG(url, level):
    """Earlier crawler variant that returns a plain list of link entries.

    Args:
        url: Start page. An empty string or ``None`` yields an empty list.
        level: Crawl depth. ``1`` lists the links of the start page, ``>= 2``
            fills each entry's ``TREE`` with the links of that page, ``>= 3``
            does the same one level further down.

    Returns:
        A list of link dicts (each with at least ``URL`` and ``TREE``).

    Raises:
        Whatever ``link_find`` raises for the start page. Failures on child
        pages are printed and skipped.
    """
    if not url:
        # Nothing to crawl; avoids returning a variable that was never set.
        return []

    # Function-scope import so the block does not depend on the file header.
    from urllib.parse import urljoin

    def _links_of(page_url):
        """Return the list of link entries found on ``page_url``."""
        found = link_find(page_url)
        # This function works on a list of link dicts. When link_find()
        # returns a (detailed_node, compact_node) tuple, that list is the
        # detailed node's TREE; a plain list result is used as-is.
        if isinstance(found, tuple):
            found = found[0]['TREE']
        return found

    link1 = _links_of(url)

    if level >= 2:
        for entry in link1:
            try:
                # Resolve each link against the page it was found on, so no
                # prefix is carried over from a previously processed link.
                entry_url = urljoin(url, entry['URL'])
                entry['TREE'] = _links_of(entry_url)
                if level >= 3:
                    for sub_entry in entry['TREE']:
                        try:
                            sub_url = urljoin(entry_url, sub_entry['URL'])
                            sub_entry['TREE'] = _links_of(sub_url)
                        except Exception as e:
                            # Best effort: leave this entry unexpanded.
                            print(e)
            except Exception as e:
                # Best effort: leave this entry unexpanded.
                print(e)

    return link1
# Gradio front end: a URL box and depth slider feed sitemap(); its three
# results are shown in three JSON panels.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a copy of the file that had lost its indentation - confirm against the
# deployed layout.
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as app:
    with gr.Row():
        with gr.Column(scale=3):
            with gr.Row():
                inp = gr.Textbox(label="URL")
                level = gr.Slider(minimum=1, maximum=2, step=1, value=1)
            btn = gr.Button()
            key_json = gr.JSON()
            outp = gr.JSON()
        with gr.Column(scale=1):
            outmap = gr.JSON()
    # sitemap() returns (detailed tree, compact tree, key map), in that order.
    btn.click(
        fn=sitemap,
        inputs=[inp, level],
        outputs=[outp, outmap, key_json],
    )
app.launch()