Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -21,10 +21,18 @@ def link_find(url):
|
|
| 21 |
|
| 22 |
q=("a","p","span","content","article")
|
| 23 |
for p in soup.find_all("a"):
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
#out.append({"URL":p.get('href'),"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","TREE":[]})
|
| 30 |
|
|
@@ -36,18 +44,19 @@ def link_find(url):
|
|
| 36 |
|
| 37 |
def sitemap(url,level):
|
| 38 |
uri=""
|
|
|
|
| 39 |
if url != "" and url != None:
|
| 40 |
link1,link2=link_find(url)
|
| 41 |
if level >=2:
|
| 42 |
for i,ea in enumerate(link1['TREE']):
|
| 43 |
print(ea)
|
| 44 |
try:
|
| 45 |
-
if not ea['URL'].startswith("http"):
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
out_list1,out_list2=link_find(f"{uri}{ea['URL']}")
|
| 52 |
link1['TREE'][i]=out_list1
|
| 53 |
link2['TREE'][i]=out_list2
|
|
@@ -57,12 +66,12 @@ def sitemap(url,level):
|
|
| 57 |
for n,na in enumerate(link1['TREE'][i]['TREE']):
|
| 58 |
print(na)
|
| 59 |
try:
|
| 60 |
-
if not na['URL'].startswith("http"):
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
out_list1,out_list2=link_find(f"{uri0}{na['URL']}")
|
| 67 |
link1['TREE'][i]['TREE'][n]=out_list1
|
| 68 |
link2['TREE'][i]['TREE'][n]=out_list2
|
|
|
|
| 21 |
|
| 22 |
q=("a","p","span","content","article")
|
| 23 |
for p in soup.find_all("a"):
|
| 24 |
+
url0=p.get('href')
|
| 25 |
+
if not url0.startswith("http"):
|
| 26 |
+
uri1=url0.split("//")[0]
|
| 27 |
+
uri2=url0.split("//")[1]
|
| 28 |
+
uri3=uri2.split("/")[0]
|
| 29 |
+
uri=f'{uri1}//{uri3}'
|
| 30 |
+
print(uri)
|
| 31 |
+
|
| 32 |
+
node1['LINKS'].append(uri)
|
| 33 |
+
node1['TREE'].append({"URL":uri,"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","LINKS":[],"TREE":[]})
|
| 34 |
+
node2['TREE'].append({"URL":uri,"LINKS":[],"TREE":[]})
|
| 35 |
+
node2['LINKS'].append(uri)
|
| 36 |
|
| 37 |
#out.append({"URL":p.get('href'),"TITLE":p.get('title'),"STRING":p.string,"TEXT":"","TREE":[]})
|
| 38 |
|
|
|
|
| 44 |
|
| 45 |
def sitemap(url,level):
|
| 46 |
uri=""
|
| 47 |
+
uri0=""
|
| 48 |
if url != "" and url != None:
|
| 49 |
link1,link2=link_find(url)
|
| 50 |
if level >=2:
|
| 51 |
for i,ea in enumerate(link1['TREE']):
|
| 52 |
print(ea)
|
| 53 |
try:
|
| 54 |
+
#if not ea['URL'].startswith("http"):
|
| 55 |
+
# uri1=url.split("//")[0]
|
| 56 |
+
# uri2=url.split("//")[1]
|
| 57 |
+
# uri3=uri2.split("/")[0]
|
| 58 |
+
# uri=f'{uri1}//{uri3}'
|
| 59 |
+
# print(uri)
|
| 60 |
out_list1,out_list2=link_find(f"{uri}{ea['URL']}")
|
| 61 |
link1['TREE'][i]=out_list1
|
| 62 |
link2['TREE'][i]=out_list2
|
|
|
|
| 66 |
for n,na in enumerate(link1['TREE'][i]['TREE']):
|
| 67 |
print(na)
|
| 68 |
try:
|
| 69 |
+
#if not na['URL'].startswith("http"):
|
| 70 |
+
# uri11=url.split("//")[0]
|
| 71 |
+
# uri22=url.split("//")[1]
|
| 72 |
+
# uri33=uri22.split("/")[0]
|
| 73 |
+
# uri0=f'{uri11}//{uri33}'
|
| 74 |
+
# print(uri0)
|
| 75 |
out_list1,out_list2=link_find(f"{uri0}{na['URL']}")
|
| 76 |
link1['TREE'][i]['TREE'][n]=out_list1
|
| 77 |
link2['TREE'][i]['TREE'][n]=out_list2
|