Spaces:
Runtime error
Runtime error
Commit
·
92bcd1d
1
Parent(s):
78f8e89
updategradiofrontend
Browse files- app.py +63 -39
- applocal.py +63 -39
- config.ini +5 -5
- huixiangdou/service/findarticles.py +90 -31
- huixiangdou/service/worker.py +3 -2
- requirements.txt +0 -10
app.py
CHANGED
@@ -167,21 +167,28 @@ def update_repo_info():
|
|
167 |
if os.path.exists(repodir):
|
168 |
pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
|
169 |
number_of_pdf = len(pdffiles)
|
|
|
170 |
if os.path.exists(os.path.join(repodir,'info.json')):
|
171 |
|
172 |
with open(os.path.join(repodir,'info.json'), 'r') as f:
|
173 |
repo_info = json.load(f)
|
174 |
|
175 |
keywords = repo_info['keywords']
|
176 |
-
length = repo_info['len']
|
177 |
retmax = repo_info['retmax']
|
178 |
-
|
|
|
|
|
179 |
|
180 |
-
|
|
|
|
|
|
|
|
|
|
|
181 |
else:
|
182 |
-
return None,None,None,None,number_of_pdf
|
183 |
else:
|
184 |
-
return None,None,None,None,None
|
185 |
|
186 |
def upload_file(files):
|
187 |
repodir, workdir, _ = get_ready('repo_work')
|
@@ -196,12 +203,11 @@ def upload_file(files):
|
|
196 |
|
197 |
return files
|
198 |
|
199 |
-
def generate_articles_repo(
|
200 |
|
201 |
-
|
202 |
-
|
203 |
-
pmids = [k for k in
|
204 |
-
keys = [k for k in string if not k.isdigit()]
|
205 |
|
206 |
repodir, _, _ = get_ready('repo_work')
|
207 |
|
@@ -225,15 +231,26 @@ def delete_articles_repo():
|
|
225 |
visible = True)
|
226 |
|
227 |
def update_repo():
|
228 |
-
keys,
|
229 |
-
|
230 |
-
|
231 |
-
|
|
|
232 |
if pdflen:
|
233 |
-
newinfo
|
234 |
else:
|
235 |
-
newinfo
|
236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
return gr.Textbox(label="文献库概况",lines =1,
|
238 |
value = newinfo,
|
239 |
visible = True)
|
@@ -464,11 +481,12 @@ def main_interface():
|
|
464 |
gr.Markdown("""
|
465 |
#### 查找文献 📚
|
466 |
|
467 |
-
1.
|
468 |
- 在“感兴趣的关键词”框中输入您感兴趣的关键词,每行一个。
|
469 |
-
- 设置查找数量(0-
|
470 |
-
-
|
471 |
-
|
|
|
472 |
2. **上传PDF**
|
473 |
- 通过“上传PDF”按钮上传您已有的PDF文献文件。
|
474 |
|
@@ -492,36 +510,43 @@ def main_interface():
|
|
492 |
""")
|
493 |
with gr.Row(equal_height=True):
|
494 |
with gr.Column(scale=1):
|
495 |
-
|
496 |
-
|
|
|
497 |
lines = 5)
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
507 |
file_output = gr.File(scale=2)
|
508 |
upload_button = gr.UploadButton("上传PDF",
|
509 |
-
file_types=[".pdf"
|
510 |
-
file_count="multiple",scale=
|
511 |
|
512 |
with gr.Row(equal_height=True):
|
513 |
with gr.Column(scale=0):
|
514 |
delete_repo_button = gr.Button("删除文献库")
|
515 |
update_repo_button = gr.Button("更新文献库情况")
|
516 |
with gr.Column(scale=2):
|
517 |
-
|
518 |
-
|
519 |
|
520 |
generate_repo_button.click(generate_articles_repo,
|
521 |
-
inputs=[input_keys,retmax],
|
522 |
outputs = [repo_summary])
|
523 |
|
524 |
-
|
525 |
delete_repo_button.click(delete_articles_repo, inputs=None,
|
526 |
outputs = repo_summary)
|
527 |
update_repo_button.click(update_repo, inputs=None,
|
@@ -535,7 +560,6 @@ def main_interface():
|
|
535 |
minimum=128, maximum=4096,value=1024,step=1,
|
536 |
interactive=True)
|
537 |
ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
|
538 |
-
# default=["20", "50", '100'],
|
539 |
label="Number of Clusters",
|
540 |
info="How many Clusters you want to generate")
|
541 |
|
|
|
167 |
if os.path.exists(repodir):
|
168 |
pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
|
169 |
number_of_pdf = len(pdffiles)
|
170 |
+
# 判断info.json是否存在
|
171 |
if os.path.exists(os.path.join(repodir,'info.json')):
|
172 |
|
173 |
with open(os.path.join(repodir,'info.json'), 'r') as f:
|
174 |
repo_info = json.load(f)
|
175 |
|
176 |
keywords = repo_info['keywords']
|
|
|
177 |
retmax = repo_info['retmax']
|
178 |
+
search_len = len(repo_info['search_pmids'])
|
179 |
+
import_len = len(repo_info['import_pmids'])
|
180 |
+
failed_pmid_len = len(repo_info['failed_pmids'])
|
181 |
|
182 |
+
pmc_success = repo_info['pmc_success_d']
|
183 |
+
scihub_success = repo_info['scihub_success_d']
|
184 |
+
failed_download = repo_info['failed_download']
|
185 |
+
|
186 |
+
number_of_upload = number_of_pdf-scihub_success
|
187 |
+
return keywords, retmax, search_len, import_len, failed_pmid_len, pmc_success, scihub_success, number_of_pdf, failed_download, number_of_upload
|
188 |
else:
|
189 |
+
return None,None,None,None,None,None,None,None,None,number_of_pdf
|
190 |
else:
|
191 |
+
return None,None,None,None,None,None,None,None,None,None
|
192 |
|
193 |
def upload_file(files):
|
194 |
repodir, workdir, _ = get_ready('repo_work')
|
|
|
203 |
|
204 |
return files
|
205 |
|
206 |
+
def generate_articles_repo(keys:str,pmids,retmax:int):
|
207 |
|
208 |
+
keys = [k.strip() for k in keys.split('\n')]
|
209 |
+
pmids = [k.strip() for k in pmids.split('\n')]
|
210 |
+
pmids = [k for k in pmids if k.isdigit()]
|
|
|
211 |
|
212 |
repodir, _, _ = get_ready('repo_work')
|
213 |
|
|
|
231 |
visible = True)
|
232 |
|
233 |
def update_repo():
    """Build the repository summary text and return it as a Gradio textbox.

    Reads the current repository state from ``update_repo_info()`` and
    formats it into the "文献库概况" textbox. Two layouts are produced:

    * ``keys is None`` -- no search metadata is available, so only the
      uploaded-PDF count (or a "no uploads" notice) is reported.
    * otherwise -- the keyword search settings, PMID counts, download
      success counts, failed IDs and the uploaded-PDF count are listed.

    Returns:
        gr.Textbox: a visible textbox whose value is the summary string.
    """
    # update_repo_info() returns a 10-tuple. Element 8 is the total number of
    # PDFs in the repo and element 10 is the count reported as "uploaded"
    # below. The original code bound both to the same name, so only the last
    # one was ever used; the unused slots are named explicitly here.
    (keys, retmax, search_len, import_len, _failed_pmid_len,
     pmc_success, scihub_success, _total_pdf, failed,
     pdflen) = update_repo_info()

    if keys is None:
        parts = [
            '无关键词搜索相关信息\n',
            '无导入的PMID\n',
            f'上传的PDF数量: {pdflen}\n' if pdflen else '无上传的PDF\n',
        ]
    else:
        parts = [
            '关键词搜索:',
            f' 关键词: {keys}\n',
            f' 搜索上限: {retmax}\n',
            f' 搜索到的PMID数量: {search_len}\n',
            f'导入的PMID数量: {import_len}\n',
            f'成功获取PMC全文数量: {pmc_success}\n',
            f'成功获取SciHub全文数量: {scihub_success}\n',
            f"下载失败的ID: {failed}\n",
            f'上传的PDF数量: {pdflen}\n',
        ]

    newinfo = "".join(parts)
    return gr.Textbox(label="文献库概况",lines =1,
                      value = newinfo,
                      visible = True)
|
|
|
481 |
gr.Markdown("""
|
482 |
#### 查找文献 📚
|
483 |
|
484 |
+
1. **输入关键词或PMID批量PubMed PMC文献**
|
485 |
- 在“感兴趣的关键词”框中输入您感兴趣的关键词,每行一个。
|
486 |
+
- 设置查找数量(0-500)。
|
487 |
+
- 在“输入PMID”框中输入在PubMed中导出的PMID,每行一个。
|
488 |
+
- 点击“搜索PubMed 并拉取全文”按钮进行文献查找。目前主要基于PMC数据库和scihub, 在PMC中未收录的文献将使用scihub下载,scihub近年文献未收录
|
489 |
+
|
490 |
2. **上传PDF**
|
491 |
- 通过“上传PDF”按钮上传您已有的PDF文献文件。
|
492 |
|
|
|
510 |
""")
|
511 |
with gr.Row(equal_height=True):
|
512 |
with gr.Column(scale=1):
|
513 |
+
with gr.Row():
|
514 |
+
with gr.Column(scale=1):
|
515 |
+
input_keys = gr.Textbox(label="感兴趣的关键词, 换行分隔, 不太好用别用等我改改",
|
516 |
lines = 5)
|
517 |
+
retmax = gr.Slider(
|
518 |
+
minimum=0,
|
519 |
+
maximum=500,
|
520 |
+
value=250,
|
521 |
+
interactive=True,
|
522 |
+
label="搜索上限",
|
523 |
+
info="How many articles you want to retrieve?"
|
524 |
+
)
|
525 |
+
|
526 |
+
with gr.Column(scale=1):
|
527 |
+
input_pmids = gr.Textbox(label="输入PMID, 换行分隔",
|
528 |
+
lines = 5)
|
529 |
+
|
530 |
+
generate_repo_button = gr.Button("搜索PubMed并拉取全文")
|
531 |
+
|
532 |
+
with gr.Column(scale=1):
|
533 |
file_output = gr.File(scale=2)
|
534 |
upload_button = gr.UploadButton("上传PDF",
|
535 |
+
file_types=[".pdf"],
|
536 |
+
file_count="multiple",scale=1)
|
537 |
|
538 |
with gr.Row(equal_height=True):
|
539 |
with gr.Column(scale=0):
|
540 |
delete_repo_button = gr.Button("删除文献库")
|
541 |
update_repo_button = gr.Button("更新文献库情况")
|
542 |
with gr.Column(scale=2):
|
543 |
+
repo_summary =gr.Textbox(label= '文献库概况',
|
544 |
+
value="目前还没有文献库")
|
545 |
|
546 |
generate_repo_button.click(generate_articles_repo,
|
547 |
+
inputs=[input_keys,input_pmids,retmax],
|
548 |
outputs = [repo_summary])
|
549 |
|
|
|
550 |
delete_repo_button.click(delete_articles_repo, inputs=None,
|
551 |
outputs = repo_summary)
|
552 |
update_repo_button.click(update_repo, inputs=None,
|
|
|
560 |
minimum=128, maximum=4096,value=1024,step=1,
|
561 |
interactive=True)
|
562 |
ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
|
|
|
563 |
label="Number of Clusters",
|
564 |
info="How many Clusters you want to generate")
|
565 |
|
applocal.py
CHANGED
@@ -167,21 +167,28 @@ def update_repo_info():
|
|
167 |
if os.path.exists(repodir):
|
168 |
pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
|
169 |
number_of_pdf = len(pdffiles)
|
|
|
170 |
if os.path.exists(os.path.join(repodir,'info.json')):
|
171 |
|
172 |
with open(os.path.join(repodir,'info.json'), 'r') as f:
|
173 |
repo_info = json.load(f)
|
174 |
|
175 |
keywords = repo_info['keywords']
|
176 |
-
length = repo_info['len']
|
177 |
retmax = repo_info['retmax']
|
178 |
-
|
|
|
|
|
179 |
|
180 |
-
|
|
|
|
|
|
|
|
|
|
|
181 |
else:
|
182 |
-
return None,None,None,None,number_of_pdf
|
183 |
else:
|
184 |
-
return None,None,None,None,None
|
185 |
|
186 |
def upload_file(files):
|
187 |
repodir, workdir, _ = get_ready('repo_work')
|
@@ -196,12 +203,11 @@ def upload_file(files):
|
|
196 |
|
197 |
return files
|
198 |
|
199 |
-
def generate_articles_repo(
|
200 |
|
201 |
-
|
202 |
-
|
203 |
-
pmids = [k for k in
|
204 |
-
keys = [k for k in string if not k.isdigit()]
|
205 |
|
206 |
repodir, _, _ = get_ready('repo_work')
|
207 |
|
@@ -225,15 +231,26 @@ def delete_articles_repo():
|
|
225 |
visible = True)
|
226 |
|
227 |
def update_repo():
|
228 |
-
keys,
|
229 |
-
|
230 |
-
|
231 |
-
|
|
|
232 |
if pdflen:
|
233 |
-
newinfo
|
234 |
else:
|
235 |
-
newinfo
|
236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
237 |
return gr.Textbox(label="文献库概况",lines =1,
|
238 |
value = newinfo,
|
239 |
visible = True)
|
@@ -464,11 +481,12 @@ def main_interface():
|
|
464 |
gr.Markdown("""
|
465 |
#### 查找文献 📚
|
466 |
|
467 |
-
1.
|
468 |
- 在“感兴趣的关键词”框中输入您感兴趣的关键词,每行一个。
|
469 |
-
- 设置查找数量(0-
|
470 |
-
-
|
471 |
-
|
|
|
472 |
2. **上传PDF**
|
473 |
- 通过“上传PDF”按钮上传您已有的PDF文献文件。
|
474 |
|
@@ -492,36 +510,43 @@ def main_interface():
|
|
492 |
""")
|
493 |
with gr.Row(equal_height=True):
|
494 |
with gr.Column(scale=1):
|
495 |
-
|
496 |
-
|
|
|
497 |
lines = 5)
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
507 |
file_output = gr.File(scale=2)
|
508 |
upload_button = gr.UploadButton("上传PDF",
|
509 |
-
file_types=[".pdf"
|
510 |
-
file_count="multiple",scale=
|
511 |
|
512 |
with gr.Row(equal_height=True):
|
513 |
with gr.Column(scale=0):
|
514 |
delete_repo_button = gr.Button("删除文献库")
|
515 |
update_repo_button = gr.Button("更新文献库情况")
|
516 |
with gr.Column(scale=2):
|
517 |
-
|
518 |
-
|
519 |
|
520 |
generate_repo_button.click(generate_articles_repo,
|
521 |
-
inputs=[input_keys,retmax],
|
522 |
outputs = [repo_summary])
|
523 |
|
524 |
-
|
525 |
delete_repo_button.click(delete_articles_repo, inputs=None,
|
526 |
outputs = repo_summary)
|
527 |
update_repo_button.click(update_repo, inputs=None,
|
@@ -535,7 +560,6 @@ def main_interface():
|
|
535 |
minimum=128, maximum=4096,value=1024,step=1,
|
536 |
interactive=True)
|
537 |
ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
|
538 |
-
# default=["20", "50", '100'],
|
539 |
label="Number of Clusters",
|
540 |
info="How many Clusters you want to generate")
|
541 |
|
|
|
167 |
if os.path.exists(repodir):
|
168 |
pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
|
169 |
number_of_pdf = len(pdffiles)
|
170 |
+
# 判断info.json是否存在
|
171 |
if os.path.exists(os.path.join(repodir,'info.json')):
|
172 |
|
173 |
with open(os.path.join(repodir,'info.json'), 'r') as f:
|
174 |
repo_info = json.load(f)
|
175 |
|
176 |
keywords = repo_info['keywords']
|
|
|
177 |
retmax = repo_info['retmax']
|
178 |
+
search_len = len(repo_info['search_pmids'])
|
179 |
+
import_len = len(repo_info['import_pmids'])
|
180 |
+
failed_pmid_len = len(repo_info['failed_pmids'])
|
181 |
|
182 |
+
pmc_success = repo_info['pmc_success_d']
|
183 |
+
scihub_success = repo_info['scihub_success_d']
|
184 |
+
failed_download = repo_info['failed_download']
|
185 |
+
|
186 |
+
number_of_upload = number_of_pdf-scihub_success
|
187 |
+
return keywords, retmax, search_len, import_len, failed_pmid_len, pmc_success, scihub_success, number_of_pdf, failed_download, number_of_upload
|
188 |
else:
|
189 |
+
return None,None,None,None,None,None,None,None,None,number_of_pdf
|
190 |
else:
|
191 |
+
return None,None,None,None,None,None,None,None,None,None
|
192 |
|
193 |
def upload_file(files):
|
194 |
repodir, workdir, _ = get_ready('repo_work')
|
|
|
203 |
|
204 |
return files
|
205 |
|
206 |
+
def generate_articles_repo(keys:str,pmids,retmax:int):
|
207 |
|
208 |
+
keys = [k.strip() for k in keys.split('\n')]
|
209 |
+
pmids = [k.strip() for k in pmids.split('\n')]
|
210 |
+
pmids = [k for k in pmids if k.isdigit()]
|
|
|
211 |
|
212 |
repodir, _, _ = get_ready('repo_work')
|
213 |
|
|
|
231 |
visible = True)
|
232 |
|
233 |
def update_repo():
    """Build the repository summary text and return it as a Gradio textbox.

    Reads the current repository state from ``update_repo_info()`` and
    formats it into the "文献库概况" textbox. Two layouts are produced:

    * ``keys is None`` -- no search metadata is available, so only the
      uploaded-PDF count (or a "no uploads" notice) is reported.
    * otherwise -- the keyword search settings, PMID counts, download
      success counts, failed IDs and the uploaded-PDF count are listed.

    Returns:
        gr.Textbox: a visible textbox whose value is the summary string.
    """
    # update_repo_info() returns a 10-tuple. Element 8 is the total number of
    # PDFs in the repo and element 10 is the count reported as "uploaded"
    # below. The original code bound both to the same name, so only the last
    # one was ever used; the unused slots are named explicitly here.
    (keys, retmax, search_len, import_len, _failed_pmid_len,
     pmc_success, scihub_success, _total_pdf, failed,
     pdflen) = update_repo_info()

    if keys is None:
        parts = [
            '无关键词搜索相关信息\n',
            '无导入的PMID\n',
            f'上传的PDF数量: {pdflen}\n' if pdflen else '无上传的PDF\n',
        ]
    else:
        parts = [
            '关键词搜索:',
            f' 关键词: {keys}\n',
            f' 搜索上限: {retmax}\n',
            f' 搜索到的PMID数量: {search_len}\n',
            f'导入的PMID数量: {import_len}\n',
            f'成功获取PMC全文数量: {pmc_success}\n',
            f'成功获取SciHub全文数量: {scihub_success}\n',
            f"下载失败的ID: {failed}\n",
            f'上传的PDF数量: {pdflen}\n',
        ]

    newinfo = "".join(parts)
    return gr.Textbox(label="文献库概况",lines =1,
                      value = newinfo,
                      visible = True)
|
|
|
481 |
gr.Markdown("""
|
482 |
#### 查找文献 📚
|
483 |
|
484 |
+
1. **输入关键词或PMID批量PubMed PMC文献**
|
485 |
- 在“感兴趣的关键词”框中输入您感兴趣的关键词,每行一个。
|
486 |
+
- 设置查找数量(0-500)。
|
487 |
+
- 在“输入PMID”框中输入在PubMed中导出的PMID,每行一个。
|
488 |
+
- 点击“搜索PubMed 并拉取全文”按钮进行文献查找。目前主要基于PMC数据库和scihub, 在PMC中未收录的文献将使用scihub下载,scihub近年文献未收录
|
489 |
+
|
490 |
2. **上传PDF**
|
491 |
- 通过“上传PDF”按钮上传您已有的PDF文献文件。
|
492 |
|
|
|
510 |
""")
|
511 |
with gr.Row(equal_height=True):
|
512 |
with gr.Column(scale=1):
|
513 |
+
with gr.Row():
|
514 |
+
with gr.Column(scale=1):
|
515 |
+
input_keys = gr.Textbox(label="感兴趣的关键词, 换行分隔, 不太好用别用等我改改",
|
516 |
lines = 5)
|
517 |
+
retmax = gr.Slider(
|
518 |
+
minimum=0,
|
519 |
+
maximum=500,
|
520 |
+
value=250,
|
521 |
+
interactive=True,
|
522 |
+
label="搜索上限",
|
523 |
+
info="How many articles you want to retrieve?"
|
524 |
+
)
|
525 |
+
|
526 |
+
with gr.Column(scale=1):
|
527 |
+
input_pmids = gr.Textbox(label="输入PMID, 换行分隔",
|
528 |
+
lines = 5)
|
529 |
+
|
530 |
+
generate_repo_button = gr.Button("搜索PubMed并拉取全文")
|
531 |
+
|
532 |
+
with gr.Column(scale=1):
|
533 |
file_output = gr.File(scale=2)
|
534 |
upload_button = gr.UploadButton("上传PDF",
|
535 |
+
file_types=[".pdf"],
|
536 |
+
file_count="multiple",scale=1)
|
537 |
|
538 |
with gr.Row(equal_height=True):
|
539 |
with gr.Column(scale=0):
|
540 |
delete_repo_button = gr.Button("删除文献库")
|
541 |
update_repo_button = gr.Button("更新文献库情况")
|
542 |
with gr.Column(scale=2):
|
543 |
+
repo_summary =gr.Textbox(label= '文献库概况',
|
544 |
+
value="目前还没有文献库")
|
545 |
|
546 |
generate_repo_button.click(generate_articles_repo,
|
547 |
+
inputs=[input_keys,input_pmids,retmax],
|
548 |
outputs = [repo_summary])
|
549 |
|
|
|
550 |
delete_repo_button.click(delete_articles_repo, inputs=None,
|
551 |
outputs = repo_summary)
|
552 |
update_repo_button.click(update_repo, inputs=None,
|
|
|
560 |
minimum=128, maximum=4096,value=1024,step=1,
|
561 |
interactive=True)
|
562 |
ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
|
|
|
563 |
label="Number of Clusters",
|
564 |
info="How many Clusters you want to generate")
|
565 |
|
config.ini
CHANGED
@@ -4,7 +4,7 @@ embedding_model_path = "/root/models/bce-embedding-base_v1"
|
|
4 |
reranker_model_path = "/root/models/bce-reranker-base_v1"
|
5 |
repo_dir = "repodir"
|
6 |
work_dir = "workdir"
|
7 |
-
n_clusters = [
|
8 |
chunk_size = 1024
|
9 |
|
10 |
[web_search]
|
@@ -13,7 +13,7 @@ domain_partial_order = ["openai.com", "pytorch.org", "readthedocs.io", "nvidia.c
|
|
13 |
save_dir = "logs/web_search_result"
|
14 |
|
15 |
[llm]
|
16 |
-
enable_local =
|
17 |
enable_remote = 1
|
18 |
client_url = "http://127.0.0.1:8888/inference"
|
19 |
|
@@ -21,11 +21,11 @@ client_url = "http://127.0.0.1:8888/inference"
|
|
21 |
local_llm_path = "/root/models/Qwen1.5-7B-Chat"
|
22 |
local_llm_max_text_length = 32000
|
23 |
local_llm_bind_port = 8888
|
24 |
-
remote_type = "
|
25 |
-
remote_api_key = "
|
26 |
remote_base_url = ""
|
27 |
remote_llm_max_text_length = 32000
|
28 |
-
remote_llm_model = "
|
29 |
rpm = 500
|
30 |
|
31 |
[worker]
|
|
|
4 |
reranker_model_path = "/root/models/bce-reranker-base_v1"
|
5 |
repo_dir = "repodir"
|
6 |
work_dir = "workdir"
|
7 |
+
n_clusters = [10, 20]
|
8 |
chunk_size = 1024
|
9 |
|
10 |
[web_search]
|
|
|
13 |
save_dir = "logs/web_search_result"
|
14 |
|
15 |
[llm]
|
16 |
+
enable_local = 1
|
17 |
enable_remote = 1
|
18 |
client_url = "http://127.0.0.1:8888/inference"
|
19 |
|
|
|
21 |
local_llm_path = "/root/models/Qwen1.5-7B-Chat"
|
22 |
local_llm_max_text_length = 32000
|
23 |
local_llm_bind_port = 8888
|
24 |
+
remote_type = ""
|
25 |
+
remote_api_key = ""
|
26 |
remote_base_url = ""
|
27 |
remote_llm_max_text_length = 32000
|
28 |
+
remote_llm_model = ""
|
29 |
rpm = 500
|
30 |
|
31 |
[worker]
|
huixiangdou/service/findarticles.py
CHANGED
@@ -7,19 +7,56 @@ import json
|
|
7 |
import shutil
|
8 |
from loguru import logger
|
9 |
from lxml import etree
|
10 |
-
import
|
11 |
-
from
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
|
25 |
class ArticleRetrieval:
|
@@ -83,11 +120,15 @@ class ArticleRetrieval:
|
|
83 |
response = requests.get(base_url, params=params)
|
84 |
root = ET.fromstring(response.content)
|
85 |
idlist = root.find('.//IdList')
|
86 |
-
|
|
|
|
|
|
|
|
|
87 |
print(f"Found {len(pmids)} articles for keywords {self.keywords}.")
|
|
|
88 |
self.pmids.extend(pmids)
|
89 |
|
90 |
-
|
91 |
# 解析XML文件
|
92 |
def _get_all_text(self, element):
|
93 |
"""递归获取XML元素及其所有子元素的文本内容。确保element不为None."""
|
@@ -115,8 +156,16 @@ class ArticleRetrieval:
|
|
115 |
if not os.path.exists(self.repo_dir):
|
116 |
os.makedirs(self.repo_dir)
|
117 |
print(f"Saving articles to {self.repo_dir}.")
|
118 |
-
self.
|
|
|
|
|
|
|
119 |
for id in tqdm(self.pmc_ids, desc="Fetching full texts", unit="article"):
|
|
|
|
|
|
|
|
|
|
|
120 |
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
121 |
params = {
|
122 |
"db": "pmc",
|
@@ -127,20 +176,33 @@ class ArticleRetrieval:
|
|
127 |
response = requests.get(base_url, params=params)
|
128 |
full_text = self._clean_xml(response.text)
|
129 |
if full_text.strip() == '':
|
|
|
130 |
continue
|
131 |
else:
|
132 |
-
logger.info(full_text[:
|
133 |
with open(os.path.join(self.repo_dir,f'{id}.txt'), 'w') as f:
|
134 |
f.write(full_text)
|
135 |
-
self.
|
136 |
for doi in tqdm(self.scihub_doi, desc="Fetching full texts", unit="article"):
|
137 |
-
|
138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
|
140 |
def save_config(self):
|
141 |
config = {
|
142 |
-
'keywords': self.keywords,
|
143 |
'repo_dir': self.repo_dir,
|
|
|
|
|
|
|
|
|
|
|
144 |
'result': [
|
145 |
{
|
146 |
'pmid': r[0],
|
@@ -148,9 +210,10 @@ class ArticleRetrieval:
|
|
148 |
'doi': r[2]
|
149 |
} for r in self.esummary
|
150 |
],
|
151 |
-
|
152 |
-
|
153 |
-
|
|
|
154 |
}
|
155 |
with open(os.path.join(self.repo_dir, 'info.json'), 'w') as f:
|
156 |
json.dump(config, f, indent=4, ensure_ascii=False)
|
@@ -169,14 +232,10 @@ if __name__ == '__main__':
|
|
169 |
shutil.rmtree('repodir')
|
170 |
|
171 |
strings = """
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
38398096
|
177 |
-
38255885
|
178 |
-
38035547
|
179 |
-
38734498"""
|
180 |
string = [k.strip() for k in strings.split('\n')]
|
181 |
|
182 |
pmids = [k for k in string if k.isdigit()]
|
|
|
7 |
import shutil
|
8 |
from loguru import logger
|
9 |
from lxml import etree
|
10 |
+
import requests
|
11 |
+
from bs4 import BeautifulSoup
|
12 |
+
import os
|
13 |
+
|
14 |
+
def download_pdfs(path, doi_list, timeout=30): #fox dalao contribution https://github.com/BigWhiteFox
    """Download PDFs for one or more DOIs from sci-hub.se into ``path``.

    For each DOI the Sci-Hub landing page is fetched, every ``<button>`` with
    an ``onclick`` attribute is inspected, and the first single-quoted value
    inside that attribute is taken as the PDF link. Each link is then
    streamed to ``<path>/<doi with '/' replaced by '_'>.pdf``.

    Args:
        path: Directory to save into; created if it does not exist.
        doi_list: A single DOI string or an iterable of DOI strings.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        bool: True if at least one PDF was written to disk, False otherwise.
        Callers test this value for truthiness to count successes; the
        function previously fell off the end and always returned None.
    """
    # Function-scope import so this block does not depend on edits to the
    # module-level import list.
    from urllib.parse import urljoin

    # Make sure the download directory exists.
    os.makedirs(path, exist_ok=True)
    if isinstance(doi_list, str):
        doi_list = [doi_list]
    href_list = []

    for doi in doi_list:
        url = f"https://sci-hub.se/{doi}"
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as e:
            # A network error on one DOI must not abort the remaining ones.
            print(f"Failed to download due to an exception: {e}")
            continue

        # Check whether the request succeeded.
        if response.status_code == 200:
            print(f"成功请求:{url}")
        else:
            print(f"请求失败:{url},状态码:{response.status_code}")
            continue  # Request failed: skip this DOI.

        soup = BeautifulSoup(response.text, 'html.parser')
        buttons = soup.find_all('button', onclick=True)

        for button in buttons:
            onclick = button.get('onclick')
            if not onclick:
                continue
            pieces = onclick.split("'")
            if len(pieces) < 2:
                # No quoted URL inside this onclick handler; nothing to fetch.
                continue
            pdf_url = pieces[1]
            # urljoin resolves protocol-relative ("//host/x"), absolute-path
            # ("/x") and already-absolute links against the page URL.
            href_list.append((urljoin(url, pdf_url), doi))
            print("pdf_url:", pdf_url)
            print("href_list:", href_list)

    saved_any = False
    # Fetch every PDF link collected above.
    for pdf_url, doi in href_list:
        try:
            response = requests.get(pdf_url, stream=True, timeout=timeout)
            if response.status_code == 200:
                filename = doi.replace("/", "_") + ".pdf"
                file_path = os.path.join(path, filename)
                with open(file_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                print(f"File downloaded and saved as: {file_path}")
                saved_any = True
            else:
                print(f"Download failed, Status Code: {response.status_code}, URL: {pdf_url}")
        except requests.RequestException as e:
            print(f"Failed to download due to an exception: {e}")

    return saved_any
|
60 |
|
61 |
|
62 |
class ArticleRetrieval:
|
|
|
120 |
response = requests.get(base_url, params=params)
|
121 |
root = ET.fromstring(response.content)
|
122 |
idlist = root.find('.//IdList')
|
123 |
+
try:
|
124 |
+
pmids = [id_element.text for id_element in idlist.findall('.//Id')]
|
125 |
+
except:
|
126 |
+
pmids = []
|
127 |
+
|
128 |
print(f"Found {len(pmids)} articles for keywords {self.keywords}.")
|
129 |
+
self.search_pmid = pmids
|
130 |
self.pmids.extend(pmids)
|
131 |
|
|
|
132 |
# 解析XML文件
|
133 |
def _get_all_text(self, element):
|
134 |
"""递归获取XML元素及其所有子元素的文本内容。确保element不为None."""
|
|
|
156 |
if not os.path.exists(self.repo_dir):
|
157 |
os.makedirs(self.repo_dir)
|
158 |
print(f"Saving articles to {self.repo_dir}.")
|
159 |
+
self.pmc_success = 0
|
160 |
+
self.scihub_success = 0
|
161 |
+
self.failed_download = []
|
162 |
+
downloaded = os.listdir(self.repo_dir)
|
163 |
for id in tqdm(self.pmc_ids, desc="Fetching full texts", unit="article"):
|
164 |
+
# check if file already downloaded
|
165 |
+
if f"{id}.txt" in downloaded:
|
166 |
+
print(f"File already downloaded: {id}")
|
167 |
+
self.pmc_success += 1
|
168 |
+
continue
|
169 |
base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
|
170 |
params = {
|
171 |
"db": "pmc",
|
|
|
176 |
response = requests.get(base_url, params=params)
|
177 |
full_text = self._clean_xml(response.text)
|
178 |
if full_text.strip() == '':
|
179 |
+
self.failed_download.append(id)
|
180 |
continue
|
181 |
else:
|
182 |
+
logger.info(full_text[:200])
|
183 |
with open(os.path.join(self.repo_dir,f'{id}.txt'), 'w') as f:
|
184 |
f.write(full_text)
|
185 |
+
self.pmc_success += 1
|
186 |
for doi in tqdm(self.scihub_doi, desc="Fetching full texts", unit="article"):
|
187 |
+
# check if file already downloaded
|
188 |
+
if f"{doi.replace('/','_')}.pdf" in downloaded:
|
189 |
+
print(f"File already downloaded: {doi}")
|
190 |
+
self.scihub_success += 1
|
191 |
+
continue
|
192 |
+
|
193 |
+
if download_pdfs(path=self.repo_dir,doi_list = doi):
|
194 |
+
self.scihub_success += 1
|
195 |
+
else:
|
196 |
+
self.failed_download.append(doi)
|
197 |
|
198 |
def save_config(self):
|
199 |
config = {
|
|
|
200 |
'repo_dir': self.repo_dir,
|
201 |
+
'keywords': self.keywords,
|
202 |
+
'retmax': self.retmax,
|
203 |
+
"search_pmids": self.search_pmid,
|
204 |
+
'import_pmids': [id for id in self.pmids if id not in self.search_pmid],
|
205 |
+
'failed_pmids': self.failed_pmids,
|
206 |
'result': [
|
207 |
{
|
208 |
'pmid': r[0],
|
|
|
210 |
'doi': r[2]
|
211 |
} for r in self.esummary
|
212 |
],
|
213 |
+
"pmc_success_d": self.pmc_success,
|
214 |
+
"scihub_success_d": self.scihub_success,
|
215 |
+
"failed_download": self.failed_download,
|
216 |
+
|
217 |
}
|
218 |
with open(os.path.join(self.repo_dir, 'info.json'), 'w') as f:
|
219 |
json.dump(config, f, indent=4, ensure_ascii=False)
|
|
|
232 |
shutil.rmtree('repodir')
|
233 |
|
234 |
strings = """
|
235 |
+
34536239
|
236 |
+
7760895
|
237 |
+
36109602
|
238 |
+
24766875"""
|
|
|
|
|
|
|
|
|
239 |
string = [k.strip() for k in strings.split('\n')]
|
240 |
|
241 |
pmids = [k for k in string if k.isdigit()]
|
huixiangdou/service/worker.py
CHANGED
@@ -25,6 +25,7 @@ def convertid2url(text):
|
|
25 |
# Replace all occurrences in the text
|
26 |
formatted_text = re.sub(pattern, replacement, text)
|
27 |
return formatted_text
|
|
|
28 |
class Worker:
|
29 |
"""The Worker class orchestrates the logic of handling user queries,
|
30 |
generating responses and managing several aspects of a chat assistant. It
|
@@ -82,7 +83,7 @@ class Worker:
|
|
82 |
self.PERPLESITY_TEMPLATE = '“question:{} answer:{}”\n阅读以上对话,answer 是否在表达自己不知道,回答越全面得分越少,用0~10表示,不要解释直接给出得分。\n判断标准:准确回答问题得 0 分;答案详尽得 1 分;知道部分答案但有不确定信息得 8 分;知道小部分答案但推荐求助其他人得 9 分;不知道任何答案直接推荐求助别人得 10 分。直接打分不要解释。' # noqa E501
|
83 |
self.SUMMARIZE_TEMPLATE = '{} \n 仔细阅读以上内容,总结得简短有力点' # noqa E501
|
84 |
self.GENERATE_TEMPLATE = '材料:“{}”\n 问题:“{}” \n 请仔细阅读参考材料回答问题,材料可能和问题无关。如果材料和问题无关,尝试用你自己的理解来回答问题。如果无法确定答案,直接回答不知道。' # noqa E501
|
85 |
-
self.GENERATE_TEMPLATE = '材料:“{}”\n 问题:“{}” \n
|
86 |
self.ANNOTATE_CLUSTER = '这是关于{}的不同论文的分块句子,它们通过相似性进行了聚类,以下是其中一个聚类的10个样本:“{}”\n 请用一句话标注这个聚类。' # noqa E501
|
87 |
self.INSPIRATION_TEMPLATE = '以下是一些有关{0}的文章内容的总结 {1},请提出一个关于{0}的综述子问题,一个问题即可。'
|
88 |
else:
|
@@ -93,7 +94,7 @@ class Worker:
|
|
93 |
self.SECURITY_TEMAPLTE = 'Evaluate whether the following sentence involves prohibited content such as politics, insult, pornography, terror, religion, cyber violence, racial discrimination, etc., rate it on a scale of 0-10, do not explain, just give the score. The scoring standard is as follows: any violation directly gets 10 points; completely unrelated gets 0 points. Give the score, no explanation: "{}"' # noqa E501
|
94 |
self.PERPLESITY_TEMPLATE = 'Question: {} Answer: {}\nRead the dialogue above, does the answer express that they don\'t know? The more comprehensive the answer, the lower the score. Rate it on a scale of 0-10, no explanation, just give the score.\nThe scoring standard is as follows: an accurate answer to the question gets 0 points; a detailed answer gets 1 point; knowing some answers but having uncertain information gets 8 points; knowing a small part of the answer but recommends seeking help from others gets 9 points; not knowing any of the answers and directly recommending asking others for help gets 10 points. Just give the score, no explanation.' # noqa E501
|
95 |
self.SUMMARIZE_TEMPLATE = '"{}" \n Read the content above carefully, summarize it in a short and powerful way.' # noqa E501
|
96 |
-
self.GENERATE_TEMPLATE = 'Background Information: "{}"\n Question: "{}"\n Please read the reference material carefully and answer the question
|
97 |
self.ANNOTATE_CLUSTER = 'these are chunklized sentences from different papers about{}, they are clustered by similarity, the following is 10 samples from one of the cluster: "{}"\n Please tag the cluster in one breif sentence.'
|
98 |
self.INSPIRATION_TEMPLATE = 'Given the following summary of the articles content about {0} {1}, give some idea or sub-questions of the review about {0}, one question is sufficient.' # noqa E501
|
99 |
|
|
|
25 |
# Replace all occurrences in the text
|
26 |
formatted_text = re.sub(pattern, replacement, text)
|
27 |
return formatted_text
|
28 |
+
|
29 |
class Worker:
|
30 |
"""The Worker class orchestrates the logic of handling user queries,
|
31 |
generating responses and managing several aspects of a chat assistant. It
|
|
|
83 |
self.PERPLESITY_TEMPLATE = '“question:{} answer:{}”\n阅读以上对话,answer 是否在表达自己不知道,回答越全面得分越少,用0~10表示,不要解释直接给出得分。\n判断标准:准确回答问题得 0 分;答案详尽得 1 分;知道部分答案但有不确定信息得 8 分;知道小部分答案但推荐求助其他人得 9 分;不知道任何答案直接推荐求助别人得 10 分。直接打分不要解释。' # noqa E501
|
84 |
self.SUMMARIZE_TEMPLATE = '{} \n 仔细阅读以上内容,总结得简短有力点' # noqa E501
|
85 |
self.GENERATE_TEMPLATE = '材料:“{}”\n 问题:“{}” \n 请仔细阅读参考材料回答问题,材料可能和问题无关。如果材料和问题无关,尝试用你自己的理解来回答问题。如果无法确定答案,直接回答不知道。' # noqa E501
|
86 |
+
self.GENERATE_TEMPLATE = '材料:“{}”\n 问题:“{}” \n 请仔细阅读参考材料回答问题,回答中附上对应内容的参考id,例如:治疗方法的主要决定因素是年龄、合并症和诊断分子特征[PMC9958586]' # yyj
|
87 |
self.ANNOTATE_CLUSTER = '这是关于{}的不同论文的分块句子,它们通过相似性进行了聚类,以下是其中一个聚类的10个样本:“{}”\n 请用一句话标注这个聚类。' # noqa E501
|
88 |
self.INSPIRATION_TEMPLATE = '以下是一些有关{0}的文章内容的总结 {1},请提出一个关于{0}的综述子问题,一个问题即可。'
|
89 |
else:
|
|
|
94 |
self.SECURITY_TEMAPLTE = 'Evaluate whether the following sentence involves prohibited content such as politics, insult, pornography, terror, religion, cyber violence, racial discrimination, etc., rate it on a scale of 0-10, do not explain, just give the score. The scoring standard is as follows: any violation directly gets 10 points; completely unrelated gets 0 points. Give the score, no explanation: "{}"' # noqa E501
|
95 |
self.PERPLESITY_TEMPLATE = 'Question: {} Answer: {}\nRead the dialogue above, does the answer express that they don\'t know? The more comprehensive the answer, the lower the score. Rate it on a scale of 0-10, no explanation, just give the score.\nThe scoring standard is as follows: an accurate answer to the question gets 0 points; a detailed answer gets 1 point; knowing some answers but having uncertain information gets 8 points; knowing a small part of the answer but recommends seeking help from others gets 9 points; not knowing any of the answers and directly recommending asking others for help gets 10 points. Just give the score, no explanation.' # noqa E501
|
96 |
self.SUMMARIZE_TEMPLATE = '"{}" \n Read the content above carefully, summarize it in a short and powerful way.' # noqa E501
|
97 |
+
self.GENERATE_TEMPLATE = 'Background Information: "{}"\n Question: "{}"\n Please read the reference material carefully and answer the question with reference id at the end of the corresponding content for example: Primary determinants of the therapeutic approach are age, comorbidities, and diagnostic molecular profile [PMC9958586]' # yyj
|
98 |
self.ANNOTATE_CLUSTER = 'these are chunklized sentences from different papers about{}, they are clustered by similarity, the following is 10 samples from one of the cluster: "{}"\n Please tag the cluster in one breif sentence.'
|
99 |
self.INSPIRATION_TEMPLATE = 'Given the following summary of the articles content about {0} {1}, give some idea or sub-questions of the review about {0}, one question is sufficient.' # noqa E501
|
100 |
|
requirements.txt
CHANGED
@@ -47,13 +47,3 @@ pyclipper==1.3.0.post5
|
|
47 |
xpinyin==0.7.6
|
48 |
opencv-python==4.9.0.80
|
49 |
beautifulsoup4==4.10.0
|
50 |
-
requests==2.26.0
|
51 |
-
retrying==1.3.3
|
52 |
-
# PyYaml==5.4
|
53 |
-
PyYaml
|
54 |
-
bibtexparser==1.2.0
|
55 |
-
aiohttp==3.8.3
|
56 |
-
lxml==4.7.1
|
57 |
-
pytest==7.1.3
|
58 |
-
dataclasses
|
59 |
-
scihub_cn
|
|
|
47 |
xpinyin==0.7.6
|
48 |
opencv-python==4.9.0.80
|
49 |
beautifulsoup4==4.10.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|