Spaces:

Yijun-Yang
/

ReadReview

Runtime error

App Files Files Community

Yijun-Yang committed on Jun 12, 2024

Commit

92bcd1d

1 Parent(s): 78f8e89

updategradiofrontend

Browse files

Files changed (6) hide show

app.py +63 -39
applocal.py +63 -39
config.ini +5 -5
huixiangdou/service/findarticles.py +90 -31
huixiangdou/service/worker.py +3 -2
requirements.txt +0 -10

app.py CHANGED Viewed

@@ -167,21 +167,28 @@ def update_repo_info():
     if os.path.exists(repodir):
         pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
         number_of_pdf = len(pdffiles)
         if os.path.exists(os.path.join(repodir,'info.json')):
             with open(os.path.join(repodir,'info.json'), 'r') as f:
                 repo_info = json.load(f)
             keywords = repo_info['keywords']
-            length = repo_info['len']
             retmax = repo_info['retmax']
-            failed = repo_info['failed_pmids']
-            return keywords,length,retmax,failed,number_of_pdf
         else:
-            return None,None,None,None,number_of_pdf
     else:
-        return None,None,None,None,None
 def upload_file(files):
     repodir, workdir, _ = get_ready('repo_work')
@@ -196,12 +203,11 @@ def upload_file(files):
     return files
-def generate_articles_repo(strings:str,retmax:int):
-    string = [k.strip() for k in strings.split('\n')]
-    pmids = [k for k in string if k.isdigit()]
-    keys = [k for k in string if not k.isdigit()]
     repodir, _, _ = get_ready('repo_work')
@@ -225,15 +231,26 @@ def delete_articles_repo():
                       visible = True)
 def update_repo():
-    keys,len,retmax,failed,pdflen = update_repo_info()
-    if keys or len:
-        newinfo = f"搜索得到文献：\n    关键词：{keys}\n    文献数量：{len}\n    获取上限：{retmax}\n    失败PMID：{failed}\n\n上传文献：\n    数量：{pdflen}"
-    else:
         if pdflen:
-            newinfo = f'搜索得到文献：无\n上传文献：\n    数量：{pdflen}'
         else:
-            newinfo = '目前还没有文献库'
     return gr.Textbox(label="文献库概况",lines =1,
                       value = newinfo,
                       visible = True)
@@ -464,11 +481,12 @@ def main_interface():
             gr.Markdown("""
 #### 查找文献 📚
-1. **输入关键词批量PubMed PMC文献**
    - 在“感兴趣的关键词”框中输入您感兴趣的关键词，每行一个。
-   - 设置查找数量（0-1000）。
-   - 点击“搜索PubMed PMC”按钮进行文献查找。
 2. **上传PDF**
    - 通过“上传PDF”按钮上传您已有的PDF文献文件。
@@ -492,36 +510,43 @@ def main_interface():
 """)
             with gr.Row(equal_height=True):
                 with gr.Column(scale=1):
-                    input_keys = gr.Textbox(label="感兴趣的关键词",
-                                            value = "输入关键词或者PMID, 换行分隔",
                                                     lines = 5)
-                    retmax = gr.Slider(
-                            minimum=0,
-                            maximum=1000,
-                            value=500,
-                            interactive=True,
-                            label="查多少",
-                        )
-                    generate_repo_button = gr.Button("搜索PubMed PMC")
-                with gr.Column(scale=2):
                     file_output = gr.File(scale=2)
                     upload_button = gr.UploadButton("上传PDF",
-                                    file_types=[".pdf",".csv",".doc"],
-                                    file_count="multiple",scale=0)
             with gr.Row(equal_height=True):
                 with gr.Column(scale=0):
                     delete_repo_button = gr.Button("删除文献库")
                     update_repo_button = gr.Button("更新文献库情况")
                 with gr.Column(scale=2):
-                    repo_summary =gr.Textbox(label= '文献库概况', value="目前还没有文献库")
             generate_repo_button.click(generate_articles_repo,
-                                inputs=[input_keys,retmax],
                                 outputs = [repo_summary])
             delete_repo_button.click(delete_articles_repo, inputs=None,
                                 outputs = repo_summary)
             update_repo_button.click(update_repo, inputs=None,
@@ -535,7 +560,6 @@ def main_interface():
                                         minimum=128, maximum=4096,value=1024,step=1,
                                         interactive=True)
                 ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
-                                            # default=["20", "50", '100'],
                                             label="Number of Clusters",
                                             info="How many Clusters you want to generate")

     if os.path.exists(repodir):
         pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
         number_of_pdf = len(pdffiles)
+        # 判断info.json是否存在
         if os.path.exists(os.path.join(repodir,'info.json')):
             with open(os.path.join(repodir,'info.json'), 'r') as f:
                 repo_info = json.load(f)
             keywords = repo_info['keywords']
             retmax = repo_info['retmax']
+            search_len = len(repo_info['search_pmids'])
+            import_len = len(repo_info['import_pmids'])
+            failed_pmid_len = len(repo_info['failed_pmids'])
+            pmc_success = repo_info['pmc_success_d']
+            scihub_success = repo_info['scihub_success_d']
+            failed_download = repo_info['failed_download']
+            number_of_upload = number_of_pdf-scihub_success
+            return keywords, retmax, search_len, import_len, failed_pmid_len, pmc_success, scihub_success, number_of_pdf, failed_download, number_of_upload
         else:
+            return None,None,None,None,None,None,None,None,None,number_of_pdf
     else:
+        return None,None,None,None,None,None,None,None,None,None
 def upload_file(files):
     repodir, workdir, _ = get_ready('repo_work')
     return files
+def generate_articles_repo(keys:str,pmids,retmax:int):
+    keys = [k.strip() for k in keys.split('\n')]
+    pmids = [k.strip() for k in pmids.split('\n')]
+    pmids = [k for k in pmids if k.isdigit()]
     repodir, _, _ = get_ready('repo_work')
                       visible = True)
 def update_repo():
+    """Build the repository summary shown in the "文献库概况" textbox.
+
+    Reads the statistics tuple from update_repo_info() and renders it as
+    multi-line text.
+
+    Returns:
+        gr.Textbox: a visible textbox whose value is the summary text.
+    """
+    # Only the last field is reported as the uploaded-PDF count: it is
+    # number_of_upload when info.json exists, or the total PDF count when it
+    # does not. The 5th and 8th fields are not displayed.
+    keys, retmax, search_len, import_len, _, pmc_success, scihub_success, _, failed, pdflen = update_repo_info()
+    newinfo = ""
+    if keys is None:
+        newinfo += '无关键词搜索相关信息\n'
+        newinfo += '无导入的PMID\n'
         if pdflen:
+            newinfo += f'上传的PDF数量: {pdflen}\n'
         else:
+            newinfo += '无上传的PDF\n'
+    else:
+        # The header needs its own line break so the indented sub-items
+        # start on the following line.
+        newinfo += '关键词搜索:\n'
+        newinfo += f'   关键词: {keys}\n'
+        newinfo += f'   搜索上限: {retmax}\n'
+        newinfo += f'   搜索到的PMID数量: {search_len}\n'
+        newinfo += f'导入的PMID数量: {import_len}\n'
+        newinfo += f'成功获取PMC全文数量: {pmc_success}\n'
+        newinfo += f'成功获取SciHub全文数量: {scihub_success}\n'
+        newinfo += f"下载失败的ID: {failed}\n"
+        newinfo += f'上传的PDF数量: {pdflen}\n'
     return gr.Textbox(label="文献库概况",lines =1,
                       value = newinfo,
                       visible = True)
             gr.Markdown("""
 #### 查找文献 📚
+1. **输入关键词或PMID批量PubMed PMC文献**
    - 在“感兴趣的关键词”框中输入您感兴趣的关键词，每行一个。
+   - 设置查找数量（0-500）。
+   - 在“输入PMID”框中输入在PubMed中导出的PMID，每行一个。
+   - 点击“搜索PubMed 并拉取全文”按钮进行文献查找。目前主要基于PMC数据库和scihub, 在PMC中未收录的文献将使用scihub下载，scihub近年文献未收录
 2. **上传PDF**
    - 通过“上传PDF”按钮上传您已有的PDF文献文件。
 """)
             with gr.Row(equal_height=True):
                 with gr.Column(scale=1):
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            input_keys = gr.Textbox(label="感兴趣的关键词, 换行分隔, 不太好用别用等我改改",
                                                     lines = 5)
+                            retmax = gr.Slider(
+                                    minimum=0,
+                                    maximum=500,
+                                    value=250,
+                                    interactive=True,
+                                    label="搜索上限",
+                                    info="How many articles you want to retrieve?"
+                                )
+                        with gr.Column(scale=1):
+                            input_pmids = gr.Textbox(label="输入PMID, 换行分隔",
+                                                    lines = 5)
+                    generate_repo_button = gr.Button("搜索PubMed并拉取全文")
+                with gr.Column(scale=1):
                     file_output = gr.File(scale=2)
                     upload_button = gr.UploadButton("上传PDF",
+                                    file_types=[".pdf"],
+                                    file_count="multiple",scale=1)
             with gr.Row(equal_height=True):
                 with gr.Column(scale=0):
                     delete_repo_button = gr.Button("删除文献库")
                     update_repo_button = gr.Button("更新文献库情况")
                 with gr.Column(scale=2):
+                    repo_summary =gr.Textbox(label= '文献库概况',
+                                             value="目前还没有文献库")
             generate_repo_button.click(generate_articles_repo,
+                                inputs=[input_keys,input_pmids,retmax],
                                 outputs = [repo_summary])
             delete_repo_button.click(delete_articles_repo, inputs=None,
                                 outputs = repo_summary)
             update_repo_button.click(update_repo, inputs=None,
                                         minimum=128, maximum=4096,value=1024,step=1,
                                         interactive=True)
                 ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
                                             label="Number of Clusters",
                                             info="How many Clusters you want to generate")

applocal.py CHANGED Viewed

@@ -167,21 +167,28 @@ def update_repo_info():
     if os.path.exists(repodir):
         pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
         number_of_pdf = len(pdffiles)
         if os.path.exists(os.path.join(repodir,'info.json')):
             with open(os.path.join(repodir,'info.json'), 'r') as f:
                 repo_info = json.load(f)
             keywords = repo_info['keywords']
-            length = repo_info['len']
             retmax = repo_info['retmax']
-            failed = repo_info['failed_pmids']
-            return keywords,length,retmax,failed,number_of_pdf
         else:
-            return None,None,None,None,number_of_pdf
     else:
-        return None,None,None,None,None
 def upload_file(files):
     repodir, workdir, _ = get_ready('repo_work')
@@ -196,12 +203,11 @@ def upload_file(files):
     return files
-def generate_articles_repo(strings:str,retmax:int):
-    string = [k.strip() for k in strings.split('\n')]
-    pmids = [k for k in string if k.isdigit()]
-    keys = [k for k in string if not k.isdigit()]
     repodir, _, _ = get_ready('repo_work')
@@ -225,15 +231,26 @@ def delete_articles_repo():
                       visible = True)
 def update_repo():
-    keys,len,retmax,failed,pdflen = update_repo_info()
-    if keys or len:
-        newinfo = f"搜索得到文献：\n    关键词：{keys}\n    文献数量：{len}\n    获取上限：{retmax}\n    失败PMID：{failed}\n\n上传文献：\n    数量：{pdflen}"
-    else:
         if pdflen:
-            newinfo = f'搜索得到文献：无\n上传文献：\n    数量：{pdflen}'
         else:
-            newinfo = '目前还没有文献库'
     return gr.Textbox(label="文献库概况",lines =1,
                       value = newinfo,
                       visible = True)
@@ -464,11 +481,12 @@ def main_interface():
             gr.Markdown("""
 #### 查找文献 📚
-1. **输入关键词批量PubMed PMC文献**
    - 在“感兴趣的关键词”框中输入您感兴趣的关键词，每行一个。
-   - 设置查找数量（0-1000）。
-   - 点击“搜索PubMed PMC”按钮进行文献查找。
 2. **上传PDF**
    - 通过“上传PDF”按钮上传您已有的PDF文献文件。
@@ -492,36 +510,43 @@ def main_interface():
 """)
             with gr.Row(equal_height=True):
                 with gr.Column(scale=1):
-                    input_keys = gr.Textbox(label="感兴趣的关键词",
-                                            value = "输入关键词或者PMID, 换行分隔",
                                                     lines = 5)
-                    retmax = gr.Slider(
-                            minimum=0,
-                            maximum=1000,
-                            value=500,
-                            interactive=True,
-                            label="查多少",
-                        )
-                    generate_repo_button = gr.Button("搜索PubMed PMC")
-                with gr.Column(scale=2):
                     file_output = gr.File(scale=2)
                     upload_button = gr.UploadButton("上传PDF",
-                                    file_types=[".pdf",".csv",".doc"],
-                                    file_count="multiple",scale=0)
             with gr.Row(equal_height=True):
                 with gr.Column(scale=0):
                     delete_repo_button = gr.Button("删除文献库")
                     update_repo_button = gr.Button("更新文献库情况")
                 with gr.Column(scale=2):
-                    repo_summary =gr.Textbox(label= '文献库概况', value="目前还没有文献库")
             generate_repo_button.click(generate_articles_repo,
-                                inputs=[input_keys,retmax],
                                 outputs = [repo_summary])
             delete_repo_button.click(delete_articles_repo, inputs=None,
                                 outputs = repo_summary)
             update_repo_button.click(update_repo, inputs=None,
@@ -535,7 +560,6 @@ def main_interface():
                                         minimum=128, maximum=4096,value=1024,step=1,
                                         interactive=True)
                 ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
-                                            # default=["20", "50", '100'],
                                             label="Number of Clusters",
                                             info="How many Clusters you want to generate")

     if os.path.exists(repodir):
         pdffiles = glob.glob(os.path.join(repodir, '*.pdf'))
         number_of_pdf = len(pdffiles)
+        # 判断info.json是否存在
         if os.path.exists(os.path.join(repodir,'info.json')):
             with open(os.path.join(repodir,'info.json'), 'r') as f:
                 repo_info = json.load(f)
             keywords = repo_info['keywords']
             retmax = repo_info['retmax']
+            search_len = len(repo_info['search_pmids'])
+            import_len = len(repo_info['import_pmids'])
+            failed_pmid_len = len(repo_info['failed_pmids'])
+            pmc_success = repo_info['pmc_success_d']
+            scihub_success = repo_info['scihub_success_d']
+            failed_download = repo_info['failed_download']
+            number_of_upload = number_of_pdf-scihub_success
+            return keywords, retmax, search_len, import_len, failed_pmid_len, pmc_success, scihub_success, number_of_pdf, failed_download, number_of_upload
         else:
+            return None,None,None,None,None,None,None,None,None,number_of_pdf
     else:
+        return None,None,None,None,None,None,None,None,None,None
 def upload_file(files):
     repodir, workdir, _ = get_ready('repo_work')
     return files
+def generate_articles_repo(keys:str,pmids,retmax:int):
+    keys = [k.strip() for k in keys.split('\n')]
+    pmids = [k.strip() for k in pmids.split('\n')]
+    pmids = [k for k in pmids if k.isdigit()]
     repodir, _, _ = get_ready('repo_work')
                       visible = True)
 def update_repo():
+    """Build the repository summary shown in the "文献库概况" textbox.
+
+    Reads the statistics tuple from update_repo_info() and renders it as
+    multi-line text.
+
+    Returns:
+        gr.Textbox: a visible textbox whose value is the summary text.
+    """
+    # Only the last field is reported as the uploaded-PDF count: it is
+    # number_of_upload when info.json exists, or the total PDF count when it
+    # does not. The 5th and 8th fields are not displayed.
+    keys, retmax, search_len, import_len, _, pmc_success, scihub_success, _, failed, pdflen = update_repo_info()
+    newinfo = ""
+    if keys is None:
+        newinfo += '无关键词搜索相关信息\n'
+        newinfo += '无导入的PMID\n'
         if pdflen:
+            newinfo += f'上传的PDF数量: {pdflen}\n'
         else:
+            newinfo += '无上传的PDF\n'
+    else:
+        # The header needs its own line break so the indented sub-items
+        # start on the following line.
+        newinfo += '关键词搜索:\n'
+        newinfo += f'   关键词: {keys}\n'
+        newinfo += f'   搜索上限: {retmax}\n'
+        newinfo += f'   搜索到的PMID数量: {search_len}\n'
+        newinfo += f'导入的PMID数量: {import_len}\n'
+        newinfo += f'成功获取PMC全文数量: {pmc_success}\n'
+        newinfo += f'成功获取SciHub全文数量: {scihub_success}\n'
+        newinfo += f"下载失败的ID: {failed}\n"
+        newinfo += f'上传的PDF数量: {pdflen}\n'
     return gr.Textbox(label="文献库概况",lines =1,
                       value = newinfo,
                       visible = True)
             gr.Markdown("""
 #### 查找文献 📚
+1. **输入关键词或PMID批量PubMed PMC文献**
    - 在“感兴趣的关键词”框中输入您感兴趣的关键词，每行一个。
+   - 设置查找数量（0-500）。
+   - 在“输入PMID”框中输入在PubMed中导出的PMID，每行一个。
+   - 点击“搜索PubMed 并拉取全文”按钮进行文献查找。目前主要基于PMC数据库和scihub, 在PMC中未收录的文献将使用scihub下载，scihub近年文献未收录
 2. **上传PDF**
    - 通过“上传PDF”按钮上传您已有的PDF文献文件。
 """)
             with gr.Row(equal_height=True):
                 with gr.Column(scale=1):
+                    with gr.Row():
+                        with gr.Column(scale=1):
+                            input_keys = gr.Textbox(label="感兴趣的关键词, 换行分隔, 不太好用别用等我改改",
                                                     lines = 5)
+                            retmax = gr.Slider(
+                                    minimum=0,
+                                    maximum=500,
+                                    value=250,
+                                    interactive=True,
+                                    label="搜索上限",
+                                    info="How many articles you want to retrieve?"
+                                )
+                        with gr.Column(scale=1):
+                            input_pmids = gr.Textbox(label="输入PMID, 换行分隔",
+                                                    lines = 5)
+                    generate_repo_button = gr.Button("搜索PubMed并拉取全文")
+                with gr.Column(scale=1):
                     file_output = gr.File(scale=2)
                     upload_button = gr.UploadButton("上传PDF",
+                                    file_types=[".pdf"],
+                                    file_count="multiple",scale=1)
             with gr.Row(equal_height=True):
                 with gr.Column(scale=0):
                     delete_repo_button = gr.Button("删除文献库")
                     update_repo_button = gr.Button("更新文献库情况")
                 with gr.Column(scale=2):
+                    repo_summary =gr.Textbox(label= '文献库概况',
+                                             value="目前还没有文献库")
             generate_repo_button.click(generate_articles_repo,
+                                inputs=[input_keys,input_pmids,retmax],
                                 outputs = [repo_summary])
             delete_repo_button.click(delete_articles_repo, inputs=None,
                                 outputs = repo_summary)
             update_repo_button.click(update_repo, inputs=None,
                                         minimum=128, maximum=4096,value=1024,step=1,
                                         interactive=True)
                 ncluster = gr.CheckboxGroup(["10", "20", "50", '100','200','500','1000'],
                                             label="Number of Clusters",
                                             info="How many Clusters you want to generate")

config.ini CHANGED Viewed

@@ -4,7 +4,7 @@ embedding_model_path = "/root/models/bce-embedding-base_v1"
 reranker_model_path = "/root/models/bce-reranker-base_v1"
 repo_dir = "repodir"
 work_dir = "workdir"
-n_clusters = [20, 50]
 chunk_size = 1024
 [web_search]
@@ -13,7 +13,7 @@ domain_partial_order = ["openai.com", "pytorch.org", "readthedocs.io", "nvidia.c
 save_dir = "logs/web_search_result"
 [llm]
-enable_local = 0
 enable_remote = 1
 client_url = "http://127.0.0.1:8888/inference"
@@ -21,11 +21,11 @@ client_url = "http://127.0.0.1:8888/inference"
 local_llm_path = "/root/models/Qwen1.5-7B-Chat"
 local_llm_max_text_length = 32000
 local_llm_bind_port = 8888
-remote_type = "deepseek"
-remote_api_key = "sk-f36f5336010841399abccdfeb6bd1f54"
 remote_base_url = ""
 remote_llm_max_text_length = 32000
-remote_llm_model = "deepseek-chat"
 rpm = 500
 [worker]

 reranker_model_path = "/root/models/bce-reranker-base_v1"
 repo_dir = "repodir"
 work_dir = "workdir"
+n_clusters = [10, 20]
 chunk_size = 1024
 [web_search]
 save_dir = "logs/web_search_result"
 [llm]
+enable_local = 1
 enable_remote = 1
 client_url = "http://127.0.0.1:8888/inference"
 local_llm_path = "/root/models/Qwen1.5-7B-Chat"
 local_llm_max_text_length = 32000
 local_llm_bind_port = 8888
+remote_type = ""
+remote_api_key = ""
 remote_base_url = ""
 remote_llm_max_text_length = 32000
+remote_llm_model = ""
 rpm = 500
 [worker]

huixiangdou/service/findarticles.py CHANGED Viewed

@@ -7,19 +7,56 @@ import json
 import shutil
 from loguru import logger
 from lxml import etree
-import sys
-from scihub_cn.scihub import main
-def scihub_download(doi_file_Path = None,doi = None,output_dir = None):
-    args = ["scihub-cn"]  # This is the program name as expected in argv[0]
-    if doi is not None:
-        args.extend(["-d", doi])
-    if doi_file_Path is not None:
-        args.extend(["-i", doi_file_Path, "--doi"])
-    if output_dir is not None:
-        args.extend(["-o", output_dir])
-    sys.argv = args  # Set sys.argv to our simulated command line arguments
-    sys.exit(main())
 class ArticleRetrieval:
@@ -83,11 +120,15 @@ class ArticleRetrieval:
         response = requests.get(base_url, params=params)
         root = ET.fromstring(response.content)
         idlist = root.find('.//IdList')
-        pmids = [id_element.text for id_element in idlist.findall('.//Id')]
         print(f"Found {len(pmids)} articles for keywords {self.keywords}.")
         self.pmids.extend(pmids)
     # 解析XML文件
     def _get_all_text(self, element):
         """递归获取XML元素及其所有子元素的文本内容。确保element不为None."""
@@ -115,8 +156,16 @@ class ArticleRetrieval:
         if not os.path.exists(self.repo_dir):
             os.makedirs(self.repo_dir)
         print(f"Saving articles to {self.repo_dir}.")
-        self.success = 0
         for id in tqdm(self.pmc_ids, desc="Fetching full texts", unit="article"):
             base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
             params = {
                 "db": "pmc",
@@ -127,20 +176,33 @@ class ArticleRetrieval:
             response = requests.get(base_url, params=params)
             full_text = self._clean_xml(response.text)
             if full_text.strip() == '':
                 continue
             else:
-                logger.info(full_text[:500])
                 with open(os.path.join(self.repo_dir,f'{id}.txt'), 'w') as f:
                     f.write(full_text)
-                self.success += 1
         for doi in tqdm(self.scihub_doi, desc="Fetching full texts", unit="article"):
-            scihub_download(doi = doi,output_dir=self.repo_dir)
-            self.success += 1
     def save_config(self):
         config = {
-            'keywords': self.keywords,
             'repo_dir': self.repo_dir,
             'result': [
                 {
                     'pmid': r[0],
@@ -148,9 +210,10 @@ class ArticleRetrieval:
                     'doi': r[2]
                 } for r in self.esummary
             ],
-            'len': self.success,
-            'retmax': self.retmax,
-            'failed_pmids': self.failed_pmids
         }
         with open(os.path.join(self.repo_dir, 'info.json'), 'w') as f:
             json.dump(config, f, indent=4, ensure_ascii=False)
@@ -169,14 +232,10 @@ if __name__ == '__main__':
         shutil.rmtree('repodir')
     strings = """
-36944324
-38453907
-38300432
-38651453
-38398096
-38255885
-38035547
-38734498"""
     string = [k.strip() for k in strings.split('\n')]
     pmids = [k for k in string if k.isdigit()]

 import shutil
 from loguru import logger
 from lxml import etree
+import requests
+from bs4 import BeautifulSoup
+import os
+def download_pdfs(path, doi_list, timeout=30): #fox dalao contribution https://github.com/BigWhiteFox
+    """Download PDFs for one or more DOIs from sci-hub.se into ``path``.
+
+    Args:
+        path: Target directory; created if it does not exist.
+        doi_list: A single DOI string or an iterable of DOI strings.
+        timeout: Seconds to wait on each HTTP request before giving up.
+
+    Returns:
+        bool: True if at least one PDF was written to ``path``, otherwise
+        False. Callers rely on the truthiness of the result to count a DOI
+        as downloaded or failed.
+    """
+    # Make sure the download directory exists.
+    os.makedirs(path, exist_ok=True)
+    if isinstance(doi_list, str):
+        doi_list = [doi_list]
+    href_list = []
+    for doi in doi_list:
+        url = f"https://sci-hub.se/{doi}"
+        try:
+            response = requests.get(url, timeout=timeout)
+        except requests.RequestException as e:
+            # A network error for one DOI must not abort the whole batch.
+            print(f"Failed to download due to an exception: {e}")
+            continue
+        # Check whether the request succeeded.
+        if response.status_code == 200:
+            print(f"成功请求：{url}")
+        else:
+            print(f"请求失败：{url}，状态码：{response.status_code}")
+            continue  # Skip this DOI when the page request failed.
+        soup = BeautifulSoup(response.text, 'html.parser')
+        buttons = soup.find_all('button', onclick=True)
+        for button in buttons:
+            onclick = button.get('onclick')
+            # The PDF location is taken from the first single-quoted value
+            # in the onclick handler; handlers without one are ignored.
+            parts = onclick.split("'") if onclick else []
+            if len(parts) > 1:
+                pdf_url = parts[1]
+                href_list.append((pdf_url, doi))
+                print("pdf_url:", pdf_url)
+        print("href_list:", href_list)
+    downloaded = False
+    # Fetch every collected URL.
+    for href, doi in href_list:
+        # NOTE(review): assumes href is protocol-relative ("//host/...") - confirm.
+        pdf_url = f"https:{href}"
+        try:
+            response = requests.get(pdf_url, stream=True, timeout=timeout)
+            if response.status_code == 200:
+                filename = doi.replace("/", "_") + ".pdf"
+                file_path = os.path.join(path, filename)
+                with open(file_path, 'wb') as f:
+                    for chunk in response.iter_content(chunk_size=8192):
+                        f.write(chunk)
+                print(f"File downloaded and saved as: {file_path}")
+                downloaded = True
+            else:
+                print(f"Download failed, Status Code: {response.status_code}, URL: {pdf_url}")
+        except requests.RequestException as e:
+            print(f"Failed to download due to an exception: {e}")
+    return downloaded
 class ArticleRetrieval:
         response = requests.get(base_url, params=params)
         root = ET.fromstring(response.content)
         idlist = root.find('.//IdList')
+        try:
+            pmids = [id_element.text for id_element in idlist.findall('.//Id')]
+        except:
+            pmids = []
         print(f"Found {len(pmids)} articles for keywords {self.keywords}.")
+        self.search_pmid = pmids
         self.pmids.extend(pmids)
     # 解析XML文件
     def _get_all_text(self, element):
         """递归获取XML元素及其所有子元素的文本内容。确保element不为None."""
         if not os.path.exists(self.repo_dir):
             os.makedirs(self.repo_dir)
         print(f"Saving articles to {self.repo_dir}.")
+        self.pmc_success = 0
+        self.scihub_success = 0
+        self.failed_download = []
+        downloaded = os.listdir(self.repo_dir)
         for id in tqdm(self.pmc_ids, desc="Fetching full texts", unit="article"):
+            # check if file already downloaded
+            if f"{id}.txt" in downloaded:
+                print(f"File already downloaded: {id}")
+                self.pmc_success += 1
+                continue
             base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
             params = {
                 "db": "pmc",
             response = requests.get(base_url, params=params)
             full_text = self._clean_xml(response.text)
             if full_text.strip() == '':
+                self.failed_download.append(id)
                 continue
             else:
+                logger.info(full_text[:200])
                 with open(os.path.join(self.repo_dir,f'{id}.txt'), 'w') as f:
                     f.write(full_text)
+                self.pmc_success += 1
         for doi in tqdm(self.scihub_doi, desc="Fetching full texts", unit="article"):
+            # check if file already downloaded
+            if f"{doi.replace('/','_')}.pdf" in downloaded:
+                print(f"File already downloaded: {doi}")
+                self.scihub_success += 1
+                continue
+            if download_pdfs(path=self.repo_dir,doi_list = doi):
+                self.scihub_success += 1
+            else:
+                self.failed_download.append(doi)
     def save_config(self):
         config = {
             'repo_dir': self.repo_dir,
+            'keywords': self.keywords,
+            'retmax': self.retmax,
+            "search_pmids": self.search_pmid,
+            'import_pmids': [id for id in self.pmids if id not in self.search_pmid],
+            'failed_pmids': self.failed_pmids,
             'result': [
                 {
                     'pmid': r[0],
                     'doi': r[2]
                 } for r in self.esummary
             ],
+            "pmc_success_d": self.pmc_success,
+            "scihub_success_d": self.scihub_success,
+            "failed_download": self.failed_download,
         }
         with open(os.path.join(self.repo_dir, 'info.json'), 'w') as f:
             json.dump(config, f, indent=4, ensure_ascii=False)
         shutil.rmtree('repodir')
     strings = """
+34536239
+7760895
+36109602
+24766875"""
     string = [k.strip() for k in strings.split('\n')]
     pmids = [k for k in string if k.isdigit()]

huixiangdou/service/worker.py CHANGED Viewed

@@ -25,6 +25,7 @@ def convertid2url(text):
     # Replace all occurrences in the text
     formatted_text = re.sub(pattern, replacement, text)
     return formatted_text
 class Worker:
     """The Worker class orchestrates the logic of handling user queries,
     generating responses and managing several aspects of a chat assistant. It
@@ -82,7 +83,7 @@ class Worker:
             self.PERPLESITY_TEMPLATE = '“question:{} answer:{}”\n阅读以上对话，answer 是否在表达自己不知道，回答越全面得分越少，用0～10表示，不要解释直接给出得分。\n判断标准：准确回答问题得 0 分；答案详尽得 1 分；知道部分答案但有不确定信息得 8 分；知道小部分答案但推荐求助其他人得 9 分；不知道任何答案直接推荐求助别人得 10 分。直接打分不要解释。'  # noqa E501
             self.SUMMARIZE_TEMPLATE = '{} \n 仔细阅读以上内容，总结得简短有力点'  # noqa E501
             self.GENERATE_TEMPLATE = '材料：“{}”\n 问题：“{}” \n 请仔细阅读参考材料回答问题，材料可能和问题无关。如果材料和问题无关，尝试用你自己的理解来回答问题。如果无法确定答案，直接回答不知道。'  # noqa E501
-            self.GENERATE_TEMPLATE = '材料：“{}”\n 问题：“{}” \n 请仔细阅读参考材料回答问题。'  # noqa E501
             self.ANNOTATE_CLUSTER = '这是关于{}的不同论文的分块句子，它们通过相似性进行了聚类，以下是其中一个聚类的10个样本：“{}”\n 请用一句话标注这个聚类。'  # noqa E501
             self.INSPIRATION_TEMPLATE = '以下是一些有关{0}的文章内容的总结 {1}，请提出一个关于{0}的综述子问题，一个问题即可。'
         else:
@@ -93,7 +94,7 @@ class Worker:
             self.SECURITY_TEMAPLTE = 'Evaluate whether the following sentence involves prohibited content such as politics, insult, pornography, terror, religion, cyber violence, racial discrimination, etc., rate it on a scale of 0-10, do not explain, just give the score. The scoring standard is as follows: any violation directly gets 10 points; completely unrelated gets 0 points. Give the score, no explanation: "{}"'  # noqa E501
             self.PERPLESITY_TEMPLATE = 'Question: {} Answer: {}\nRead the dialogue above, does the answer express that they don\'t know? The more comprehensive the answer, the lower the score. Rate it on a scale of 0-10, no explanation, just give the score.\nThe scoring standard is as follows: an accurate answer to the question gets 0 points; a detailed answer gets 1 point; knowing some answers but having uncertain information gets 8 points; knowing a small part of the answer but recommends seeking help from others gets 9 points; not knowing any of the answers and directly recommending asking others for help gets 10 points. Just give the score, no explanation.'  # noqa E501
             self.SUMMARIZE_TEMPLATE = '"{}" \n Read the content above carefully, summarize it in a short and powerful way.'  # noqa E501
-            self.GENERATE_TEMPLATE = 'Background Information: "{}"\n Question: "{}"\n Please read the reference material carefully and answer the question.  with reference id at the end of the corresponding content for example:  Primary determinants of the therapeutic approach are age, comorbidities, and diagnostic molecular profile [PMC9958584]'  # noqa E501
             self.ANNOTATE_CLUSTER = 'these are chunklized sentences from different papers about{}, they are clustered by similarity, the following is 10 samples from one of the cluster: "{}"\n Please tag the cluster in one breif sentence.'
             self.INSPIRATION_TEMPLATE = 'Given the following summary of the articles content about {0} {1}, give some idea or sub-questions of the review about {0}, one question is sufficient.'  # noqa E501

     # Replace all occurrences in the text
     formatted_text = re.sub(pattern, replacement, text)
     return formatted_text
 class Worker:
     """The Worker class orchestrates the logic of handling user queries,
     generating responses and managing several aspects of a chat assistant. It
             self.PERPLESITY_TEMPLATE = '“question:{} answer:{}”\n阅读以上对话，answer 是否在表达自己不知道，回答越全面得分越少，用0～10表示，不要解释直接给出得分。\n判断标准：准确回答问题得 0 分；答案详尽得 1 分；知道部分答案但有不确定信息得 8 分；知道小部分答案但推荐求助其他人得 9 分；不知道任何答案直接推荐求助别人得 10 分。直接打分不要解释。'  # noqa E501
             self.SUMMARIZE_TEMPLATE = '{} \n 仔细阅读以上内容，总结得简短有力点'  # noqa E501
             self.GENERATE_TEMPLATE = '材料：“{}”\n 问题：“{}” \n 请仔细阅读参考材料回答问题，材料可能和问题无关。如果材料和问题无关，尝试用你自己的理解来回答问题。如果无法确定答案，直接回答不知道。'  # noqa E501
+            self.GENERATE_TEMPLATE = '材料：“{}”\n 问题：“{}” \n 请仔细阅读参考材料回答问题，回答中附上对应内容的参考id，例如：治疗方法的主要决定因素是年龄、合并症和诊断分子特征[PMC9958586]'  # yyj
             self.ANNOTATE_CLUSTER = '这是关于{}的不同论文的分块句子，它们通过相似性进行了聚类，以下是其中一个聚类的10个样本：“{}”\n 请用一句话标注这个聚类。'  # noqa E501
             self.INSPIRATION_TEMPLATE = '以下是一些有关{0}的文章内容的总结 {1}，请提出一个关于{0}的综述子问题，一个问题即可。'
         else:
             self.SECURITY_TEMAPLTE = 'Evaluate whether the following sentence involves prohibited content such as politics, insult, pornography, terror, religion, cyber violence, racial discrimination, etc., rate it on a scale of 0-10, do not explain, just give the score. The scoring standard is as follows: any violation directly gets 10 points; completely unrelated gets 0 points. Give the score, no explanation: "{}"'  # noqa E501
             self.PERPLESITY_TEMPLATE = 'Question: {} Answer: {}\nRead the dialogue above, does the answer express that they don\'t know? The more comprehensive the answer, the lower the score. Rate it on a scale of 0-10, no explanation, just give the score.\nThe scoring standard is as follows: an accurate answer to the question gets 0 points; a detailed answer gets 1 point; knowing some answers but having uncertain information gets 8 points; knowing a small part of the answer but recommends seeking help from others gets 9 points; not knowing any of the answers and directly recommending asking others for help gets 10 points. Just give the score, no explanation.'  # noqa E501
             self.SUMMARIZE_TEMPLATE = '"{}" \n Read the content above carefully, summarize it in a short and powerful way.'  # noqa E501
+            self.GENERATE_TEMPLATE = 'Background Information: "{}"\n Question: "{}"\n Please read the reference material carefully and answer the question with reference id at the end of the corresponding content for example:  Primary determinants of the therapeutic approach are age, comorbidities, and diagnostic molecular profile [PMC9958586]'  # yyj
             self.ANNOTATE_CLUSTER = 'these are chunklized sentences from different papers about{}, they are clustered by similarity, the following is 10 samples from one of the cluster: "{}"\n Please tag the cluster in one breif sentence.'
             self.INSPIRATION_TEMPLATE = 'Given the following summary of the articles content about {0} {1}, give some idea or sub-questions of the review about {0}, one question is sufficient.'  # noqa E501

requirements.txt CHANGED Viewed

@@ -47,13 +47,3 @@ pyclipper==1.3.0.post5
 xpinyin==0.7.6
 opencv-python==4.9.0.80
 beautifulsoup4==4.10.0
-requests==2.26.0
-retrying==1.3.3
-# PyYaml==5.4
-PyYaml
-bibtexparser==1.2.0
-aiohttp==3.8.3
-lxml==4.7.1
-pytest==7.1.3
-dataclasses
-scihub_cn

 xpinyin==0.7.6
 opencv-python==4.9.0.80
 beautifulsoup4==4.10.0