Yingfeng
		
	committed on
		
		
					Commit 
							
							·
						
						f23a141
	
1
								Parent(s):
							
							f05a941
								
Synchronize with enterprise version (#4325)
Browse files

### Type of change
- [x] Refactoring
- agent/templates/customer_service.json +3 -3
- rag/app/knowledge_graph.py +2 -2
- rag/app/manual.py +2 -2
- rag/llm/chat_model.py +1 -1
- rag/llm/tts_model.py +27 -2
- rag/svr/cache_file_svr.py +59 -59
    	
        agent/templates/customer_service.json
    CHANGED
    
    | @@ -336,7 +336,7 @@ | |
| 336 | 
             
                            "parameters": [],
         | 
| 337 | 
             
                            "presencePenaltyEnabled": true,
         | 
| 338 | 
             
                            "presence_penalty": 0.4,
         | 
| 339 | 
            -
                            "prompt": "Role: You are a customer support.  \n\nTask: Please answer the question based on content of knowledge base. \n\ | 
| 340 | 
             
                            "temperature": 0.1,
         | 
| 341 | 
             
                            "temperatureEnabled": true,
         | 
| 342 | 
             
                            "topPEnabled": true,
         | 
| @@ -603,7 +603,7 @@ | |
| 603 | 
             
                      {
         | 
| 604 | 
             
                        "data": {
         | 
| 605 | 
             
                          "form": {
         | 
| 606 | 
            -
                            "text": "Static messages.\nDefine  | 
| 607 | 
             
                          },
         | 
| 608 | 
             
                          "label": "Note",
         | 
| 609 | 
             
                          "name": "N: What else?"
         | 
| @@ -691,7 +691,7 @@ | |
| 691 | 
             
                      {
         | 
| 692 | 
             
                        "data": {
         | 
| 693 | 
             
                          "form": {
         | 
| 694 | 
            -
                            "text": "Complete questions by conversation history.\nUser: What's RAGFlow?\nAssistant: RAGFlow is xxx.\nUser: How to  | 
| 695 | 
             
                          },
         | 
| 696 | 
             
                          "label": "Note",
         | 
| 697 | 
             
                          "name": "N: Refine Question"
         | 
|  | |
| 336 | 
             
                            "parameters": [],
         | 
| 337 | 
             
                            "presencePenaltyEnabled": true,
         | 
| 338 | 
             
                            "presence_penalty": 0.4,
         | 
| 339 | 
            +
                            "prompt": "Role: You are a customer support.  \n\nTask: Please answer the question based on content of knowledge base. \n\nRequirements & restrictions:\n  -  DO NOT make things up when all knowledge base content is irrelevant to the question. \n  - Answers need to consider chat history.\n  - Request about customer's contact information like, Wechat number, LINE number, twitter, discord, etc,. , when knowledge base content can't answer his question. So,  product expert could contact him soon to solve his problem.\n\n      Knowledge base content is as following:\n      {input}\n      The above is the content of knowledge base.",
         | 
| 340 | 
             
                            "temperature": 0.1,
         | 
| 341 | 
             
                            "temperatureEnabled": true,
         | 
| 342 | 
             
                            "topPEnabled": true,
         | 
|  | |
| 603 | 
             
                      {
         | 
| 604 | 
             
                        "data": {
         | 
| 605 | 
             
                          "form": {
         | 
| 606 | 
            +
                            "text": "Static messages.\nDefine response after receive user's contact information."
         | 
| 607 | 
             
                          },
         | 
| 608 | 
             
                          "label": "Note",
         | 
| 609 | 
             
                          "name": "N: What else?"
         | 
|  | |
| 691 | 
             
                      {
         | 
| 692 | 
             
                        "data": {
         | 
| 693 | 
             
                          "form": {
         | 
| 694 | 
            +
                            "text": "Complete questions by conversation history.\nUser: What's RAGFlow?\nAssistant: RAGFlow is xxx.\nUser: How to deploy it?\n\nRefine it: How to deploy RAGFlow?"
         | 
| 695 | 
             
                          },
         | 
| 696 | 
             
                          "label": "Note",
         | 
| 697 | 
             
                          "name": "N: Refine Question"
         | 
    	
        rag/app/knowledge_graph.py
    CHANGED
    
    | @@ -9,7 +9,7 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000, | |
| 9 | 
             
                      lang="Chinese", callback=None, **kwargs):
         | 
| 10 | 
             
                parser_config = kwargs.get(
         | 
| 11 | 
             
                    "parser_config", {
         | 
| 12 | 
            -
                        "chunk_token_num": 512, "delimiter": "\n | 
| 13 | 
             
                eng = lang.lower() == "english"
         | 
| 14 |  | 
| 15 | 
             
                parser_config["layout_recognize"] = True
         | 
| @@ -29,4 +29,4 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000, | |
| 29 | 
             
                doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
         | 
| 30 | 
             
                chunks.extend(tokenize_chunks(sections, doc, eng))
         | 
| 31 |  | 
| 32 | 
            -
                return chunks
         | 
|  | |
| 9 | 
             
                      lang="Chinese", callback=None, **kwargs):
         | 
| 10 | 
             
                parser_config = kwargs.get(
         | 
| 11 | 
             
                    "parser_config", {
         | 
| 12 | 
            +
                        "chunk_token_num": 512, "delimiter": "\n!?;。;!?", "layout_recognize": True})
         | 
| 13 | 
             
                eng = lang.lower() == "english"
         | 
| 14 |  | 
| 15 | 
             
                parser_config["layout_recognize"] = True
         | 
|  | |
| 29 | 
             
                doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
         | 
| 30 | 
             
                chunks.extend(tokenize_chunks(sections, doc, eng))
         | 
| 31 |  | 
| 32 | 
            +
                return chunks
         | 
    	
        rag/app/manual.py
    CHANGED
    
    | @@ -256,7 +256,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, | |
| 256 | 
             
                    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
         | 
| 257 | 
             
                    return res
         | 
| 258 |  | 
| 259 | 
            -
                elif re.search(r"\.docx | 
| 260 | 
             
                    docx_parser = Docx()
         | 
| 261 | 
             
                    ti_list, tbls = docx_parser(filename, binary,
         | 
| 262 | 
             
                                                from_page=0, to_page=10000, callback=callback)
         | 
| @@ -279,4 +279,4 @@ if __name__ == "__main__": | |
| 279 | 
             
                    pass
         | 
| 280 |  | 
| 281 |  | 
| 282 | 
            -
                chunk(sys.argv[1], callback=dummy)
         | 
|  | |
| 256 | 
             
                    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
         | 
| 257 | 
             
                    return res
         | 
| 258 |  | 
| 259 | 
            +
                elif re.search(r"\.docx?$", filename, re.IGNORECASE):
         | 
| 260 | 
             
                    docx_parser = Docx()
         | 
| 261 | 
             
                    ti_list, tbls = docx_parser(filename, binary,
         | 
| 262 | 
             
                                                from_page=0, to_page=10000, callback=callback)
         | 
|  | |
| 279 | 
             
                    pass
         | 
| 280 |  | 
| 281 |  | 
| 282 | 
            +
                chunk(sys.argv[1], callback=dummy)
         | 
    	
        rag/llm/chat_model.py
    CHANGED
    
    | @@ -24,7 +24,6 @@ import openai | |
| 24 | 
             
            from ollama import Client
         | 
| 25 | 
             
            from rag.nlp import is_chinese, is_english
         | 
| 26 | 
             
            from rag.utils import num_tokens_from_string
         | 
| 27 | 
            -
            from groq import Groq
         | 
| 28 | 
             
            import os
         | 
| 29 | 
             
            import json
         | 
| 30 | 
             
            import requests
         | 
| @@ -840,6 +839,7 @@ class GeminiChat(Base): | |
| 840 |  | 
| 841 | 
             
            class GroqChat:
         | 
| 842 | 
             
                def __init__(self, key, model_name, base_url=''):
         | 
|  | |
| 843 | 
             
                    self.client = Groq(api_key=key)
         | 
| 844 | 
             
                    self.model_name = model_name
         | 
| 845 |  | 
|  | |
| 24 | 
             
            from ollama import Client
         | 
| 25 | 
             
            from rag.nlp import is_chinese, is_english
         | 
| 26 | 
             
            from rag.utils import num_tokens_from_string
         | 
|  | |
| 27 | 
             
            import os
         | 
| 28 | 
             
            import json
         | 
| 29 | 
             
            import requests
         | 
|  | |
| 839 |  | 
| 840 | 
             
            class GroqChat:
         | 
| 841 | 
             
                def __init__(self, key, model_name, base_url=''):
         | 
| 842 | 
            +
                    from groq import Groq
         | 
| 843 | 
             
                    self.client = Groq(api_key=key)
         | 
| 844 | 
             
                    self.model_name = model_name
         | 
| 845 |  | 
    	
        rag/llm/tts_model.py
    CHANGED
    
    | @@ -299,8 +299,6 @@ class SparkTTS: | |
| 299 | 
             
                        yield audio_chunk
         | 
| 300 |  | 
| 301 |  | 
| 302 | 
            -
             | 
| 303 | 
            -
             | 
| 304 | 
             
            class XinferenceTTS:
         | 
| 305 | 
             
                def __init__(self, key, model_name, **kwargs):
         | 
| 306 | 
             
                    self.base_url = kwargs.get("base_url", None)
         | 
| @@ -330,3 +328,30 @@ class XinferenceTTS: | |
| 330 | 
             
                    for chunk in response.iter_content(chunk_size=1024):
         | 
| 331 | 
             
                        if chunk:
         | 
| 332 | 
             
                            yield chunk
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 299 | 
             
                        yield audio_chunk
         | 
| 300 |  | 
| 301 |  | 
|  | |
|  | |
| 302 | 
             
            class XinferenceTTS:
         | 
| 303 | 
             
                def __init__(self, key, model_name, **kwargs):
         | 
| 304 | 
             
                    self.base_url = kwargs.get("base_url", None)
         | 
|  | |
| 328 | 
             
                    for chunk in response.iter_content(chunk_size=1024):
         | 
| 329 | 
             
                        if chunk:
         | 
| 330 | 
             
                            yield chunk
         | 
| 331 | 
            +
             | 
| 332 | 
            +
             | 
| 333 | 
            +
            class OllamaTTS(Base):
         | 
| 334 | 
            +
                def __init__(self, key, model_name="ollama-tts", base_url="https://api.ollama.ai/v1"):
         | 
| 335 | 
            +
                    if not base_url: 
         | 
| 336 | 
            +
                        base_url = "https://api.ollama.ai/v1"
         | 
| 337 | 
            +
                    self.model_name = model_name
         | 
| 338 | 
            +
                    self.base_url = base_url
         | 
| 339 | 
            +
                    self.headers = {
         | 
| 340 | 
            +
                        "Content-Type": "application/json"
         | 
| 341 | 
            +
                    }
         | 
| 342 | 
            +
             | 
| 343 | 
            +
                def tts(self, text, voice="standard-voice"):
         | 
| 344 | 
            +
                    payload = {
         | 
| 345 | 
            +
                        "model": self.model_name,
         | 
| 346 | 
            +
                        "voice": voice,
         | 
| 347 | 
            +
                        "input": text
         | 
| 348 | 
            +
                    }
         | 
| 349 | 
            +
             | 
| 350 | 
            +
                    response = requests.post(f"{self.base_url}/audio/tts", headers=self.headers, json=payload, stream=True)
         | 
| 351 | 
            +
             | 
| 352 | 
            +
                    if response.status_code != 200:
         | 
| 353 | 
            +
                        raise Exception(f"**Error**: {response.status_code}, {response.text}")
         | 
| 354 | 
            +
             | 
| 355 | 
            +
                    for chunk in response.iter_content():
         | 
| 356 | 
            +
                        if chunk:
         | 
| 357 | 
            +
                            yield chunk
         | 
    	
        rag/svr/cache_file_svr.py
    CHANGED
    
    | @@ -1,60 +1,60 @@ | |
| 1 | 
            -
            #
         | 
| 2 | 
            -
            #  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
         | 
| 3 | 
            -
            #
         | 
| 4 | 
            -
            #  Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 5 | 
            -
            #  you may not use this file except in compliance with the License.
         | 
| 6 | 
            -
            #  You may obtain a copy of the License at
         | 
| 7 | 
            -
            #
         | 
| 8 | 
            -
            #      http://www.apache.org/licenses/LICENSE-2.0
         | 
| 9 | 
            -
            #
         | 
| 10 | 
            -
            #  Unless required by applicable law or agreed to in writing, software
         | 
| 11 | 
            -
            #  distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 12 | 
            -
            #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 13 | 
            -
            #  See the License for the specific language governing permissions and
         | 
| 14 | 
            -
            #  limitations under the License.
         | 
| 15 | 
            -
            #
         | 
| 16 | 
            -
            import logging
         | 
| 17 | 
            -
            import time
         | 
| 18 | 
            -
            import traceback
         | 
| 19 | 
            -
             | 
| 20 | 
            -
            from api.db.db_models import close_connection
         | 
| 21 | 
            -
            from api.db.services.task_service import TaskService
         | 
| 22 | 
            -
            from rag.utils. | 
| 23 | 
            -
            from rag.utils.redis_conn import REDIS_CONN
         | 
| 24 | 
            -
             | 
| 25 | 
            -
             | 
| 26 | 
            -
            def collect():
         | 
| 27 | 
            -
                doc_locations = TaskService.get_ongoing_doc_name()
         | 
| 28 | 
            -
                logging.debug(doc_locations)
         | 
| 29 | 
            -
                if len(doc_locations) == 0:
         | 
| 30 | 
            -
                    time.sleep(1)
         | 
| 31 | 
            -
                    return
         | 
| 32 | 
            -
                return doc_locations
         | 
| 33 | 
            -
             | 
| 34 | 
            -
             | 
| 35 | 
            -
             | 
| 36 | 
            -
                 | 
| 37 | 
            -
             | 
| 38 | 
            -
             | 
| 39 | 
            -
                 | 
| 40 | 
            -
             | 
| 41 | 
            -
             | 
| 42 | 
            -
             | 
| 43 | 
            -
             | 
| 44 | 
            -
                                 | 
| 45 | 
            -
             | 
| 46 | 
            -
             | 
| 47 | 
            -
                                 | 
| 48 | 
            -
                                 | 
| 49 | 
            -
             | 
| 50 | 
            -
             | 
| 51 | 
            -
             | 
| 52 | 
            -
             | 
| 53 | 
            -
             | 
| 54 | 
            -
             | 
| 55 | 
            -
             | 
| 56 | 
            -
            if __name__ == "__main__":
         | 
| 57 | 
            -
                while True:
         | 
| 58 | 
            -
                    main()
         | 
| 59 | 
            -
                    close_connection()
         | 
| 60 | 
             
                    time.sleep(1)
         | 
|  | |
| 1 | 
            +
            #
         | 
| 2 | 
            +
            #  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
         | 
| 3 | 
            +
            #
         | 
| 4 | 
            +
            #  Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 5 | 
            +
            #  you may not use this file except in compliance with the License.
         | 
| 6 | 
            +
            #  You may obtain a copy of the License at
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            #      http://www.apache.org/licenses/LICENSE-2.0
         | 
| 9 | 
            +
            #
         | 
| 10 | 
            +
            #  Unless required by applicable law or agreed to in writing, software
         | 
| 11 | 
            +
            #  distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 12 | 
            +
            #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 13 | 
            +
            #  See the License for the specific language governing permissions and
         | 
| 14 | 
            +
            #  limitations under the License.
         | 
| 15 | 
            +
            #
         | 
| 16 | 
            +
            import logging
         | 
| 17 | 
            +
            import time
         | 
| 18 | 
            +
            import traceback
         | 
| 19 | 
            +
             | 
| 20 | 
            +
            from api.db.db_models import close_connection
         | 
| 21 | 
            +
            from api.db.services.task_service import TaskService
         | 
| 22 | 
            +
            from rag.utils.minio_conn import MINIOs
         | 
| 23 | 
            +
            from rag.utils.redis_conn import REDIS_CONN
         | 
| 24 | 
            +
             | 
| 25 | 
            +
             | 
| 26 | 
            +
            def collect():
         | 
| 27 | 
            +
                doc_locations = TaskService.get_ongoing_doc_name()
         | 
| 28 | 
            +
                logging.debug(doc_locations)
         | 
| 29 | 
            +
                if len(doc_locations) == 0:
         | 
| 30 | 
            +
                    time.sleep(1)
         | 
| 31 | 
            +
                    return
         | 
| 32 | 
            +
                return doc_locations
         | 
| 33 | 
            +
             | 
| 34 | 
            +
             | 
| 35 | 
            +
            def main():
         | 
| 36 | 
            +
                locations = collect()
         | 
| 37 | 
            +
                if not locations:
         | 
| 38 | 
            +
                    return
         | 
| 39 | 
            +
                logging.info(f"TASKS: {len(locations)}")
         | 
| 40 | 
            +
                for kb_id, loc in locations:
         | 
| 41 | 
            +
                    try:
         | 
| 42 | 
            +
                        if REDIS_CONN.is_alive():
         | 
| 43 | 
            +
                            try:
         | 
| 44 | 
            +
                                key = "{}/{}".format(kb_id, loc)
         | 
| 45 | 
            +
                                if REDIS_CONN.exist(key):
         | 
| 46 | 
            +
                                    continue
         | 
| 47 | 
            +
                                file_bin = MINIOs.get(kb_id, loc)
         | 
| 48 | 
            +
                                REDIS_CONN.transaction(key, file_bin, 12 * 60)
         | 
| 49 | 
            +
                                logging.info("CACHE: {}".format(loc))
         | 
| 50 | 
            +
                            except Exception as e:
         | 
| 51 | 
            +
                                traceback.print_stack(e)
         | 
| 52 | 
            +
                    except Exception as e:
         | 
| 53 | 
            +
                        traceback.print_stack(e)
         | 
| 54 | 
            +
             | 
| 55 | 
            +
             | 
| 56 | 
            +
            if __name__ == "__main__":
         | 
| 57 | 
            +
                while True:
         | 
| 58 | 
            +
                    main()
         | 
| 59 | 
            +
                    close_connection()
         | 
| 60 | 
             
                    time.sleep(1)
         |