KevinHuSh
committed on
Commit
·
ce95639
1
Parent(s):
6a5fa84
resolve the issue of naive parser (#87)
Browse files- README.md +30 -10
- api/apps/user_app.py +4 -4
- api/db/init_data.py +1 -1
- api/ragflow_server.py +0 -1
- api/settings.py +1 -1
- deepdoc/parser/pdf_parser.py +4 -3
- rag/app/naive.py +1 -2
README.md
CHANGED
|
@@ -8,8 +8,7 @@ English | [简体中文](./README_zh.md)
|
|
| 8 |
If your machine doesn't have *Docker* installed, please refer to [Install Docker Engine](https://docs.docker.com/engine/install/)
|
| 9 |
|
| 10 |
### OS Setups
|
| 11 |
-
|
| 12 |
-
you need to check the following command:
|
| 13 |
```bash
|
| 14 |
121:/ragflow# sysctl vm.max_map_count
|
| 15 |
vm.max_map_count = 262144
|
|
@@ -25,23 +24,44 @@ Add or update the following line in the file:
|
|
| 25 |
vm.max_map_count=262144
|
| 26 |
```
|
| 27 |
|
| 28 |
-
|
| 29 |
> If you want to change the basic settings, such as port, password, etc., please refer to [.env](./docker/.env) before starting the system.
|
| 30 |
|
| 31 |
-
> If you change anything in [.env](./docker/.env), please check [service_conf.yaml](./
|
| 32 |
> configuration of the back-end service and should be consistent with [.env](./docker/.env).
|
| 33 |
|
| 34 |
-
> - In [service_conf.yaml](./
|
| 35 |
-
> In **user_default_llm** of [service_conf.yaml](./
|
| 36 |
> It's OK if you don't have an _API_KEY_ at the moment; you can specify it later in the settings after starting the system and logging in.
|
| 37 |
> - We currently support the following LLM factories, and others are coming soon:
|
| 38 |
> [OpenAI](https://platform.openai.com/login?launch), [通义千问/QWen](https://dashscope.console.aliyun.com/model),
|
| 39 |
-
> [
|
| 40 |
```bash
|
| 41 |
121:/ragflow# cd docker
|
| 42 |
-
121:/ragflow/docker# docker compose up
|
| 43 |
```
|
| 44 |
-
If after a
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
| 46 |
-
<img src="https://github.com/infiniflow/ragflow/assets/12318111/
|
| 47 |
</div>
|
|
|
|
| 8 |
If your machine doesn't have *Docker* installed, please refer to [Install Docker Engine](https://docs.docker.com/engine/install/)
|
| 9 |
|
| 10 |
### OS Setups
|
| 11 |
+
First, check the current value with the following command:
|
|
|
|
| 12 |
```bash
|
| 13 |
121:/ragflow# sysctl vm.max_map_count
|
| 14 |
vm.max_map_count = 262144
|
|
|
|
| 24 |
vm.max_map_count=262144
|
| 25 |
```
|
| 26 |
|
| 27 |
+
## Here we go!
|
| 28 |
> If you want to change the basic settings, such as port, password, etc., please refer to [.env](./docker/.env) before starting the system.
|
| 29 |
|
| 30 |
+
> If you change anything in [.env](./docker/.env), please check [service_conf.yaml](./docker/service_conf.yaml) which is a
|
| 31 |
> configuration of the back-end service and should be consistent with [.env](./docker/.env).
|
| 32 |
|
| 33 |
+
> - In [service_conf.yaml](./docker/service_conf.yaml), configuration of *LLM* in **user_default_llm** is strongly recommended.
|
| 34 |
+
> In **user_default_llm** of [service_conf.yaml](./docker/service_conf.yaml), you need to specify LLM factory and your own _API_KEY_.
|
| 35 |
> It's OK if you don't have an _API_KEY_ at the moment; you can specify it later in the settings after starting the system and logging in.
|
| 36 |
> - We currently support the following LLM factories, and others are coming soon:
|
| 37 |
> [OpenAI](https://platform.openai.com/login?launch), [通义千问/QWen](https://dashscope.console.aliyun.com/model),
|
| 38 |
+
> [智谱AI/ZhipuAI](https://open.bigmodel.cn/)
|
| 39 |
```bash
|
| 40 |
121:/ragflow# cd docker
|
| 41 |
+
121:/ragflow/docker# docker compose up -d
|
| 42 |
```
|
| 43 |
+
After about half a minute, use the following command to check the server status. If you see the following output,
|
| 44 |
+
_**Hallelujah!**_ You have successfully launched the system.
|
| 45 |
+
```bash
|
| 46 |
+
121:/ragflow# docker logs -f ragflow-server
|
| 47 |
+
|
| 48 |
+
____ ______ __
|
| 49 |
+
/ __ \ ____ _ ____ _ / ____// /____ _ __
|
| 50 |
+
/ /_/ // __ `// __ `// /_ / // __ \| | /| / /
|
| 51 |
+
/ _, _// /_/ // /_/ // __/ / // /_/ /| |/ |/ /
|
| 52 |
+
/_/ |_| \__,_/ \__, //_/ /_/ \____/ |__/|__/
|
| 53 |
+
/____/
|
| 54 |
+
|
| 55 |
+
* Running on all addresses (0.0.0.0)
|
| 56 |
+
* Running on http://127.0.0.1:9380
|
| 57 |
+
* Running on http://172.22.0.5:9380
|
| 58 |
+
INFO:werkzeug:Press CTRL+C to quit
|
| 59 |
+
|
| 60 |
+
```
|
| 61 |
+
Open your browser and enter the IP address of your server. If you see the following page, _**Hallelujah**_ again!
|
| 62 |
+
> The default serving port is 80. If you want to change it, please refer to [ragflow.conf](./nginx/ragflow.conf),
|
| 63 |
+
> and change the *listen* value.
|
| 64 |
+
|
| 65 |
<div align="center" style="margin-top:20px;margin-bottom:20px;">
|
| 66 |
+
<img src="https://github.com/infiniflow/ragflow/assets/12318111/b24a7a5f-4d1d-4a30-90b1-7b0ec558b79d" width="1000"/>
|
| 67 |
</div>
|
api/apps/user_app.py
CHANGED
|
@@ -70,11 +70,10 @@ def github_callback():
|
|
| 70 |
}, headers={"Accept": "application/json"})
|
| 71 |
res = res.json()
|
| 72 |
if "error" in res:
|
| 73 |
-
return
|
| 74 |
-
retmsg=res["error_description"])
|
| 75 |
|
| 76 |
if "user:email" not in res["scope"].split(","):
|
| 77 |
-
return
|
| 78 |
|
| 79 |
session["access_token"] = res["access_token"]
|
| 80 |
session["access_token_from"] = "github"
|
|
@@ -104,8 +103,9 @@ def github_callback():
|
|
| 104 |
except Exception as e:
|
| 105 |
rollback_user_registration(user_id)
|
| 106 |
stat_logger.exception(e)
|
|
|
|
| 107 |
|
| 108 |
-
return redirect("
|
| 109 |
|
| 110 |
|
| 111 |
def user_info_from_github(access_token):
|
|
|
|
| 70 |
}, headers={"Accept": "application/json"})
|
| 71 |
res = res.json()
|
| 72 |
if "error" in res:
|
| 73 |
+
return redirect("/?error=%s" % res["error_description"])
|
|
|
|
| 74 |
|
| 75 |
if "user:email" not in res["scope"].split(","):
|
| 76 |
+
return redirect("/?error=user:email not in scope")
|
| 77 |
|
| 78 |
session["access_token"] = res["access_token"]
|
| 79 |
session["access_token_from"] = "github"
|
|
|
|
| 103 |
except Exception as e:
|
| 104 |
rollback_user_registration(user_id)
|
| 105 |
stat_logger.exception(e)
|
| 106 |
+
return redirect("/?error=%s"%str(e))
|
| 107 |
|
| 108 |
+
return redirect("/?auth=%s"%user_id)
|
| 109 |
|
| 110 |
|
| 111 |
def user_info_from_github(access_token):
|
api/db/init_data.py
CHANGED
|
@@ -85,7 +85,7 @@ def init_llm_factory():
|
|
| 85 |
"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
|
| 86 |
"status": "1",
|
| 87 |
},{
|
| 88 |
-
"name": "
|
| 89 |
"logo": "",
|
| 90 |
"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
|
| 91 |
"status": "1",
|
|
|
|
| 85 |
"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
|
| 86 |
"status": "1",
|
| 87 |
},{
|
| 88 |
+
"name": "智谱AI",
|
| 89 |
"logo": "",
|
| 90 |
"tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
|
| 91 |
"status": "1",
|
api/ragflow_server.py
CHANGED
|
@@ -13,7 +13,6 @@
|
|
| 13 |
# See the License for the specific language governing permissions and
|
| 14 |
# limitations under the License.
|
| 15 |
#
|
| 16 |
-
# init env. must be the first import
|
| 17 |
|
| 18 |
import logging
|
| 19 |
import os
|
|
|
|
| 13 |
# See the License for the specific language governing permissions and
|
| 14 |
# limitations under the License.
|
| 15 |
#
|
|
|
|
| 16 |
|
| 17 |
import logging
|
| 18 |
import os
|
api/settings.py
CHANGED
|
@@ -58,7 +58,7 @@ default_llm = {
|
|
| 58 |
"image2text_model": "gpt-4-vision-preview",
|
| 59 |
"asr_model": "whisper-1",
|
| 60 |
},
|
| 61 |
-
"
|
| 62 |
"chat_model": "glm-3-turbo",
|
| 63 |
"embedding_model": "embedding-2",
|
| 64 |
"image2text_model": "glm-4v",
|
|
|
|
| 58 |
"image2text_model": "gpt-4-vision-preview",
|
| 59 |
"asr_model": "whisper-1",
|
| 60 |
},
|
| 61 |
+
"智谱AI": {
|
| 62 |
"chat_model": "glm-3-turbo",
|
| 63 |
"embedding_model": "embedding-2",
|
| 64 |
"image2text_model": "glm-4v",
|
deepdoc/parser/pdf_parser.py
CHANGED
|
@@ -24,9 +24,10 @@ logging.getLogger("pdfminer").setLevel(logging.WARNING)
|
|
| 24 |
class HuParser:
|
| 25 |
def __init__(self):
|
| 26 |
self.ocr = OCR()
|
| 27 |
-
if
|
| 28 |
-
self.
|
| 29 |
-
|
|
|
|
| 30 |
self.tbl_det = TableStructureRecognizer()
|
| 31 |
|
| 32 |
self.updown_cnt_mdl = xgb.Booster()
|
|
|
|
| 24 |
class HuParser:
|
| 25 |
def __init__(self):
|
| 26 |
self.ocr = OCR()
|
| 27 |
+
if hasattr(self, "model_speciess"):
|
| 28 |
+
self.layouter = LayoutRecognizer("layout."+self.model_speciess)
|
| 29 |
+
else:
|
| 30 |
+
self.layouter = LayoutRecognizer("layout")
|
| 31 |
self.tbl_det = TableStructureRecognizer()
|
| 32 |
|
| 33 |
self.updown_cnt_mdl = xgb.Booster()
|
rag/app/naive.py
CHANGED
|
@@ -30,7 +30,6 @@ class Pdf(PdfParser):
|
|
| 30 |
|
| 31 |
from timeit import default_timer as timer
|
| 32 |
start = timer()
|
| 33 |
-
start = timer()
|
| 34 |
self._layouts_rec(zoomin)
|
| 35 |
callback(0.5, "Layout analysis finished.")
|
| 36 |
print("paddle layouts:", timer() - start)
|
|
@@ -102,7 +101,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|
| 102 |
raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
|
| 103 |
|
| 104 |
parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
|
| 105 |
-
cks = naive_merge(sections, parser_config
|
| 106 |
|
| 107 |
# wrap up to es documents
|
| 108 |
for ck in cks:
|
|
|
|
| 30 |
|
| 31 |
from timeit import default_timer as timer
|
| 32 |
start = timer()
|
|
|
|
| 33 |
self._layouts_rec(zoomin)
|
| 34 |
callback(0.5, "Layout analysis finished.")
|
| 35 |
print("paddle layouts:", timer() - start)
|
|
|
|
| 101 |
raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
|
| 102 |
|
| 103 |
parser_config = kwargs.get("parser_config", {"chunk_token_num": 128, "delimiter": "\n!?。;!?"})
|
| 104 |
+
cks = naive_merge(sections, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?。;!?"))
|
| 105 |
|
| 106 |
# wrap up to es documents
|
| 107 |
for ck in cks:
|