KevinHuSh commited on
Commit
1ed30a6
·
1 Parent(s): 84f80c5

Add 'One' chunk method (#137)

Browse files
README.md CHANGED
@@ -88,8 +88,8 @@ If your machine doesn't have *Docker* installed, please refer to [Install Docker
88
  > In **user_default_llm** of [service_conf.yaml](./docker/service_conf.yaml), you need to specify LLM factory and your own _API_KEY_.
89
  > It's O.K. if you don't have _API_KEY_ at the moment, you can specify it later at the setting part after starting and logging in the system.
90
  > - We have supported the following LLM factories, and the others are coming soon:
91
- > [OpenAI](https://platform.openai.com/login?launch), [通义千问/QWen](https://dashscope.console.aliyun.com/model),
92
- > [智谱AI/ZhipuAI](https://open.bigmodel.cn/)
93
  ```bash
94
  121:/# git clone https://github.com/infiniflow/ragflow.git
95
  121:/# cd ragflow/docker
 
88
  > In **user_default_llm** of [service_conf.yaml](./docker/service_conf.yaml), you need to specify LLM factory and your own _API_KEY_.
89
  > It's O.K. if you don't have _API_KEY_ at the moment, you can specify it later at the setting part after starting and logging in the system.
90
  > - We have supported the following LLM factories, and the others are coming soon:
91
+ > [OpenAI](https://platform.openai.com/login?launch), [Tongyi-Qianwen](https://dashscope.console.aliyun.com/model),
92
+ > [ZHIPU-AI](https://open.bigmodel.cn/), [Moonshot](https://platform.moonshot.cn/docs/docs)
93
  ```bash
94
  121:/# git clone https://github.com/infiniflow/ragflow.git
95
  121:/# cd ragflow/docker
api/db/__init__.py CHANGED
@@ -79,3 +79,4 @@ class ParserType(StrEnum):
79
  TABLE = "table"
80
  NAIVE = "naive"
81
  PICTURE = "picture"
 
 
79
  TABLE = "table"
80
  NAIVE = "naive"
81
  PICTURE = "picture"
82
+ ONE = "one"
api/db/init_data.py CHANGED
@@ -79,12 +79,12 @@ factory_infos = [{
79
  "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
80
  "status": "1",
81
  },{
82
- "name": "通义千问",
83
  "logo": "",
84
  "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
85
  "status": "1",
86
  },{
87
- "name": "智谱AI",
88
  "logo": "",
89
  "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
90
  "status": "1",
@@ -270,6 +270,14 @@ def init_llm_factory():
270
  except Exception as e:
271
  pass
272
 
 
 
 
 
 
 
 
 
273
 
274
  def init_web_data():
275
  start_time = time.time()
 
79
  "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
80
  "status": "1",
81
  },{
82
+ "name": "Tongyi-Qianwen",
83
  "logo": "",
84
  "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
85
  "status": "1",
86
  },{
87
+ "name": "ZHIPU-AI",
88
  "logo": "",
89
  "tags": "LLM,TEXT EMBEDDING,SPEECH2TEXT,MODERATION",
90
  "status": "1",
 
270
  except Exception as e:
271
  pass
272
 
273
+ """
274
+ drop table llm;
275
+ drop table factories;
276
+ update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问';
277
+ update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI';
278
+ update tenant set parser_ids='naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture';
279
+ """
280
+
281
 
282
  def init_web_data():
283
  start_time = time.time()
api/settings.py CHANGED
@@ -52,7 +52,7 @@ REQUEST_MAX_WAIT_SEC = 300
52
  USE_REGISTRY = get_base_config("use_registry")
53
 
54
  default_llm = {
55
- "通义千问": {
56
  "chat_model": "qwen-plus",
57
  "embedding_model": "text-embedding-v2",
58
  "image2text_model": "qwen-vl-max",
@@ -64,7 +64,7 @@ default_llm = {
64
  "image2text_model": "gpt-4-vision-preview",
65
  "asr_model": "whisper-1",
66
  },
67
- "智谱AI": {
68
  "chat_model": "glm-3-turbo",
69
  "embedding_model": "embedding-2",
70
  "image2text_model": "glm-4v",
@@ -84,17 +84,17 @@ default_llm = {
84
  }
85
  }
86
  LLM = get_base_config("user_default_llm", {})
87
- LLM_FACTORY = LLM.get("factory", "通义千问")
88
  if LLM_FACTORY not in default_llm:
89
- print("\33[91m【ERROR】\33[0m:", f"LLM factory {LLM_FACTORY} has not supported yet, switch to '通义千问/QWen' automatically, and please check the API_KEY in service_conf.yaml.")
90
- LLM_FACTORY = "通义千问"
91
  CHAT_MDL = default_llm[LLM_FACTORY]["chat_model"]
92
  EMBEDDING_MDL = default_llm[LLM_FACTORY]["embedding_model"]
93
  ASR_MDL = default_llm[LLM_FACTORY]["asr_model"]
94
  IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
95
 
96
  API_KEY = LLM.get("api_key", "")
97
- PARSERS = LLM.get("parsers", "naive:General,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture")
98
 
99
  # distribution
100
  DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
 
52
  USE_REGISTRY = get_base_config("use_registry")
53
 
54
  default_llm = {
55
+ "Tongyi-Qianwen": {
56
  "chat_model": "qwen-plus",
57
  "embedding_model": "text-embedding-v2",
58
  "image2text_model": "qwen-vl-max",
 
64
  "image2text_model": "gpt-4-vision-preview",
65
  "asr_model": "whisper-1",
66
  },
67
+ "ZHIPU-AI": {
68
  "chat_model": "glm-3-turbo",
69
  "embedding_model": "embedding-2",
70
  "image2text_model": "glm-4v",
 
84
  }
85
  }
86
  LLM = get_base_config("user_default_llm", {})
87
+ LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen")
88
  if LLM_FACTORY not in default_llm:
89
+ print("\33[91m【ERROR】\33[0m:", f"LLM factory {LLM_FACTORY} has not supported yet, switch to 'Tongyi-Qianwen/QWen' automatically, and please check the API_KEY in service_conf.yaml.")
90
+ LLM_FACTORY = "Tongyi-Qianwen"
91
  CHAT_MDL = default_llm[LLM_FACTORY]["chat_model"]
92
  EMBEDDING_MDL = default_llm[LLM_FACTORY]["embedding_model"]
93
  ASR_MDL = default_llm[LLM_FACTORY]["asr_model"]
94
  IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
95
 
96
  API_KEY = LLM.get("api_key", "")
97
+ PARSERS = LLM.get("parsers", "naive:General,one:One,qa:Q&A,resume:Resume,table:Table,laws:Laws,manual:Manual,book:Book,paper:Paper,presentation:Presentation,picture:Picture")
98
 
99
  # distribution
100
  DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
rag/app/manual.py CHANGED
@@ -57,7 +57,7 @@ class Pdf(PdfParser):
57
  sec_ids = []
58
  sid = 0
59
  for i, lvl in enumerate(levels):
60
- if lvl <= most_level: sid += 1
61
  sec_ids.append(sid)
62
  #print(lvl, self.boxes[i]["text"], most_level)
63
 
@@ -75,7 +75,7 @@ class Pdf(PdfParser):
75
  continue
76
  chunks.append(txt + poss)
77
  if sec_id >-1: last_sid = sec_id
78
- return chunks
79
 
80
 
81
  def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
@@ -86,7 +86,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
86
 
87
  if re.search(r"\.pdf$", filename, re.IGNORECASE):
88
  pdf_parser = Pdf()
89
- cks = pdf_parser(filename if not binary else binary,
90
  from_page=from_page, to_page=to_page, callback=callback)
91
  else: raise NotImplementedError("file type not supported yet(pdf supported)")
92
  doc = {
@@ -100,7 +100,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
100
  i = 0
101
  chunk = []
102
  tk_cnt = 0
103
- res = []
104
  def add_chunk():
105
  nonlocal chunk, res, doc, pdf_parser, tk_cnt
106
  d = copy.deepcopy(doc)
 
57
  sec_ids = []
58
  sid = 0
59
  for i, lvl in enumerate(levels):
60
+ if lvl <= most_level and i > 0 and lvl != levels[i-1]: sid += 1
61
  sec_ids.append(sid)
62
  #print(lvl, self.boxes[i]["text"], most_level)
63
 
 
75
  continue
76
  chunks.append(txt + poss)
77
  if sec_id >-1: last_sid = sec_id
78
+ return chunks, tbls
79
 
80
 
81
  def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
 
86
 
87
  if re.search(r"\.pdf$", filename, re.IGNORECASE):
88
  pdf_parser = Pdf()
89
+ cks, tbls = pdf_parser(filename if not binary else binary,
90
  from_page=from_page, to_page=to_page, callback=callback)
91
  else: raise NotImplementedError("file type not supported yet(pdf supported)")
92
  doc = {
 
100
  i = 0
101
  chunk = []
102
  tk_cnt = 0
103
+ res = tokenize_table(tbls, doc, eng)
104
  def add_chunk():
105
  nonlocal chunk, res, doc, pdf_parser, tk_cnt
106
  d = copy.deepcopy(doc)
rag/app/naive.py CHANGED
@@ -49,7 +49,7 @@ class Pdf(PdfParser):
49
 
50
  def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
51
  """
52
- Supported file formats are docx, pdf, txt.
53
  This method applies the naive ways to chunk files.
54
  Successive text will be sliced into pieces using 'delimiter'.
55
  Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'.
 
49
 
50
  def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
51
  """
52
+ Supported file formats are docx, pdf, excel, txt.
53
  This method applies the naive ways to chunk files.
54
  Successive text will be sliced into pieces using 'delimiter'.
55
  Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'.
rag/app/one.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Licensed under the Apache License, Version 2.0 (the "License");
2
+ # you may not use this file except in compliance with the License.
3
+ # You may obtain a copy of the License at
4
+ #
5
+ # http://www.apache.org/licenses/LICENSE-2.0
6
+ #
7
+ # Unless required by applicable law or agreed to in writing, software
8
+ # distributed under the License is distributed on an "AS IS" BASIS,
9
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10
+ # See the License for the specific language governing permissions and
11
+ # limitations under the License.
12
+ #
13
+ import copy
14
+ import re
15
+ from rag.app import laws
16
+ from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions
17
+ from deepdoc.parser import PdfParser, ExcelParser
18
+ from rag.settings import cron_logger
19
+
20
+
21
+ class Pdf(PdfParser):
22
+ def __call__(self, filename, binary=None, from_page=0,
23
+ to_page=100000, zoomin=3, callback=None):
24
+ callback(msg="OCR is running...")
25
+ self.__images__(
26
+ filename if not binary else binary,
27
+ zoomin,
28
+ from_page,
29
+ to_page,
30
+ callback
31
+ )
32
+ callback(msg="OCR finished")
33
+
34
+ from timeit import default_timer as timer
35
+ start = timer()
36
+ self._layouts_rec(zoomin)
37
+ callback(0.63, "Layout analysis finished.")
38
+ print("paddle layouts:", timer() - start)
39
+ self._table_transformer_job(zoomin)
40
+ callback(0.65, "Table analysis finished.")
41
+ self._text_merge()
42
+ callback(0.67, "Text merging finished")
43
+ tbls = self._extract_table_figure(True, zoomin, True, True)
44
+ self._concat_downward()
45
+
46
+ sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
47
+ for (img, rows), poss in tbls:
48
+ sections.append((rows if isinstance(rows, str) else rows[0],
49
+ [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
50
+ return [txt for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))]
51
+
52
+
53
+ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
54
+ """
55
+ Supported file formats are docx, pdf, excel, txt.
56
+ One file forms a chunk which maintains original text order.
57
+ """
58
+
59
+ eng = lang.lower() == "english"#is_english(cks)
60
+
61
+ sections = []
62
+ if re.search(r"\.docx?$", filename, re.IGNORECASE):
63
+ callback(0.1, "Start to parse.")
64
+ for txt in laws.Docx()(filename, binary):
65
+ sections.append(txt)
66
+ callback(0.8, "Finish parsing.")
67
+ elif re.search(r"\.pdf$", filename, re.IGNORECASE):
68
+ pdf_parser = Pdf()
69
+ sections = pdf_parser(filename if not binary else binary, to_page=to_page, callback=callback)
70
+ elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
71
+ callback(0.1, "Start to parse.")
72
+ excel_parser = ExcelParser()
73
+ sections = [excel_parser.html(binary)]
74
+ elif re.search(r"\.txt$", filename, re.IGNORECASE):
75
+ callback(0.1, "Start to parse.")
76
+ txt = ""
77
+ if binary:
78
+ txt = binary.decode("utf-8")
79
+ else:
80
+ with open(filename, "r") as f:
81
+ while True:
82
+ l = f.readline()
83
+ if not l: break
84
+ txt += l
85
+ sections = txt.split("\n")
86
+ sections = [(l, "") for l in sections if l]
87
+ callback(0.8, "Finish parsing.")
88
+ else:
89
+ raise NotImplementedError("file type not supported yet(docx, pdf, txt supported)")
90
+
91
+ doc = {
92
+ "docnm_kwd": filename,
93
+ "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
94
+ }
95
+ doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
96
+ tokenize(doc, "\n".join(sections), eng)
97
+ return [doc]
98
+
99
+
100
+ if __name__ == "__main__":
101
+ import sys
102
+
103
+
104
+ def dummy(prog=None, msg=""):
105
+ pass
106
+
107
+
108
+ chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
rag/llm/__init__.py CHANGED
@@ -21,8 +21,8 @@ from .cv_model import *
21
  EmbeddingModel = {
22
  "Local": HuEmbedding,
23
  "OpenAI": OpenAIEmbed,
24
- "通义千问": HuEmbedding, #QWenEmbed,
25
- "智谱AI": ZhipuEmbed,
26
  "Moonshot": HuEmbedding
27
  }
28
 
@@ -30,16 +30,16 @@ EmbeddingModel = {
30
  CvModel = {
31
  "OpenAI": GptV4,
32
  "Local": LocalCV,
33
- "通义千问": QWenCV,
34
- "智谱AI": Zhipu4V,
35
  "Moonshot": LocalCV
36
  }
37
 
38
 
39
  ChatModel = {
40
  "OpenAI": GptTurbo,
41
- "智谱AI": ZhipuChat,
42
- "通义千问": QWenChat,
43
  "Local": LocalLLM,
44
  "Moonshot": MoonshotChat
45
  }
 
21
  EmbeddingModel = {
22
  "Local": HuEmbedding,
23
  "OpenAI": OpenAIEmbed,
24
+ "Tongyi-Qianwen": HuEmbedding, #QWenEmbed,
25
+ "ZHIPU-AI": ZhipuEmbed,
26
  "Moonshot": HuEmbedding
27
  }
28
 
 
30
  CvModel = {
31
  "OpenAI": GptV4,
32
  "Local": LocalCV,
33
+ "Tongyi-Qianwen": QWenCV,
34
+ "ZHIPU-AI": Zhipu4V,
35
  "Moonshot": LocalCV
36
  }
37
 
38
 
39
  ChatModel = {
40
  "OpenAI": GptTurbo,
41
+ "ZHIPU-AI": ZhipuChat,
42
+ "Tongyi-Qianwen": QWenChat,
43
  "Local": LocalLLM,
44
  "Moonshot": MoonshotChat
45
  }
rag/nlp/search.py CHANGED
@@ -194,7 +194,7 @@ class Dealer:
194
  return [float(t) for t in txt.split("\t")]
195
 
196
  def insert_citations(self, answer, chunks, chunk_v,
197
- embd_mdl, tkweight=0.7, vtweight=0.3):
198
  assert len(chunks) == len(chunk_v)
199
  pieces = re.split(r"(```)", answer)
200
  if len(pieces) >= 3:
@@ -243,7 +243,7 @@ class Dealer:
243
  chunks_tks,
244
  tkweight, vtweight)
245
  mx = np.max(sim) * 0.99
246
- if mx < 0.7:
247
  continue
248
  cites[idx[i]] = list(
249
  set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
 
194
  return [float(t) for t in txt.split("\t")]
195
 
196
  def insert_citations(self, answer, chunks, chunk_v,
197
+ embd_mdl, tkweight=0.1, vtweight=0.9):
198
  assert len(chunks) == len(chunk_v)
199
  pieces = re.split(r"(```)", answer)
200
  if len(pieces) >= 3:
 
243
  chunks_tks,
244
  tkweight, vtweight)
245
  mx = np.max(sim) * 0.99
246
+ if mx < 0.65:
247
  continue
248
  cites[idx[i]] = list(
249
  set([str(ii) for ii in range(len(chunk_v)) if sim[ii] > mx]))[:4]
rag/svr/task_broker.py CHANGED
@@ -84,6 +84,7 @@ def dispatch():
84
  pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
85
  page_size = 5
86
  if r["parser_id"] == "paper": page_size = 12
 
87
  for s,e in r["parser_config"].get("pages", [(0,100000)]):
88
  e = min(e, pages)
89
  for p in range(s, e, page_size):
 
84
  pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
85
  page_size = 5
86
  if r["parser_id"] == "paper": page_size = 12
87
+ if r["parser_id"] == "one": page_size = 1000000000
88
  for s,e in r["parser_config"].get("pages", [(0,100000)]):
89
  e = min(e, pages)
90
  for p in range(s, e, page_size):
rag/svr/task_executor.py CHANGED
@@ -39,7 +39,7 @@ from rag.nlp import search
39
  from io import BytesIO
40
  import pandas as pd
41
 
42
- from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive
43
 
44
  from api.db import LLMType, ParserType
45
  from api.db.services.document_service import DocumentService
@@ -60,6 +60,7 @@ FACTORY = {
60
  ParserType.TABLE.value: table,
61
  ParserType.RESUME.value: resume,
62
  ParserType.PICTURE.value: picture,
 
63
  }
64
 
65
 
 
39
  from io import BytesIO
40
  import pandas as pd
41
 
42
+ from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one
43
 
44
  from api.db import LLMType, ParserType
45
  from api.db.services.document_service import DocumentService
 
60
  ParserType.TABLE.value: table,
61
  ParserType.RESUME.value: resume,
62
  ParserType.PICTURE.value: picture,
63
+ ParserType.ONE.value: one,
64
  }
65
 
66