KevinHuSh committed on
Commit
0cfb2df
·
1 Parent(s): 1356753

fix docker compose issue (#238)

Browse files

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context
that will help reviewers understand the purpose of the PR._

Issue link: [#226](https://github.com/infiniflow/ragflow/issues/226)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

api/apps/document_app.py CHANGED
@@ -65,6 +65,11 @@ def upload():
65
  DocumentService.query,
66
  name=file.filename,
67
  kb_id=kb.id)
 
 
 
 
 
68
  location = filename
69
  while MINIO.obj_exist(kb_id, location):
70
  location += "_"
 
65
  DocumentService.query,
66
  name=file.filename,
67
  kb_id=kb.id)
68
+ filetype = filename_type(filename)
69
+ if not filetype:
70
+ return get_data_error_result(
71
+ retmsg="This type of file has not been supported yet!")
72
+
73
  location = filename
74
  while MINIO.obj_exist(kb_id, location):
75
  location += "_"
api/apps/user_app.py CHANGED
@@ -25,7 +25,7 @@ from api.utils.api_utils import server_error_response, validate_request
25
  from api.utils import get_uuid, get_format_time, decrypt, download_img
26
  from api.db import UserTenantRole, LLMType
27
  from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
28
- LLM_FACTORY
29
  from api.db.services.user_service import UserService, TenantService, UserTenantService
30
  from api.settings import stat_logger
31
  from api.utils.api_utils import get_json_result, cors_reponse
@@ -220,7 +220,9 @@ def user_register(user_id, user):
220
  "llm_factory": LLM_FACTORY,
221
  "llm_name": llm.llm_name,
222
  "model_type": llm.model_type,
223
- "api_key": API_KEY})
 
 
224
 
225
  if not UserService.save(**user):
226
  return
 
25
  from api.utils import get_uuid, get_format_time, decrypt, download_img
26
  from api.db import UserTenantRole, LLMType
27
  from api.settings import RetCode, GITHUB_OAUTH, CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, API_KEY, \
28
+ LLM_FACTORY, LLM_BASE_URL
29
  from api.db.services.user_service import UserService, TenantService, UserTenantService
30
  from api.settings import stat_logger
31
  from api.utils.api_utils import get_json_result, cors_reponse
 
220
  "llm_factory": LLM_FACTORY,
221
  "llm_name": llm.llm_name,
222
  "model_type": llm.model_type,
223
+ "api_key": API_KEY,
224
+ "base_url": LLM_BASE_URL
225
+ })
226
 
227
  if not UserService.save(**user):
228
  return
api/db/init_data.py CHANGED
@@ -13,6 +13,7 @@
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
 
16
  import time
17
  import uuid
18
 
@@ -21,7 +22,7 @@ from api.db.db_models import init_database_tables as init_web_db
21
  from api.db.services import UserService
22
  from api.db.services.llm_service import LLMFactoriesService, LLMService, TenantLLMService, LLMBundle
23
  from api.db.services.user_service import TenantService, UserTenantService
24
- from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY
25
 
26
 
27
  def init_superuser():
@@ -53,7 +54,7 @@ def init_superuser():
53
  for llm in LLMService.query(fid=LLM_FACTORY):
54
  tenant_llm.append(
55
  {"tenant_id": user_info["id"], "llm_factory": LLM_FACTORY, "llm_name": llm.llm_name, "model_type": llm.model_type,
56
- "api_key": API_KEY})
57
 
58
  if not UserService.save(**user_info):
59
  print("\033[93m【ERROR】\033[0mcan't init admin.")
@@ -282,11 +283,8 @@ def init_llm_factory():
282
  pass
283
 
284
  """
285
- modify service_config
286
  drop table llm;
287
  drop table llm_factories;
288
- update tenant_llm set llm_factory='Tongyi-Qianwen' where llm_factory='通义千问';
289
- update tenant_llm set llm_factory='ZHIPU-AI' where llm_factory='智谱AI';
290
  update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
291
  alter table knowledgebase modify avatar longtext;
292
  alter table user modify avatar longtext;
 
13
  # See the License for the specific language governing permissions and
14
  # limitations under the License.
15
  #
16
+ import os
17
  import time
18
  import uuid
19
 
 
22
  from api.db.services import UserService
23
  from api.db.services.llm_service import LLMFactoriesService, LLMService, TenantLLMService, LLMBundle
24
  from api.db.services.user_service import TenantService, UserTenantService
25
+ from api.settings import CHAT_MDL, EMBEDDING_MDL, ASR_MDL, IMAGE2TEXT_MDL, PARSERS, LLM_FACTORY, API_KEY, LLM_BASE_URL
26
 
27
 
28
  def init_superuser():
 
54
  for llm in LLMService.query(fid=LLM_FACTORY):
55
  tenant_llm.append(
56
  {"tenant_id": user_info["id"], "llm_factory": LLM_FACTORY, "llm_name": llm.llm_name, "model_type": llm.model_type,
57
+ "api_key": API_KEY, "base_url": LLM_BASE_URL})
58
 
59
  if not UserService.save(**user_info):
60
  print("\033[93m【ERROR】\033[0mcan't init admin.")
 
283
  pass
284
 
285
  """
 
286
  drop table llm;
287
  drop table llm_factories;
 
 
288
  update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
289
  alter table knowledgebase modify avatar longtext;
290
  alter table user modify avatar longtext;
api/settings.py CHANGED
@@ -91,6 +91,8 @@ default_llm = {
91
  }
92
  LLM = get_base_config("user_default_llm", {})
93
  LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen")
 
 
94
  if LLM_FACTORY not in default_llm:
95
  print(
96
  "\33[91m【ERROR】\33[0m:",
 
91
  }
92
  LLM = get_base_config("user_default_llm", {})
93
  LLM_FACTORY = LLM.get("factory", "Tongyi-Qianwen")
94
+ LLM_BASE_URL = LLM.get("base_url")
95
+
96
  if LLM_FACTORY not in default_llm:
97
  print(
98
  "\33[91m【ERROR】\33[0m:",
docker/docker-compose-CN.yml CHANGED
@@ -1,99 +1,12 @@
1
  version: '2.2'
2
- services:
3
- es01:
4
- container_name: ragflow-es-01
5
- image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
6
- volumes:
7
- - esdata01:/usr/share/elasticsearch/data
8
- ports:
9
- - ${ES_PORT}:9200
10
- environment:
11
- - node.name=es01
12
- - cluster.name=${CLUSTER_NAME}
13
- - cluster.initial_master_nodes=es01
14
- - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
15
- - bootstrap.memory_lock=false
16
- - xpack.security.enabled=false
17
- - TZ=${TIMEZONE}
18
- mem_limit: ${MEM_LIMIT}
19
- ulimits:
20
- memlock:
21
- soft: -1
22
- hard: -1
23
- healthcheck:
24
- test: ["CMD-SHELL", "curl http://localhost:9200"]
25
- interval: 10s
26
- timeout: 10s
27
- retries: 120
28
- networks:
29
- - ragflow
30
- restart: always
31
-
32
- kibana:
33
- depends_on:
34
- es01:
35
- condition: service_healthy
36
- image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
37
- container_name: ragflow-kibana
38
- volumes:
39
- - kibanadata:/usr/share/kibana/data
40
- ports:
41
- - ${KIBANA_PORT}:5601
42
- environment:
43
- - SERVERNAME=kibana
44
- - ELASTICSEARCH_HOSTS=http://es01:9200
45
- - TZ=${TIMEZONE}
46
- mem_limit: ${MEM_LIMIT}
47
- networks:
48
- - ragflow
49
-
50
- mysql:
51
- image: mysql:5.7.18
52
- container_name: ragflow-mysql
53
- environment:
54
- - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
55
- - TZ=${TIMEZONE}
56
- command:
57
- --max_connections=1000
58
- --character-set-server=utf8mb4
59
- --collation-server=utf8mb4_general_ci
60
- --default-authentication-plugin=mysql_native_password
61
- --tls_version="TLSv1.2,TLSv1.3"
62
- --init-file /data/application/init.sql
63
- ports:
64
- - ${MYSQL_PORT}:3306
65
- volumes:
66
- - mysql_data:/var/lib/mysql
67
- - ./init.sql:/data/application/init.sql
68
- networks:
69
- - ragflow
70
- healthcheck:
71
- test: ["CMD", "mysqladmin" ,"ping", "-uroot", "-p${MYSQL_PASSWORD}"]
72
- interval: 10s
73
- timeout: 10s
74
- retries: 3
75
- restart: always
76
-
77
 
78
- minio:
79
- image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
80
- container_name: ragflow-minio
81
- command: server --console-address ":9001" /data
82
- ports:
83
- - 9000:9000
84
- - 9001:9001
85
- environment:
86
- - MINIO_ROOT_USER=${MINIO_USER}
87
- - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD}
88
- - TZ=${TIMEZONE}
89
- volumes:
90
- - minio_data:/data
91
- networks:
92
- - ragflow
93
- restart: always
94
 
 
 
 
95
 
96
- ragflow:
 
97
  depends_on:
98
  mysql:
99
  condition: service_healthy
@@ -116,18 +29,3 @@ services:
116
  networks:
117
  - ragflow
118
  restart: always
119
-
120
-
121
- volumes:
122
- esdata01:
123
- driver: local
124
- kibanadata:
125
- driver: local
126
- mysql_data:
127
- driver: local
128
- minio_data:
129
- driver: local
130
-
131
- networks:
132
- ragflow:
133
- driver: bridge
 
1
  version: '2.2'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
+ include:
5
+ - path: ./docker-compose-base.yml
6
+ env_file: ./.env
7
 
8
+ services:
9
+ ragflow:
10
  depends_on:
11
  mysql:
12
  condition: service_healthy
 
29
  networks:
30
  - ragflow
31
  restart: always
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docker/docker-compose-base.yml ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '2.2'
2
+
3
+ services:
4
+ es01:
5
+ container_name: ragflow-es-01
6
+ image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
7
+ volumes:
8
+ - esdata01:/usr/share/elasticsearch/data
9
+ ports:
10
+ - ${ES_PORT}:9200
11
+ environment:
12
+ - node.name=es01
13
+ - cluster.name=${CLUSTER_NAME}
14
+ - cluster.initial_master_nodes=es01
15
+ - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
16
+ - bootstrap.memory_lock=false
17
+ - xpack.security.enabled=false
18
+ - cluster.max_shards_per_node=4096
19
+ - TZ=${TIMEZONE}
20
+ mem_limit: ${MEM_LIMIT}
21
+ ulimits:
22
+ memlock:
23
+ soft: -1
24
+ hard: -1
25
+ healthcheck:
26
+ test: ["CMD-SHELL", "curl http://localhost:9200"]
27
+ interval: 10s
28
+ timeout: 10s
29
+ retries: 120
30
+ networks:
31
+ - ragflow
32
+ restart: always
33
+
34
+ kibana:
35
+ depends_on:
36
+ es01:
37
+ condition: service_healthy
38
+ image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
39
+ container_name: ragflow-kibana
40
+ volumes:
41
+ - kibanadata:/usr/share/kibana/data
42
+ ports:
43
+ - ${KIBANA_PORT}:5601
44
+ environment:
45
+ - SERVERNAME=kibana
46
+ - ELASTICSEARCH_HOSTS=http://es01:9200
47
+ - TZ=${TIMEZONE}
48
+ mem_limit: ${MEM_LIMIT}
49
+ networks:
50
+ - ragflow
51
+
52
+ mysql:
53
+ image: mysql:5.7.18
54
+ container_name: ragflow-mysql
55
+ environment:
56
+ - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
57
+ - TZ=${TIMEZONE}
58
+ command:
59
+ --max_connections=1000
60
+ --character-set-server=utf8mb4
61
+ --collation-server=utf8mb4_general_ci
62
+ --default-authentication-plugin=mysql_native_password
63
+ --tls_version="TLSv1.2,TLSv1.3"
64
+ --init-file /data/application/init.sql
65
+ ports:
66
+ - ${MYSQL_PORT}:3306
67
+ volumes:
68
+ - mysql_data:/var/lib/mysql
69
+ - ./init.sql:/data/application/init.sql
70
+ networks:
71
+ - ragflow
72
+ healthcheck:
73
+ test: ["CMD", "mysqladmin" ,"ping", "-uroot", "-p${MYSQL_PASSWORD}"]
74
+ interval: 10s
75
+ timeout: 10s
76
+ retries: 3
77
+ restart: always
78
+
79
+
80
+ minio:
81
+ image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
82
+ container_name: ragflow-minio
83
+ command: server --console-address ":9001" /data
84
+ ports:
85
+ - 9000:9000
86
+ - 9001:9001
87
+ environment:
88
+ - MINIO_ROOT_USER=${MINIO_USER}
89
+ - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD}
90
+ - TZ=${TIMEZONE}
91
+ volumes:
92
+ - minio_data:/data
93
+ networks:
94
+ - ragflow
95
+ restart: always
96
+
97
+
98
+ volumes:
99
+ esdata01:
100
+ driver: local
101
+ kibanadata:
102
+ driver: local
103
+ mysql_data:
104
+ driver: local
105
+ minio_data:
106
+ driver: local
107
+
108
+ networks:
109
+ ragflow:
110
+ driver: bridge
docker/docker-compose.yml CHANGED
@@ -1,98 +1,10 @@
1
  version: '2.2'
2
- services:
3
- es01:
4
- container_name: ragflow-es-01
5
- image: docker.elastic.co/elasticsearch/elasticsearch:${STACK_VERSION}
6
- volumes:
7
- - esdata01:/usr/share/elasticsearch/data
8
- ports:
9
- - ${ES_PORT}:9200
10
- environment:
11
- - node.name=es01
12
- - cluster.name=${CLUSTER_NAME}
13
- - cluster.initial_master_nodes=es01
14
- - ELASTIC_PASSWORD=${ELASTIC_PASSWORD}
15
- - bootstrap.memory_lock=false
16
- - xpack.security.enabled=false
17
- - TZ=${TIMEZONE}
18
- mem_limit: ${MEM_LIMIT}
19
- ulimits:
20
- memlock:
21
- soft: -1
22
- hard: -1
23
- healthcheck:
24
- test: ["CMD-SHELL", "curl http://localhost:9200"]
25
- interval: 10s
26
- timeout: 10s
27
- retries: 120
28
- networks:
29
- - ragflow
30
- restart: always
31
-
32
- kibana:
33
- depends_on:
34
- es01:
35
- condition: service_healthy
36
- image: docker.elastic.co/kibana/kibana:${STACK_VERSION}
37
- container_name: ragflow-kibana
38
- volumes:
39
- - kibanadata:/usr/share/kibana/data
40
- ports:
41
- - ${KIBANA_PORT}:5601
42
- environment:
43
- - SERVERNAME=kibana
44
- - ELASTICSEARCH_HOSTS=http://es01:9200
45
- - TZ=${TIMEZONE}
46
- mem_limit: ${MEM_LIMIT}
47
- networks:
48
- - ragflow
49
-
50
- mysql:
51
- image: mysql:5.7.18
52
- container_name: ragflow-mysql
53
- environment:
54
- - MYSQL_ROOT_PASSWORD=${MYSQL_PASSWORD}
55
- - TZ=${TIMEZONE}
56
- command:
57
- --max_connections=1000
58
- --character-set-server=utf8mb4
59
- --collation-server=utf8mb4_general_ci
60
- --default-authentication-plugin=mysql_native_password
61
- --tls_version="TLSv1.2,TLSv1.3"
62
- --init-file /data/application/init.sql
63
- ports:
64
- - ${MYSQL_PORT}:3306
65
- volumes:
66
- - mysql_data:/var/lib/mysql
67
- - ./init.sql:/data/application/init.sql
68
- networks:
69
- - ragflow
70
- healthcheck:
71
- test: ["CMD", "mysqladmin" ,"ping", "-uroot", "-p${MYSQL_PASSWORD}"]
72
- interval: 10s
73
- timeout: 10s
74
- retries: 3
75
- restart: always
76
-
77
-
78
- minio:
79
- image: quay.io/minio/minio:RELEASE.2023-12-20T01-00-02Z
80
- container_name: ragflow-minio
81
- command: server --console-address ":9001" /data
82
- ports:
83
- - 9000:9000
84
- - 9001:9001
85
- environment:
86
- - MINIO_ROOT_USER=${MINIO_USER}
87
- - MINIO_ROOT_PASSWORD=${MINIO_PASSWORD}
88
- - TZ=${TIMEZONE}
89
- volumes:
90
- - minio_data:/data
91
- networks:
92
- - ragflow
93
- restart: always
94
 
 
 
 
95
 
 
96
  ragflow:
97
  depends_on:
98
  mysql:
@@ -107,6 +19,7 @@ services:
107
  - 443:443
108
  volumes:
109
  - ./service_conf.yaml:/ragflow/conf/service_conf.yaml
 
110
  - ./ragflow-logs:/ragflow/logs
111
  - ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf
112
  - ./nginx/proxy.conf:/etc/nginx/proxy.conf
@@ -116,18 +29,3 @@ services:
116
  networks:
117
  - ragflow
118
  restart: always
119
-
120
-
121
- volumes:
122
- esdata01:
123
- driver: local
124
- kibanadata:
125
- driver: local
126
- mysql_data:
127
- driver: local
128
- minio_data:
129
- driver: local
130
-
131
- networks:
132
- ragflow:
133
- driver: bridge
 
1
  version: '2.2'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ include:
4
+ - path: ./docker-compose-base.yml
5
+ env_file: ./.env
6
 
7
+ services:
8
  ragflow:
9
  depends_on:
10
  mysql:
 
19
  - 443:443
20
  volumes:
21
  - ./service_conf.yaml:/ragflow/conf/service_conf.yaml
22
+ - ./entrypoint.sh:/ragflow/entrypoint.sh
23
  - ./ragflow-logs:/ragflow/logs
24
  - ./nginx/ragflow.conf:/etc/nginx/conf.d/ragflow.conf
25
  - ./nginx/proxy.conf:/etc/nginx/proxy.conf
 
29
  networks:
30
  - ragflow
31
  restart: always
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docker/entrypoint.sh CHANGED
@@ -23,7 +23,7 @@ function watch_broker(){
23
  }
24
 
25
  function task_bro(){
26
- sleep 60;
27
  watch_broker;
28
  }
29
 
 
23
  }
24
 
25
  function task_bro(){
26
+ sleep 160;
27
  watch_broker;
28
  }
29
 
docker/service_conf.yaml CHANGED
@@ -18,6 +18,7 @@ es:
18
  user_default_llm:
19
  factory: 'Tongyi-Qianwen'
20
  api_key: 'sk-xxxxxxxxxxxxx'
 
21
  oauth:
22
  github:
23
  client_id: xxxxxxxxxxxxxxxxxxxxxxxxx
 
18
  user_default_llm:
19
  factory: 'Tongyi-Qianwen'
20
  api_key: 'sk-xxxxxxxxxxxxx'
21
+ base_url: ''
22
  oauth:
23
  github:
24
  client_id: xxxxxxxxxxxxxxxxxxxxxxxxx
rag/app/naive.py CHANGED
@@ -10,14 +10,59 @@
10
  # See the License for the specific language governing permissions and
11
  # limitations under the License.
12
  #
13
- import copy
 
14
  import re
15
  from deepdoc.parser.pdf_parser import PlainParser
16
  from rag.app import laws
17
  from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
18
- from deepdoc.parser import PdfParser, ExcelParser
19
  from rag.settings import cron_logger
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
  class Pdf(PdfParser):
23
  def __call__(self, filename, binary=None, from_page=0,
@@ -75,8 +120,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
75
  sections = []
76
  if re.search(r"\.docx?$", filename, re.IGNORECASE):
77
  callback(0.1, "Start to parse.")
78
- for txt in laws.Docx()(filename, binary):
79
- sections.append((txt, ""))
80
  callback(0.8, "Finish parsing.")
81
 
82
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
 
10
  # See the License for the specific language governing permissions and
11
  # limitations under the License.
12
  #
13
+ from io import BytesIO
14
+ from docx import Document
15
  import re
16
  from deepdoc.parser.pdf_parser import PlainParser
17
  from rag.app import laws
18
  from rag.nlp import huqie, is_english, tokenize, naive_merge, tokenize_table, add_positions, tokenize_chunks
19
+ from deepdoc.parser import PdfParser, ExcelParser, DocxParser
20
  from rag.settings import cron_logger
21
 
22
+ class Docx(DocxParser):
23
+ def __init__(self):
24
+ pass
25
+
26
+ def __clean(self, line):
27
+ line = re.sub(r"\u3000", " ", line).strip()
28
+ return line
29
+
30
+ def __call__(self, filename, binary=None, from_page=0, to_page=100000):
31
+ self.doc = Document(
32
+ filename) if not binary else Document(BytesIO(binary))
33
+ pn = 0
34
+ lines = []
35
+ for p in self.doc.paragraphs:
36
+ if pn > to_page:
37
+ break
38
+ if from_page <= pn < to_page and p.text.strip():
39
+ lines.append(self.__clean(p.text))
40
+ for run in p.runs:
41
+ if 'lastRenderedPageBreak' in run._element.xml:
42
+ pn += 1
43
+ continue
44
+ if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
45
+ pn += 1
46
+ tbls = []
47
+ for tb in self.doc.tables:
48
+ html= "<table>"
49
+ for r in tb.rows:
50
+ html += "<tr>"
51
+ i = 0
52
+ while i < len(r.cells):
53
+ span = 1
54
+ c = r.cells[i]
55
+ for j in range(i+1, len(r.cells)):
56
+ if c.text == r.cells[j].text:
57
+ span += 1
58
+ i = j
59
+ i += 1
60
+ html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
61
+ html += "</tr>"
62
+ html += "</table>"
63
+ tbls.append(((None, html), ""))
64
+ return [(l, "") for l in lines if l], tbls
65
+
66
 
67
  class Pdf(PdfParser):
68
  def __call__(self, filename, binary=None, from_page=0,
 
120
  sections = []
121
  if re.search(r"\.docx?$", filename, re.IGNORECASE):
122
  callback(0.1, "Start to parse.")
123
+ sections, tbls = Docx()(filename, binary)
124
+ res = tokenize_table(tbls, doc, eng)
125
  callback(0.8, "Finish parsing.")
126
 
127
  elif re.search(r"\.pdf$", filename, re.IGNORECASE):
rag/app/table.py CHANGED
@@ -223,8 +223,8 @@ def chunk(filename, binary=None, from_page=0, to_page=10000000000,
223
  continue
224
  if not str(row[clmns[j]]):
225
  continue
226
- if pd.isna(row[clmns[j]]):
227
- continue
228
  fld = clmns_map[j][0]
229
  d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
230
  row[clmns[j]])
 
223
  continue
224
  if not str(row[clmns[j]]):
225
  continue
226
+ #if pd.isna(row[clmns[j]]):
227
+ # continue
228
  fld = clmns_map[j][0]
229
  d[fld] = row[clmns[j]] if clmn_tys[j] != "text" else huqie.qie(
230
  row[clmns[j]])
rag/llm/chat_model.py CHANGED
@@ -170,3 +170,4 @@ class LocalLLM(Base):
170
  return ans, num_tokens_from_string(ans)
171
  except Exception as e:
172
  return "**ERROR**: " + str(e), 0
 
 
170
  return ans, num_tokens_from_string(ans)
171
  except Exception as e:
172
  return "**ERROR**: " + str(e), 0
173
+
rag/nlp/__init__.py CHANGED
@@ -68,6 +68,7 @@ def bullets_category(sections):
68
 
69
  def is_english(texts):
70
  eng = 0
 
71
  for t in texts:
72
  if re.match(r"[a-zA-Z]{2,}", t.strip()):
73
  eng += 1
@@ -112,8 +113,8 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
112
  d = copy.deepcopy(doc)
113
  tokenize(d, rows, eng)
114
  d["content_with_weight"] = rows
115
- d["image"] = img
116
- add_positions(d, poss)
117
  res.append(d)
118
  continue
119
  de = "; " if eng else "; "
 
68
 
69
  def is_english(texts):
70
  eng = 0
71
+ if not texts: return False
72
  for t in texts:
73
  if re.match(r"[a-zA-Z]{2,}", t.strip()):
74
  eng += 1
 
113
  d = copy.deepcopy(doc)
114
  tokenize(d, rows, eng)
115
  d["content_with_weight"] = rows
116
+ if img: d["image"] = img
117
+ if poss: add_positions(d, poss)
118
  res.append(d)
119
  continue
120
  de = "; " if eng else "; "
rag/nlp/search.py CHANGED
@@ -46,7 +46,7 @@ class Dealer:
46
  "k": topk,
47
  "similarity": sim,
48
  "num_candidates": topk * 2,
49
- "query_vector": qv
50
  }
51
 
52
  def search(self, req, idxnm, emb_mdl=None):
 
46
  "k": topk,
47
  "similarity": sim,
48
  "num_candidates": topk * 2,
49
+ "query_vector": list(qv)
50
  }
51
 
52
  def search(self, req, idxnm, emb_mdl=None):