H committed on
Commit 0c9dbca · 1 Parent(s): 85d9ac3

Add ParserType Audio (#1637)


### What problem does this PR solve?

#1514

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

api/apps/api_app.py CHANGED
@@ -335,6 +335,8 @@ def upload():
             doc["parser_id"] = request.form.get("parser_id").strip()
             if doc["type"] == FileType.VISUAL:
                 doc["parser_id"] = ParserType.PICTURE.value
+            if doc["type"] == FileType.AURAL:
+                doc["parser_id"] = ParserType.AUDIO.value
             if re.search(r"\.(ppt|pptx|pages)$", filename):
                 doc["parser_id"] = ParserType.PRESENTATION.value
 
@@ -581,4 +583,4 @@ def completion_faq():
         return response
 
     except Exception as e:
-        return server_error_response(e)
+        return server_error_response(e)
api/apps/dataset_api.py CHANGED
@@ -39,7 +39,7 @@ from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_error_response
 from api.utils.api_utils import construct_result, validate_request
 from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture
+from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio
 from rag.nlp import search
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils.minio_conn import MINIO
 
@@ -377,6 +377,8 @@ def upload_documents(dataset_id):
         }
         if doc["type"] == FileType.VISUAL:
             doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
         if re.search(r"\.(ppt|pptx|pages)$", filename):
             doc["parser_id"] = ParserType.PRESENTATION.value
         DocumentService.insert(doc)
 
@@ -648,6 +650,8 @@ def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id):
             resume.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case "table":
             table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
+        case "audio":
+            audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id))
         case _:
             return False
api/apps/document_app.py CHANGED
@@ -105,6 +105,8 @@ def upload():
         }
         if doc["type"] == FileType.VISUAL:
             doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
         if re.search(r"\.(ppt|pptx|pages)$", filename):
             doc["parser_id"] = ParserType.PRESENTATION.value
         DocumentService.insert(doc)
 
@@ -171,6 +173,8 @@ def web_crawl():
         }
         if doc["type"] == FileType.VISUAL:
             doc["parser_id"] = ParserType.PICTURE.value
+        if doc["type"] == FileType.AURAL:
+            doc["parser_id"] = ParserType.AUDIO.value
         if re.search(r"\.(ppt|pptx|pages)$", filename):
             doc["parser_id"] = ParserType.PRESENTATION.value
         DocumentService.insert(doc)
api/db/__init__.py CHANGED
@@ -84,6 +84,7 @@ class ParserType(StrEnum):
     NAIVE = "naive"
     PICTURE = "picture"
     ONE = "one"
+    AUDIO = "audio"
 
 
 class FileSource(StrEnum):
 
@@ -96,4 +97,4 @@ class CanvasType(StrEnum):
     ChatBot = "chatbot"
     DocBot = "docbot"
 
-KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
+KNOWLEDGEBASE_FOLDER_NAME=".knowledgebase"
api/db/init_data.py CHANGED
@@ -121,6 +121,8 @@ def init_llm_factory():
     LLMFactoriesService.filter_delete([LLMFactoriesService.model.name == "QAnything"])
     LLMService.filter_delete([LLMService.model.fid == "QAnything"])
     TenantLLMService.filter_update([TenantLLMService.model.llm_factory == "QAnything"], {"llm_factory": "Youdao"})
+    TenantService.filter_update([1 == 1], {
+        "parser_ids": "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio"})
     ## insert openai two embedding models to the current openai user.
     print("Start to insert 2 OpenAI embedding models...")
     tenant_ids = set([row["tenant_id"] for row in TenantLLMService.get_openai_models()])
 
@@ -143,7 +145,7 @@ def init_llm_factory():
     """
     drop table llm;
     drop table llm_factories;
-    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One';
+    update tenant set parser_ids='naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio';
     alter table knowledgebase modify avatar longtext;
     alter table user modify avatar longtext;
     alter table dialog modify icon longtext;
api/db/services/llm_service.py CHANGED
@@ -15,7 +15,7 @@
 #
 from api.db.services.user_service import TenantService
 from api.settings import database_logger
-from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel
+from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel
 from api.db import LLMType
 from api.db.db_models import DB, UserTenant
 from api.db.db_models import LLMFactories, LLM, TenantLLM
 
@@ -120,6 +120,14 @@ class TenantLLMService(CommonService):
             return ChatModel[model_config["llm_factory"]](
                 model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])
 
+        if llm_type == LLMType.SPEECH2TEXT:
+            if model_config["llm_factory"] not in Seq2txtModel:
+                return
+            return Seq2txtModel[model_config["llm_factory"]](
+                model_config["api_key"], model_config["llm_name"], lang,
+                base_url=model_config["api_base"]
+            )
+
     @classmethod
     @DB.connection_context()
     def increase_usage(cls, tenant_id, llm_type, used_tokens, llm_name=None):
 
@@ -207,6 +215,14 @@ class LLMBundle(object):
                 "Can't update token usage for {}/IMAGE2TEXT".format(self.tenant_id))
         return txt
 
+    def transcription(self, audio):
+        txt, used_tokens = self.mdl.transcription(audio)
+        if not TenantLLMService.increase_usage(
+                self.tenant_id, self.llm_type, used_tokens):
+            database_logger.error(
+                "Can't update token usage for {}/SEQUENCE2TXT".format(self.tenant_id))
+        return txt
+
     def chat(self, system, history, gen_conf):
         txt, used_tokens = self.mdl.chat(system, history, gen_conf)
         if not TenantLLMService.increase_usage(
api/settings.py CHANGED
@@ -131,7 +131,7 @@ IMAGE2TEXT_MDL = default_llm[LLM_FACTORY]["image2text_model"]
 API_KEY = LLM.get("api_key", "")
 PARSERS = LLM.get(
     "parsers",
-    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One")
+    "naive:General,qa:Q&A,resume:Resume,manual:Manual,table:Table,paper:Paper,book:Book,laws:Laws,presentation:Presentation,picture:Picture,one:One,audio:Audio")
 
 # distribution
 DEPENDENT_DISTRIBUTION = get_base_config("dependent_distribution", False)
rag/app/audio.py ADDED
@@ -0,0 +1,42 @@
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import io
+import re
+import numpy as np
+
+from api.db import LLMType
+from rag.nlp import rag_tokenizer
+from api.db.services.llm_service import LLMBundle
+from rag.nlp import tokenize
+
+
+def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
+    doc = {
+        "docnm_kwd": filename,
+        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
+    }
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+
+    # is it English
+    eng = lang.lower() == "english"  # is_english(sections)
+    try:
+        callback(0.1, "USE Sequence2Txt LLM to transcription the audio")
+        seq2txt_mdl = LLMBundle(tenant_id, LLMType.SPEECH2TEXT, lang=lang)
+        ans = seq2txt_mdl.transcription(binary)
+        callback(0.8, "Sequence2Txt LLM respond: %s ..." % ans[:32])
+        tokenize(doc, ans, eng)
+        return [doc]
+    except Exception as e:
+        callback(prog=-1, msg=str(e))
+
+    return []
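For illustration, a hedged sketch of driving the new `rag.app.audio` parser directly, mirroring how `doc_parse()` and the task executor dispatch it; the file path and tenant id are placeholders:

```python
# Hedged sketch, not part of the PR: calls rag.app.audio.chunk() by hand, the same
# way doc_parse() in api/apps/dataset_api.py and the task executor dispatch it.
from rag.app import audio


def progress(prog=None, msg=""):
    # Matches the callback shape used inside chunk():
    # callback(0.1, "...") and callback(prog=-1, msg=...)
    print(prog, msg)


with open("meeting.mp3", "rb") as f:                 # placeholder path
    chunks = audio.chunk("meeting.mp3", binary=f.read(),
                         tenant_id="<tenant-id>",    # placeholder tenant
                         lang="English", callback=progress)
# On success, `chunks` is a one-element list holding the tokenized transcript.
```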
rag/app/picture.py CHANGED
@@ -42,7 +42,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
     callback(0.4, "Use CV LLM to describe the picture.")
     cv_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, lang=lang)
     ans = cv_mdl.describe(binary)
-    callback(0.8, "CV LLM respoond: %s ..." % ans[:32])
+    callback(0.8, "CV LLM respond: %s ..." % ans[:32])
     txt += "\n" + ans
     tokenize(doc, txt, eng)
     return [doc]
rag/svr/task_executor.py CHANGED
@@ -45,7 +45,7 @@ from rag.nlp import search, rag_tokenizer
 from io import BytesIO
 import pandas as pd
 
-from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one
+from rag.app import laws, paper, presentation, manual, qa, table, book, resume, picture, naive, one, audio
 
 from api.db import LLMType, ParserType
 from api.db.services.document_service import DocumentService
 
@@ -68,6 +68,7 @@ FACTORY = {
     ParserType.RESUME.value: resume,
     ParserType.PICTURE.value: picture,
     ParserType.ONE.value: one,
+    ParserType.AUDIO.value: audio
 }
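As a sanity check on the dispatch table above, a hedged sketch of how a task whose `parser_id` is `"audio"` is now routed; `FACTORY` and `audio` are the names defined in `rag/svr/task_executor.py`:

```python
# Hedged sketch, not part of the PR: illustrates the dispatch inside
# rag/svr/task_executor.py after the FACTORY change above.
from api.db import ParserType
from rag.app import audio

chunker = FACTORY[ParserType.AUDIO.value]   # resolves to the rag.app.audio module
assert chunker is audio
# The executor then calls
# chunker.chunk(name, binary=blob, tenant_id=..., lang=..., callback=...)
```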