黄腾 aopstudio committed on
Commit
1c7b682
·
1 Parent(s): 6bcaa26

Add support for Tongyi-Qianwen (Qwen) TTS (#2311)

Browse files

### What problem does this PR solve?

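Add text-to-speech (TTS) support for Tongyi-Qianwen (Qwen): register the `sambert-zhide-v1` and `sambert-zhiru-v1` TTS models and add a `QwenTTS` model class.
Related: #1853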

### Type of change


- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <[email protected]>

conf/llm_factories.json CHANGED
@@ -104,18 +104,24 @@
104
  "max_tokens": 2048,
105
  "model_type": "embedding"
106
  },
 
 
 
 
 
 
 
 
 
 
 
 
107
  {
108
  "llm_name": "text-embedding-v3",
109
  "tags": "TEXT EMBEDDING,8K",
110
  "max_tokens": 8192,
111
  "model_type": "embedding"
112
  },
113
- {
114
- "llm_name": "paraformer-realtime-8k-v1",
115
- "tags": "SPEECH2TEXT",
116
- "max_tokens": 26214400,
117
- "model_type": "speech2text"
118
- },
119
  {
120
  "llm_name": "qwen-vl-max",
121
  "tags": "LLM,CHAT,IMAGE2TEXT",
 
104
  "max_tokens": 2048,
105
  "model_type": "embedding"
106
  },
107
+ {
108
+ "llm_name": "sambert-zhide-v1",
109
+ "tags": "TTS",
110
+ "max_tokens": 2048,
111
+ "model_type": "tts"
112
+ },
113
+ {
114
+ "llm_name": "sambert-zhiru-v1",
115
+ "tags": "TTS",
116
+ "max_tokens": 2048,
117
+ "model_type": "tts"
118
+ },
119
  {
120
  "llm_name": "text-embedding-v3",
121
  "tags": "TEXT EMBEDDING,8K",
122
  "max_tokens": 8192,
123
  "model_type": "embedding"
124
  },
 
 
 
 
 
 
125
  {
126
  "llm_name": "qwen-vl-max",
127
  "tags": "LLM,CHAT,IMAGE2TEXT",
rag/llm/__init__.py CHANGED
@@ -137,5 +137,6 @@ Seq2txtModel = {
137
  }
138
 
139
  TTSModel = {
140
- "Fish Audio": FishAudioTTS
 
141
  }
 
137
  }
138
 
139
  TTSModel = {
140
+ "Fish Audio": FishAudioTTS,
141
+ "Tongyi-Qianwen": QwenTTS
142
  }
rag/llm/tts_model.py CHANGED
@@ -22,7 +22,7 @@ from pydantic import BaseModel, conint
22
  from rag.utils import num_tokens_from_string
23
  import json
24
  import re
25
-
26
  class ServeReferenceAudio(BaseModel):
27
  audio: bytes
28
  text: str
@@ -96,3 +96,61 @@ class FishAudioTTS(Base):
96
 
97
  except httpx.HTTPStatusError as e:
98
  raise RuntimeError(f"**ERROR**: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  from rag.utils import num_tokens_from_string
23
  import json
24
  import re
25
+ import time
26
  class ServeReferenceAudio(BaseModel):
27
  audio: bytes
28
  text: str
 
96
 
97
  except httpx.HTTPStatusError as e:
98
  raise RuntimeError(f"**ERROR**: {e}")
99
+
100
+
101
+ class QwenTTS(Base):
102
+ def __init__(self, key, model_name, base_url=""):
103
+ import dashscope
104
+
105
+ self.model_name = model_name
106
+ dashscope.api_key = key
107
+
108
+ def tts(self, text):
109
+ from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
110
+ from dashscope.audio.tts import ResultCallback, SpeechSynthesizer, SpeechSynthesisResult
111
+ from collections import deque
112
+
113
+ class Callback(ResultCallback):
114
+ def __init__(self) -> None:
115
+ self.dque = deque()
116
+
117
+ def _run(self):
118
+ while True:
119
+ if not self.dque:
120
+ time.sleep(0)
121
+ continue
122
+ val = self.dque.popleft()
123
+ if val:
124
+ yield val
125
+ else:
126
+ break
127
+
128
+ def on_open(self):
129
+ pass
130
+
131
+ def on_complete(self):
132
+ self.dque.append(None)
133
+
134
+ def on_error(self, response: SpeechSynthesisResponse):
135
+ raise RuntimeError(str(response))
136
+
137
+ def on_close(self):
138
+ pass
139
+
140
+ def on_event(self, result: SpeechSynthesisResult):
141
+ if result.get_audio_frame() is not None:
142
+ self.dque.append(result.get_audio_frame())
143
+
144
+ text = self.normalize_text(text)
145
+ callback = Callback()
146
+ SpeechSynthesizer.call(model=self.model_name,
147
+ text=text,
148
+ callback=callback,
149
+ format="mp3")
150
+ try:
151
+ for data in callback._run():
152
+ yield data
153
+ yield num_tokens_from_string(text)
154
+
155
+ except Exception as e:
156
+ raise RuntimeError(f"**ERROR**: {e}")