Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
from fastapi import FastAPI, HTTPException
|
2 |
from fastapi.responses import StreamingResponse
|
3 |
import onnxruntime as ort
|
4 |
import numpy as np
|
@@ -7,7 +7,14 @@ import io
|
|
7 |
import wave
|
8 |
import uvicorn
|
9 |
from fastapi.middleware.cors import CORSMiddleware
|
10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
|
12 |
# Set up CORS
|
13 |
app.add_middleware(
|
@@ -18,21 +25,51 @@ app.add_middleware(
|
|
18 |
allow_headers=["*"], # Allows all headers
|
19 |
)
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
-
#
|
23 |
-
|
|
|
24 |
|
25 |
-
# Load
|
26 |
-
|
|
|
|
|
27 |
|
28 |
CHUNK_SIZE = 4000 # Number of samples per chunk
|
29 |
|
30 |
-
def text_to_speech_generator(text):
|
31 |
-
inputs =
|
32 |
input_ids = inputs.input_ids.astype(np.int64)
|
33 |
-
onnx_output =
|
34 |
waveform = onnx_output[0][0]
|
35 |
-
|
36 |
for i in range(0, len(waveform), CHUNK_SIZE):
|
37 |
yield waveform[i:i+CHUNK_SIZE]
|
38 |
|
@@ -45,8 +82,32 @@ def create_wav_header(sample_rate, bits_per_sample, channels):
|
|
45 |
wav_file.writeframes(b'') # Write empty frames to create header
|
46 |
return byte_io.getvalue()
|
47 |
|
48 |
-
@app.get("/tts")
|
49 |
-
async def tts_endpoint(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
50 |
try:
|
51 |
sample_rate = 16000
|
52 |
bits_per_sample = 16
|
@@ -55,15 +116,25 @@ async def tts_endpoint(text: str):
|
|
55 |
async def audio_stream_generator():
|
56 |
# First, yield the WAV header
|
57 |
yield create_wav_header(sample_rate, bits_per_sample, channels)
|
58 |
-
|
59 |
# Then stream the audio data
|
60 |
-
for chunk in text_to_speech_generator(text):
|
61 |
yield (chunk * 32767).astype(np.int16).tobytes()
|
62 |
|
63 |
return StreamingResponse(audio_stream_generator(), media_type="audio/wav")
|
64 |
except Exception as e:
|
65 |
raise HTTPException(status_code=500, detail=str(e))
|
66 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
if __name__ == "__main__":
|
68 |
host = "0.0.0.0"
|
69 |
port = 8000
|
|
|
1 |
+
from fastapi import FastAPI, HTTPException, Query
|
2 |
from fastapi.responses import StreamingResponse
|
3 |
import onnxruntime as ort
|
4 |
import numpy as np
|
|
|
7 |
import wave
|
8 |
import uvicorn
|
9 |
from fastapi.middleware.cors import CORSMiddleware
|
10 |
+
from typing import Dict, List
|
11 |
+
from enum import Enum
|
12 |
+
|
13 |
+
# FastAPI application object.  Title/description/version are surfaced in the
# generated OpenAPI (Swagger) documentation.
_API_TITLE = "Multilingual Text-to-Speech API"
_API_DESCRIPTION = "This API provides text-to-speech conversion for multiple Indian languages using ONNX models."

app = FastAPI(title=_API_TITLE, description=_API_DESCRIPTION, version="1.0.0")
|
18 |
|
19 |
# Set up CORS
|
20 |
app.add_middleware(
|
|
|
25 |
allow_headers=["*"], # Allows all headers
|
26 |
)
|
27 |
|
28 |
+
# Supported languages: (code, display name, ONNX model file).  Kept as a flat
# table and expanded into the lookup dict below; note "urd" uses a
# script-specific model file.
_LANGUAGE_TABLE = [
    ("hin", "Hindi", "mms-tts-hin.onnx"),
    ("ben", "Bengali", "mms-tts-ben.onnx"),
    ("mar", "Marathi", "mms-tts-mar.onnx"),
    ("tel", "Telugu", "mms-tts-tel.onnx"),
    ("tam", "Tamil", "mms-tts-tam.onnx"),
    ("guj", "Gujarati", "mms-tts-guj.onnx"),
    ("urd", "Urdu", "mms-tts-urd-script_arabic.onnx"),
    ("kan", "Kannada", "mms-tts-kan.onnx"),
    ("mal", "Malayalam", "mms-tts-mal.onnx"),
    ("pan", "Punjabi", "mms-tts-pan.onnx"),
    ("nep", "Nepali", "mms-tts-nep.onnx"),
]

SUPPORTED_LANGUAGES: Dict[str, Dict[str, str]] = {
    code: {"name": name, "file": model_file}
    for code, name, model_file in _LANGUAGE_TABLE
}
|
42 |
+
|
43 |
+
# Language codes accepted by the endpoints.  Built with the functional Enum
# API; `type=str` produces the same str-mixin class as
# `class LanguageCode(str, Enum)`, which FastAPI relies on to validate and
# serialize the query parameter.
LanguageCode = Enum(
    "LanguageCode",
    [
        ("hindi", "hin"),
        ("bengali", "ben"),
        ("marathi", "mar"),
        ("telugu", "tel"),
        ("tamil", "tam"),
        ("gujarati", "guj"),
        ("urdu", "urd"),
        ("kannada", "kan"),
        ("malayalam", "mal"),
        ("punjabi", "pan"),
        ("nepali", "nep"),
    ],
    type=str,
)
|
56 |
|
57 |
+
# Per-language inference sessions and tokenizers, loaded eagerly at import
# time so the first request does not pay the model-load cost.
sessions: Dict[str, ort.InferenceSession] = {}
tokenizers: Dict[str, AutoTokenizer] = {}

for code, info in SUPPORTED_LANGUAGES.items():
    sessions[code] = ort.InferenceSession(info["file"], providers=['CPUExecutionProvider'])
    # Derive the Hugging Face repo from the model file name rather than from
    # the bare language code: for "urd" the file is
    # "mms-tts-urd-script_arabic.onnx", so f"facebook/mms-tts-{code}" would
    # point at a script-less repo that does not match the loaded model.
    # NOTE(review): assumes every repo is named after its .onnx file stem —
    # confirm against the model hub.
    repo = info["file"].rsplit(".onnx", 1)[0]
    tokenizers[code] = AutoTokenizer.from_pretrained(f"facebook/{repo}")
|
65 |
|
66 |
CHUNK_SIZE = 4000  # Number of samples per chunk

def text_to_speech_generator(text: str, lang: str):
    """Synthesize `text` and yield the waveform in CHUNK_SIZE-sample slices.

    `lang` must be a key of the module-level `sessions`/`tokenizers`
    registries (a supported language code).
    """
    encoded = tokenizers[lang](text, return_tensors="np")
    token_ids = encoded.input_ids.astype(np.int64)
    # run(None, ...) returns every model output; output 0 holds a batch of
    # waveforms and a single item was synthesized, hence [0][0].
    waveform = sessions[lang].run(None, {"input_ids": token_ids})[0][0]
    offset = 0
    while offset < len(waveform):
        yield waveform[offset:offset + CHUNK_SIZE]
        offset += CHUNK_SIZE
|
75 |
|
|
|
82 |
wav_file.writeframes(b'') # Write empty frames to create header
|
83 |
return byte_io.getvalue()
|
84 |
|
85 |
+
@app.get("/tts", summary="Convert text to speech", response_description="Audio stream in WAV format")
|
86 |
+
async def tts_endpoint(
|
87 |
+
text: str = Query(..., description="The text to convert to speech"),
|
88 |
+
lang: LanguageCode = Query(..., description="The language code for text-to-speech conversion")
|
89 |
+
):
|
90 |
+
"""
|
91 |
+
Convert the given text to speech in the specified language.
|
92 |
+
|
93 |
+
- **text**: The input text to be converted to speech
|
94 |
+
- **lang**: The language code for the input text and desired speech output
|
95 |
+
|
96 |
+
Available language codes:
|
97 |
+
- hin: Hindi
|
98 |
+
- ben: Bengali
|
99 |
+
- mar: Marathi
|
100 |
+
- tel: Telugu
|
101 |
+
- tam: Tamil
|
102 |
+
- guj: Gujarati
|
103 |
+
- urd: Urdu
|
104 |
+
- kan: Kannada
|
105 |
+
- mal: Malayalam
|
106 |
+
- pan: Punjabi
|
107 |
+
- nep: Nepali
|
108 |
+
|
109 |
+
Returns a streaming response with the audio data in WAV format.
|
110 |
+
"""
|
111 |
try:
|
112 |
sample_rate = 16000
|
113 |
bits_per_sample = 16
|
|
|
116 |
async def audio_stream_generator():
|
117 |
# First, yield the WAV header
|
118 |
yield create_wav_header(sample_rate, bits_per_sample, channels)
|
|
|
119 |
# Then stream the audio data
|
120 |
+
for chunk in text_to_speech_generator(text, lang):
|
121 |
yield (chunk * 32767).astype(np.int16).tobytes()
|
122 |
|
123 |
return StreamingResponse(audio_stream_generator(), media_type="audio/wav")
|
124 |
except Exception as e:
|
125 |
raise HTTPException(status_code=500, detail=str(e))
|
126 |
|
127 |
+
@app.get("/languages", summary="Get supported languages", response_model=List[Dict[str, str]])
async def get_languages():
    """
    List the supported languages with their codes and names.

    Returns one entry per language, each containing:
    - **code**: The language code
    - **name**: The full name of the language
    """
    return [
        {"code": code, "name": details["name"]}
        for code, details in SUPPORTED_LANGUAGES.items()
    ]
|
137 |
+
|
138 |
if __name__ == "__main__":
|
139 |
host = "0.0.0.0"
|
140 |
port = 8000
|