Spaces:

walker11
/

rawistt

Sleeping

App Files Files Community

walker11 committed 7 days ago

Commit

4197dc6

verified ·

1 Parent(s): 2abb745

Upload 2 files

Browse files

Files changed (2) hide show

app.py +39 -3
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -8,6 +8,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.staticfiles import StaticFiles
 import uvicorn
 from pathlib import Path
 # Create FastAPI app
 app = FastAPI(title="Speech to Text Model")
@@ -28,8 +29,9 @@ recognizer = sr.Recognizer()
 @app.post("/generate-story")
 async def generate_story_api(file: UploadFile = File(...)):
     try:
-        # Save uploaded audio to a temp file
-        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
             tmp.write(await file.read())
             tmp_path = tmp.name
@@ -49,14 +51,48 @@ async def generate_story_api(file: UploadFile = File(...)):
             content={"error": str(e)}
         )
 # Function for processing audio (used by both FastAPI and Gradio)
 def transcribe_audio(audio_path):
     try:
         # Use speech_recognition to transcribe
-        with sr.AudioFile(audio_path) as source:
             audio_data = recognizer.record(source)
             # Try to use Google's speech recognition for Arabic
             text = recognizer.recognize_google(audio_data, language="ar-AR")
             return text
     except sr.UnknownValueError:
         return "لم يتم التعرف على الكلام"

 from fastapi.staticfiles import StaticFiles
 import uvicorn
 from pathlib import Path
+from pydub import AudioSegment
 # Create FastAPI app
 app = FastAPI(title="Speech to Text Model")
 @app.post("/generate-story")
 async def generate_story_api(file: UploadFile = File(...)):
     try:
+        # Save uploaded audio to a temp file with original extension
+        file_extension = os.path.splitext(file.filename)[1] if file.filename else ".wav"
+        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp:
             tmp.write(await file.read())
             tmp_path = tmp.name
             content={"error": str(e)}
         )
+# Convert any audio format to WAV
def convert_to_wav(audio_path):
    """Return the path of a WAV rendition of ``audio_path``.

    A path whose extension is already ``.wav`` (compared case-insensitively)
    is returned unchanged and the file is never opened. Any other file is
    decoded with ``AudioSegment.from_file`` and exported next to the original
    as ``<original name without extension>_converted.wav``. This function
    does not delete the converted file.

    Args:
        audio_path: Path of the audio file to convert.

    Returns:
        ``audio_path`` itself for WAV input, otherwise the path of the newly
        written WAV file.

    Raises:
        Exception: If any step fails. The message is prefixed with
            "Error converting audio format: " and the original error is
            chained as ``__cause__``.
    """
    try:
        base_path, file_extension = os.path.splitext(audio_path)

        # Already WAV: nothing to convert.
        if file_extension.lower() == ".wav":
            return audio_path

        wav_path = base_path + "_converted.wav"

        # The same decode/export path is used for every non-WAV extension,
        # so no per-extension branching is needed.
        audio = AudioSegment.from_file(audio_path)
        audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        # Keep the generic exception type and message callers already see,
        # but chain the cause so the real failure stays in the traceback.
        raise Exception(f"Error converting audio format: {str(e)}") from e
 # Function for processing audio (used by both FastAPI and Gradio)
 def transcribe_audio(audio_path):
     try:
+        # Convert audio to WAV format first
+        wav_path = convert_to_wav(audio_path)
         # Use speech_recognition to transcribe
+        with sr.AudioFile(wav_path) as source:
             audio_data = recognizer.record(source)
             # Try to use Google's speech recognition for Arabic
             text = recognizer.recognize_google(audio_data, language="ar-AR")
+            # Clean up converted file if it's different from the original
+            if wav_path != audio_path and os.path.exists(wav_path):
+                os.remove(wav_path)
             return text
     except sr.UnknownValueError:
         return "لم يتم التعرف على الكلام"

requirements.txt CHANGED Viewed

@@ -4,3 +4,4 @@ fastapi==0.103.1
 uvicorn==0.23.2
 python-multipart==0.0.6
 pydub==0.25.1

 uvicorn==0.23.2
 python-multipart==0.0.6
 pydub==0.25.1
+ffmpeg-python==0.2.0