import gradio as gr
import os
import uuid
import subprocess
from pydub import AudioSegment


def convert_wav_for_browser(input_path, output_path):
    """Re-encode a WAV file as 44.1 kHz, 16-bit, stereo so browsers can play it."""
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(44100).set_sample_width(2).set_channels(2)
    audio.export(output_path, format="wav")


# Handle image upload and transcription
def process_image(image):
    if image is None:
        return "Please upload an image first.", None

    # Save the uploaded image to a temp folder
    temp_folder = "./temp_uploads"
    os.makedirs(temp_folder, exist_ok=True)

    # Generate a unique filename
    image_filename = f"{uuid.uuid4().hex}.jpg"
    image_path = os.path.join(temp_folder, image_filename)

    # Save the image (convert to RGB so PNG uploads with alpha can be written as JPEG)
    image.convert("RGB").save(image_path)

    # Run the transcription script with subprocess
    try:
        subprocess.run(
            ["python", "page_transcription.py", f"-img={image_path}"],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        return f"Error during transcription: {e}", None

    # Check whether output.wav was generated
    output_audio_path = "output.wav"
    if os.path.exists(output_audio_path):
        # Convert to a browser-safe format
        safe_audio_path = "output_safe.wav"
        convert_wav_for_browser(output_audio_path, safe_audio_path)
        return "Transcription complete. Playing audio...", safe_audio_path
    else:
        return "Failed to generate audio file.", None


# Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil", label="Upload a Manga Page"),
    outputs=[
        gr.Textbox(label="Status"),
        gr.Audio(label="Generated Audio", type="filepath"),
    ],
    title="Manga Page Audio Transcription",
    description="Upload a manga page image, and this tool will transcribe it and play the audio using a backend Python script.",
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
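
# Note: this app shells out to page_transcription.py with an -img=<path> flag and
# expects that script to write output.wav to the current working directory. As a
# rough, assumed sketch (the real script's internals are not shown here), its
# entry point might parse the flag like this:
#
#     import argparse
#     parser = argparse.ArgumentParser(description="Transcribe a manga page to audio.")
#     parser.add_argument("-img", required=True, help="Path to the manga page image")
#     args = parser.parse_args()
#     # ... run OCR / speech synthesis on args.img and write the result to ./output.wav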