File size: 1,835 Bytes
3046549
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import os
import shutil
import subprocess
import sys
import uuid

import gradio as gr
from pydub import AudioSegment

def convert_wav_for_browser(input_path, output_path):
    """Re-encode an audio file as a 44.1 kHz, 2-byte-sample, 2-channel WAV.

    Args:
        input_path: Path of the audio file to read (any format pydub can load).
        output_path: Path the converted WAV file is written to.
    """
    audio = AudioSegment.from_file(input_path)
    audio = audio.set_frame_rate(44100).set_sample_width(2).set_channels(2)
    # export() hands back the open output file object; close it explicitly so
    # the handle is released right away instead of whenever it is collected.
    exported = audio.export(output_path, format="wav")
    exported.close()


# Function to handle image upload and transcription
def process_image(image):
    """Transcribe an uploaded page image to audio via ``page_transcription.py``.

    The image is written to ``./temp_uploads`` under a random name, the
    transcription script is run on it as a subprocess, and the ``output.wav``
    the script is expected to leave in the working directory is converted to
    ``output_safe.wav`` for playback.

    Args:
        image: The uploaded image object (must provide ``save(path)``), or
            ``None`` when nothing was uploaded.

    Returns:
        A ``(status_message, audio_path)`` tuple. ``audio_path`` is ``None``
        whenever no audio could be produced.
    """
    # Gradio passes None when the user submits without uploading anything.
    if image is None:
        return "No image provided. Please upload a manga page.", None

    # Save uploaded image to temp folder under a unique filename
    temp_folder = "./temp_uploads"
    os.makedirs(temp_folder, exist_ok=True)
    image_path = os.path.join(temp_folder, f"{uuid.uuid4().hex}.jpg")

    # Drop any output left by a previous run so a run that silently produces
    # nothing cannot be mistaken for success and replay old audio.
    # NOTE(review): the output name is fixed, so concurrent requests
    # would still overwrite each other's audio — confirm requests are serialized.
    output_audio_path = "output.wav"
    if os.path.exists(output_audio_path):
        os.remove(output_audio_path)

    try:
        image.save(image_path)
        # Run the script with the interpreter that is running this app so it
        # sees the same installed packages; fall back to PATH lookup if the
        # interpreter path is unavailable.
        subprocess.run(
            [sys.executable or "python", "page_transcription.py", f"-img={image_path}"],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        return f"Error during transcription: {e}", None
    finally:
        # The upload is only needed while the script runs; remove it so the
        # temp folder does not grow without bound. Cleanup is best-effort and
        # must not mask the real result or error.
        try:
            os.remove(image_path)
        except OSError:
            pass

    if not os.path.exists(output_audio_path):
        return "Failed to generate audio file.", None

    # Convert to browser-safe format
    safe_audio_path = "output_safe.wav"
    convert_wav_for_browser(output_audio_path, safe_audio_path)
    return "Transcription complete. Playing audio...", safe_audio_path

# Gradio interface
# Components are created in the order input, status, audio and then wired
# into a single Interface around process_image.
_page_input = gr.Image(type="pil", label="Upload a Manga Page")
_status_output = gr.Textbox(label="Status")
_audio_output = gr.Audio(label="Generated Audio", type="filepath")

iface = gr.Interface(
    fn=process_image,
    inputs=_page_input,
    # Order matches the (status, audio_path) tuple process_image returns.
    outputs=[_status_output, _audio_output],
    title="Manga Page Audio Transcription",
    description="Upload a manga image page, and this tool will transcribe and play the audio using a backend Python script.",
)

# Launch app
if __name__ == "__main__":
    # Start the Gradio app only when this file is executed as a script, so
    # importing the module does not launch it.
    iface.launch()