qqwjq1981 committed (verified) · Commit cd89a99 · 1 Parent(s): c973cbb

Upload 4 files

Files changed (5):
  1. .gitattributes +1 -0
  2. NotoSansSC-Regular.ttf +3 -0
  3. README.md +6 -5
  4. app.py +435 -0
  5. requirements.txt +25 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ NotoSansSC-Regular.ttf filter=lfs diff=lfs merge=lfs -text
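The added line is the tracking pattern that `git lfs track "NotoSansSC-Regular.ttf"` writes to .gitattributes, so the font binary below is stored as a Git LFS pointer rather than a regular blob.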
NotoSansSC-Regular.ttf ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5cf8b2a0576d5680284ab03a7a8219499d59bbe981a79bb3dc0031f251c39736
size 10560616
README.md CHANGED
@@ -1,12 +1,13 @@
  ---
- title: Test Studio
- emoji: 💻
- colorFrom: blue
- colorTo: yellow
+ title: Studio
+ emoji: 🔥
+ colorFrom: pink
+ colorTo: red
  sdk: gradio
- sdk_version: 5.17.1
+ sdk_version: 5.12.0
  app_file: app.py
  pinned: false
+ short_description: Studio
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,435 @@
import numpy as np
import concurrent.futures
import gradio as gr
from datetime import datetime
import random
import moviepy
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from moviepy import (
    ImageClip,
    VideoFileClip,
    TextClip,
    CompositeVideoClip,
    CompositeAudioClip,
    AudioFileClip,
    concatenate_videoclips,
    concatenate_audioclips
)
from gtts import gTTS
import subprocess
import speech_recognition as sr
import json
from nltk.tokenize import sent_tokenize
import logging
from textblob import TextBlob
import whisper
import time
import sqlite3
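# Note: several of the imports above (ffmpeg_read, sent_tokenize, TextBlob,
# speech_recognition, datetime, random, subprocess, json, ImageClip,
# concatenate_videoclips) are not referenced later in this file.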

# Define the passcode
PASSCODE = "show_feedback_db"

css = """
/* Adjust row height */
.dataframe-container tr {
    height: 50px !important;
}

/* Ensure text wrapping and prevent overflow */
.dataframe-container td {
    white-space: normal !important;
    word-break: break-word !important;
}

/* Set column widths */
[data-testid="block-container"] .scrolling-dataframe th:nth-child(1),
[data-testid="block-container"] .scrolling-dataframe td:nth-child(1) {
    width: 6%; /* Start column */
}

[data-testid="block-container"] .scrolling-dataframe th:nth-child(2),
[data-testid="block-container"] .scrolling-dataframe td:nth-child(2) {
    width: 47%; /* Original text */
}

[data-testid="block-container"] .scrolling-dataframe th:nth-child(3),
[data-testid="block-container"] .scrolling-dataframe td:nth-child(3) {
    width: 47%; /* Translated text */
}

[data-testid="block-container"] .scrolling-dataframe th:nth-child(4),
[data-testid="block-container"] .scrolling-dataframe td:nth-child(4) {
    display: none !important;
}
"""

# Function to save feedback or provide access to the database file
def handle_feedback(feedback):
    feedback = feedback.strip()  # Clean up leading/trailing whitespace
    if not feedback:
        return "Feedback cannot be empty.", None

    if feedback == PASSCODE:
        # Provide access to the feedback.db file
        return "Access granted! Download the database file below.", "feedback.db"
    else:
        # Save feedback to the database
        with sqlite3.connect("feedback.db") as conn:
            cursor = conn.cursor()
            cursor.execute("CREATE TABLE IF NOT EXISTS studio_feedback (id INTEGER PRIMARY KEY, comment TEXT)")
            cursor.execute("INSERT INTO studio_feedback (comment) VALUES (?)", (feedback,))
            conn.commit()
        return "Thank you for your feedback!", None

# Configure logging
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
logger.info(f"MoviePy Version: {moviepy.__version__}")

def silence(duration, fps=44100):
    """
    Returns a silent AudioClip of the specified duration.
    """
    # AudioFileClip expects a file path; a silent clip built from a raw sample
    # array needs AudioArrayClip instead.
    from moviepy.audio.AudioClip import AudioArrayClip
    return AudioArrayClip(np.zeros((int(fps * duration), 2)), fps=fps)

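# Illustrative shape of transcribe_video()'s return value (example values only):
#   ([{"start": 0.0, "end": 3.5, "text": " Hello everyone."}, ...], "en")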
def transcribe_video(video_path):
    # Load the video file and extract audio
    video = VideoFileClip(video_path)
    audio_path = "audio.wav"
    video.audio.write_audiofile(audio_path)

    # Load Whisper model
    model = whisper.load_model("base")  # Options: tiny, base, small, medium, large

    # Transcribe with Whisper
    result = model.transcribe(audio_path, word_timestamps=True)

    # Extract timestamps and text
    transcript_with_timestamps = [
        {
            "start": segment["start"],
            "end": segment["end"],
            "text": segment["text"]
        }
        for segment in result["segments"]
    ]

    # Get the detected language
    detected_language = result["language"]
    logger.debug(f"Detected language:\n{detected_language}")
    return transcript_with_timestamps, detected_language

# Function to get the appropriate translation model based on source and target language
def get_translation_model(source_language, target_language):
    """
    Get the translation model based on the source and target language.

    Parameters:
    - source_language (str): The language of the input content (e.g., 'en').
    - target_language (str): The language to translate the content into (e.g., 'es', 'fr').

    Returns:
    - str: The translation model identifier.
    """
    # List of allowable languages
    allowable_languages = ["en", "es", "fr", "zh", "de", "it", "pt", "ja", "ko", "ru"]

    # Validate source and target languages
    if source_language not in allowable_languages:
        logger.debug(f"Invalid source language '{source_language}'. Supported languages are: {', '.join(allowable_languages)}")
        # Fall back to a default source language
        source_language = "en"  # Default to 'en'

    if target_language not in allowable_languages:
        logger.debug(f"Invalid target language '{target_language}'. Supported languages are: {', '.join(allowable_languages)}")
        # Fall back to a default target language
        target_language = "zh"  # Default to 'zh'

    if source_language == target_language:
        source_language = "en"  # Default to 'en'
        target_language = "zh"  # Default to 'zh'

    # Build the Helsinki-NLP OPUS-MT model identifier
    return f"Helsinki-NLP/opus-mt-{source_language}-{target_language}"

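# Example: get_translation_model("en", "zh") returns "Helsinki-NLP/opus-mt-en-zh",
# which translate_text() below loads via transformers.pipeline("translation", ...).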
def translate_text(transcription_json, source_language, target_language):
    # Load the translation model for the specified target language
    translation_model_id = get_translation_model(source_language, target_language)
    logger.debug(f"Translation model: {translation_model_id}")
    translator = pipeline("translation", model=translation_model_id)

    # Prepare output structure
    translated_json = []

    # Translate each sentence and store it with its start time
    for entry in transcription_json:
        original_text = entry["text"]
        translated_text = translator(original_text)[0]['translation_text']
        translated_json.append({
            "start": entry["start"],
            "original": original_text,
            "translated": translated_text,
            "end": entry["end"]
        })
        # Log the components being added to translated_json
        logger.debug("Adding to translated_json: start=%s, original=%s, translated=%s, end=%s",
                     entry["start"], original_text, translated_text, entry["end"])

    # Return the list of translated, timestamped segments
    return translated_json

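# Note: `edited_table` below arrives as a pandas DataFrame (the default output
# type of gr.Dataframe), hence the .iterrows() access by column name.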
def update_translations(file, edited_table, mode):
    """
    Update the translations based on user edits in the Gradio Dataframe.
    """
    output_video_path = "output_video.mp4"
    logger.debug(f"Editable Table: {edited_table}")

    try:
        start_time = time.time()  # Start the timer

        # Convert the edited table (pandas DataFrame) back to a list of dictionaries
        updated_translations = [
            {
                "start": row["start"],  # Access by column name
                "original": row["original"],
                "translated": row["translated"],
                "end": row["end"]
            }
            for _, row in edited_table.iterrows()
        ]

        # Call the function to process the video with updated translations
        add_transcript_voiceover(file.name, updated_translations, output_video_path, mode == "Transcription with Voiceover")

        # Calculate elapsed time
        elapsed_time = time.time() - start_time
        elapsed_time_display = f"Updates applied successfully in {elapsed_time:.2f} seconds."

        return output_video_path, elapsed_time_display

    except Exception as e:
        raise ValueError(f"Error updating translations: {e}")

def process_entry(entry, i, video_width, video_height, add_voiceover, target_language):
    logger.debug(f"Processing entry {i}: {entry}")

    # Create text clip for subtitles
    txt_clip = TextClip(
        text=entry["translated"],
        font="./NotoSansSC-Regular.ttf",
        method='caption',
        color='yellow',
        stroke_color='black',  # Border color
        stroke_width=2,  # Border thickness
        font_size=int(video_height // 20),
        size=(int(video_width * 0.8), None)
    ).with_start(entry["start"]).with_duration(entry["end"] - entry["start"]).with_position('bottom').with_opacity(0.8)

    audio_segment = None
    if add_voiceover:
        segment_audio_path = f"segment_{i}_voiceover.wav"
        generate_voiceover([entry], target_language, segment_audio_path)
        audio_clip = AudioFileClip(segment_audio_path)
        # Get and log all methods in AudioFileClip
        logger.info("Methods in AudioFileClip:")
        for method in dir(audio_clip):
            logger.info(method)
        desired_duration = entry["end"] - entry["start"]

        # Log duration of the audio clip and the desired duration for debugging.
        logger.debug(f"Audio clip duration: {audio_clip.duration}, Desired duration: {desired_duration}")

        if audio_clip.duration < desired_duration:
            # Pad with silence if audio is too short
            silence_duration = desired_duration - audio_clip.duration

            # Concatenate the original audio and silence
            audio_clip = concatenate_audioclips([audio_clip, silence(duration=silence_duration)])
            logger.info(f"Padded audio with {silence_duration} seconds of silence.")

        # Set the audio_segment to the required duration.
        audio_segment = audio_clip.with_start(entry["start"]).with_duration(desired_duration)

    return i, txt_clip, audio_segment

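# Note: the .with_start / .with_duration / .with_position / .with_audio calls
# used in process_entry and below follow the MoviePy 2.x clip API (MoviePy 1.x
# named these set_start / set_duration / ...), which matches the bare
# `from moviepy import ...` imports at the top of this file.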
def add_transcript_voiceover(video_path, translated_json, output_path, add_voiceover=False, target_language="en"):
    """
    Add transcript and voiceover to a video, segment by segment.
    """
    video = VideoFileClip(video_path)
    font_path = "./NotoSansSC-Regular.ttf"

    text_clips = []
    audio_segments = []

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(process_entry, entry, i, video.w, video.h, add_voiceover, target_language)
                   for i, entry in enumerate(translated_json)]

        # Collect results with original index i
        results = []
        for future in concurrent.futures.as_completed(futures):
            try:
                i, txt_clip, audio_segment = future.result()
                results.append((i, txt_clip, audio_segment))
            except Exception as e:
                logger.error(f"Error processing entry: {e}")

    # Sort by original index i
    results.sort(key=lambda x: x[0])

    # Extract sorted clips
    text_clips = [clip for i, clip, segment in results]

    final_video = CompositeVideoClip([video] + text_clips)

    logger.info("Methods in CompositeVideoClip:")
    for method in dir(final_video):
        logger.info(method)

    if add_voiceover:
        audio_segments = [segment for i, clip, segment in results if segment is not None]
        final_audio = CompositeAudioClip(audio_segments)  # Critical fix
        final_audio = final_audio.with_duration(video.duration)

        final_video = final_video.with_audio(final_audio)

    logger.info(f"Saving the final video to: {output_path}")
    final_video.write_videofile(output_path, codec="libx264", audio_codec="aac")

    logger.info("Video processing completed successfully.")

def generate_voiceover(translated_json, language, output_audio_path):
    """
    Generate voiceover from translated text for a given language.
    """
    # Concatenate translated text into a single string
    full_text = " ".join(entry["translated"] for entry in translated_json)

    try:
        tts = gTTS(text=full_text, lang=language)
        time.sleep(10)  # Add a delay of 10 seconds between requests
        tts.save(output_audio_path)
    except Exception as e:
        raise ValueError(f"Error generating voiceover: {e}")

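# Note on generate_voiceover(): gTTS expects Google Translate language codes,
# which do not always match the two-letter codes used elsewhere in this app
# (Chinese, for example, may need to be passed as "zh-CN" depending on the
# gTTS version), so a small mapping step may be needed for some languages.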
def upload_and_manage(file, target_language, mode="transcription"):
    if file is None:
        logger.info("No file uploaded. Please upload a video/audio file.")
        return None, [], None, "No file uploaded. Please upload a video/audio file."

    try:
        start_time = time.time()  # Start the timer
        logger.info(f"Started processing file: {file.name}")

        # Define paths for audio and output files
        audio_path = "audio.wav"
        output_video_path = "output_video.mp4"
        voiceover_path = "voiceover.wav"
        logger.info(f"Using audio path: {audio_path}, output video path: {output_video_path}, voiceover path: {voiceover_path}")

        # Step 1: Transcribe audio from uploaded media file and get timestamps
        logger.info("Transcribing audio...")
        transcription_json, source_language = transcribe_video(file.name)
        logger.info(f"Transcription completed. Detected source language: {source_language}")

        # Step 2: Translate the transcription
        logger.info(f"Translating transcription from {source_language} to {target_language}...")
        translated_json = translate_text(transcription_json, source_language, target_language)
        logger.info(f"Translation completed. Number of translated segments: {len(translated_json)}")

        # Step 3: Add transcript to video based on timestamps
        logger.info("Adding translated transcript to video...")
        add_transcript_voiceover(file.name, translated_json, output_video_path, mode == "Transcription with Voiceover", target_language)
        logger.info(f"Transcript added to video. Output video saved at {output_video_path}")

        # Convert translated JSON into a format for the editable table
        logger.info("Converting translated JSON into editable table format...")
        editable_table = [
            [float(entry["start"]), entry["original"], entry["translated"], float(entry["end"])]
            for entry in translated_json
        ]

        # Calculate elapsed time
        elapsed_time = time.time() - start_time
        elapsed_time_display = f"Processing completed in {elapsed_time:.2f} seconds."
        logger.info(f"Processing completed in {elapsed_time:.2f} seconds.")

        return translated_json, editable_table, output_video_path, elapsed_time_display

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        return None, [], None, f"An error occurred: {str(e)}"

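# The interface below wires three interactions: "Post and Process" runs
# upload_and_manage, "Save Changes" re-renders the video via update_translations,
# and "Submit Feedback" stores comments through handle_feedback.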
# Gradio interface
def build_interface():
    with gr.Blocks(css=css) as demo:
        gr.Markdown("## Video Localization")
        with gr.Row():
            with gr.Column(scale=4):
                file_input = gr.File(label="Upload Video/Audio File")
                language_input = gr.Dropdown(["en", "es", "fr", "zh"], label="Select Language")  # Language codes
                process_mode = gr.Radio(choices=["Transcription", "Transcription with Voiceover"], label="Choose Processing Type", value="Transcription")
                submit_button = gr.Button("Post and Process")
                editable_translations = gr.State(value=[])

            with gr.Column(scale=8):
                gr.Markdown("## Edit Translations")

                # Editable translation table
                editable_table = gr.Dataframe(
                    value=[],  # Default to an empty list to avoid undefined values
                    headers=["start", "original", "translated", "end"],
                    datatype=["number", "str", "str", "number"],
                    row_count=1,  # Initially empty
                    col_count=4,
                    interactive=[False, True, True, False],  # Control editability
                    label="Edit Translations",
                    wrap=True  # Enables text wrapping if supported
                )
                save_changes_button = gr.Button("Save Changes")
                processed_video_output = gr.File(label="Download Processed Video", interactive=True)  # Download button
                elapsed_time_display = gr.Textbox(label="Elapsed Time", lines=1, interactive=False)

            with gr.Column(scale=1):
                gr.Markdown("**Feedback**")
                feedback_input = gr.Textbox(
                    placeholder="Leave your feedback here...",
                    label=None,
                    lines=3,
                )
                feedback_btn = gr.Button("Submit Feedback")
                response_message = gr.Textbox(label=None, lines=1, interactive=False)
                db_download = gr.File(label="Download Database File", visible=False)

        # Link the feedback handling
        def feedback_submission(feedback):
            message, file_path = handle_feedback(feedback)
            if file_path:
                return message, gr.update(value=file_path, visible=True)
            return message, gr.update(visible=False)

        save_changes_button.click(
            update_translations,
            inputs=[file_input, editable_table, process_mode],
            outputs=[processed_video_output, elapsed_time_display]
        )

        submit_button.click(
            upload_and_manage,
            inputs=[file_input, language_input, process_mode],
            outputs=[editable_translations, editable_table, processed_video_output, elapsed_time_display]
        )

        # Connect the feedback button to the feedback handler
        feedback_btn.click(
            feedback_submission,
            inputs=[feedback_input],
            outputs=[response_message, db_download]
        )

    return demo

# Launch the Gradio interface
demo = build_interface()
demo.launch()
requirements.txt ADDED
@@ -0,0 +1,25 @@
openai-whisper
sentencepiece
SpeechRecognition
pydub
youtube_transcript_api
nltk
textblob
gradio
newspaper3k
transformers
sentence-transformers
openai
todoist-api-python
flask
twilio
fastapi
uvicorn
moviepy
ffmpy
google-cloud-storage
fpdf
markdown
nest_asyncio
reportlab
gtts
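To run app.py outside the Space, these dependencies would typically be installed with `pip install -r requirements.txt`; moviepy, pydub, and openai-whisper additionally expect an ffmpeg binary to be available on the system.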