shixian.shi committed · Commit 76934e1 · 1 Parent(s): 3c166d6 · update
Browse files
- README.md +19 -11
- app.py +137 -0
- examples/2022云栖大会_片段.mp4 +3 -0
- examples/2022云栖大会_片段2.mp4 +3 -0
- examples/guide.png +3 -0
- examples/为什么要多读书?这是我听过最好的答案-片段.mp4 +3 -0
- examples/使用chatgpt_片段.mp4 +3 -0
- examples/鲁肃采访片段1.wav +3 -0
- font/STHeitiMedium.ttc +3 -0
- subtitle_utils.py +105 -0
- trans_utils.py +57 -0
- videoclipper.py +146 -0
README.md
CHANGED
@@ -1,13 +1,21 @@
---
# See the detailed documentation at https://modelscope.cn/docs/%E5%88%9B%E7%A9%BA%E9%97%B4%E5%8D%A1%E7%89%87
domain:      # domain: cv/nlp/audio/multi-modal/AutoML
- audio
tags:        # custom tags
- ClipVideo
models:      # associated models
- damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
- damo/speech_timestamp_prediction-v1-16k-offline
deployspec:
  cpu: 6
  memory: 87000
  gpu: 1
  instance_type: ecs.gn6e-c12g1.3xlarge
  image_id: mshub-registry-vpc.cn-zhangjiakou.cr.aliyuncs.com/modelscope-repo/studio-service-gpu:funasr-clipvideo-gradio-3.29.0-2
license: Apache License 2.0
---

#### Clone with HTTP
```bash
git clone https://www.modelscope.cn/studios/damo/funasr_app_clipvideo.git
```
app.py
ADDED
@@ -0,0 +1,137 @@
import gradio as gr
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from videoclipper import VideoClipper


if __name__ == "__main__":
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
    )
    audio_clipper = VideoClipper(inference_pipeline)

    def audio_recog(audio_input):
        return audio_clipper.recog(audio_input)

    def audio_clip(dest_text, start_ost, end_ost, state):
        return audio_clipper.clip(dest_text, start_ost, end_ost, state)

    def video_recog(video_input):
        return audio_clipper.video_recog(video_input)

    def video_clip(dest_text, start_ost, end_ost, state):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state)

    def video_clip_addsub(dest_text, start_ost, end_ost, state, font_size, font_color):
        return audio_clipper.video_clip(dest_text, start_ost, end_ost, state, font_size, font_color, add_sub=True)

    '''
    top_md_1 = ("""
    基于达摩院自研Paraformer-长音频版的语音识别、端点检测、标点预测、时间戳功能

    准确识别,自由复制所需段落并一键裁剪、添加字幕

    * Step1: 上传视频文件(或使用下方的用例体验),点击 **<font color="#f7802b">识别</font>** 按钮
    * Step2: 复制识别结果中所需的文字至右上方,设置偏移与字幕配置(可选)
    * Step3: 点击 **<font color="#f7802b">裁剪</font>** 按钮或 **<font color="#f7802b">裁剪并添加字幕</font>** 按钮获得结果
    """)
    '''

    top_md_2 = ("""
    受到网络传输与服务资源的限制,用于体验的视频最好大小在40mb以下
    过大的视频可以尝试分离音轨使用音频剪辑,或 **<font color="#1785c4">通过源代码将您的ClipVideo服务部署在本地(推荐)</font>** :
    <div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
    FunASR_APP: <a href='https://github.com/alibaba/funasr-app'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
    🌟支持我们: <a href='https://github.com/alibaba/funasr-app/stargazers'><img src='https://img.shields.io/github/stars/alibaba/funasr-app.svg?style=social'></a>
    </div>
    </div>
    """)

    top_md_3 = ("""访问FunASR项目与论文能够帮助您深入了解ClipVideo中所使用的语音处理相关模型:
    <div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
    FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
    FunASR Paper: <a href="https://arxiv.org/abs/2305.11013"><img src="https://img.shields.io/badge/Arxiv-2305.11013-orange"></a>
    🌟Star FunASR: <a href='https://github.com/alibaba-damo-academy/FunASR/stargazers'><img src='https://img.shields.io/github/stars/alibaba-damo-academy/FunASR.svg?style=social'></a>
    </div>
    </div>
    """)

    # gradio interface
    with gr.Blocks() as demo:
        #gr.Image("./examples/guide.png", show_label=False)
        # gr.Markdown(top_md_1)
        #gr.Markdown(top_md_2)
        #gr.Markdown(top_md_3)
        video_state = gr.State()
        audio_state = gr.State()
        with gr.Tab("🎥✂️视频裁剪 Video Clipping"):
            with gr.Row():
                with gr.Column():
                    video_input = gr.Video(label="🎥视频输入 Video Input")
                    gr.Examples(['examples/2022云栖大会_片段2.mp4',
                                 'examples/2022云栖大会_片段.mp4',
                                 'examples/为什么要多读书?这是我听过最好的答案-片段.mp4',
                                 'examples/使用chatgpt_片段.mp4'],
                                [video_input])
                    recog_button2 = gr.Button("👂识别 Recognize")
                    video_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    video_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    video_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    with gr.Row():
                        video_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
                        video_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        font_size = gr.Slider(minimum=10, maximum=100, value=32, step=2, label="🔠字幕字体大小 Subtitle Font Size")
                        font_color = gr.Radio(["black", "white", "green", "red"], label="🌈字幕颜色 Subtitle Color", value='white')
                        # font = gr.Radio(["黑体", "Alibaba Sans"], label="字体 Font")
                    with gr.Row():
                        clip_button2 = gr.Button("✂️裁剪\nClip")
                        clip_button3 = gr.Button("✂️裁剪并添加字幕\nClip and Generate Subtitles")
                    video_output = gr.Video(label="🎥裁剪结果 Video Clipped")
                    video_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    video_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        with gr.Tab("🔊✂️音频裁剪 Audio Clipping"):
            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(label="🔊音频输入 Audio Input")
                    gr.Examples(['examples/鲁肃采访片段1.wav'], [audio_input])
                    recog_button1 = gr.Button("👂识别 Recognize")
                    audio_text_output = gr.Textbox(label="✏️识别结果 Recognition Result")
                    audio_srt_output = gr.Textbox(label="📖SRT字幕内容 SRT Subtitles")
                with gr.Column():
                    audio_text_input = gr.Textbox(label="✏️待裁剪文本 Text to Clip (多段文本使用'#'连接)")
                    with gr.Row():
                        audio_start_ost = gr.Slider(minimum=-500, maximum=1000, value=0, step=50, label="⏪开始位置偏移 Start Offset (ms)")
                        audio_end_ost = gr.Slider(minimum=-500, maximum=1000, value=100, step=50, label="⏩结束位置偏移 End Offset (ms)")
                    with gr.Row():
                        clip_button1 = gr.Button("✂️裁剪 Clip")
                    audio_output = gr.Audio(label="🔊裁剪结果 Audio Clipped")
                    audio_mess_output = gr.Textbox(label="ℹ️裁剪信息 Clipping Log")
                    audio_srt_clip_output = gr.Textbox(label="📖裁剪部分SRT字幕内容 Clipped SRT Subtitles")

        recog_button1.click(audio_recog,
                            inputs=audio_input,
                            outputs=[audio_text_output, audio_srt_output, audio_state])
        clip_button1.click(audio_clip,
                           inputs=[audio_text_input, audio_start_ost, audio_end_ost, audio_state],
                           outputs=[audio_output, audio_mess_output, audio_srt_clip_output])

        recog_button2.click(video_recog,
                            inputs=video_input,
                            outputs=[video_text_output, video_srt_output, video_state])
        clip_button2.click(video_clip,
                           inputs=[video_text_input, video_start_ost, video_end_ost, video_state],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])
        clip_button3.click(video_clip_addsub,
                           inputs=[video_text_input, video_start_ost, video_end_ost, video_state, font_size, font_color],
                           outputs=[video_output, video_mess_output, video_srt_clip_output])

    # start gradio service in local
    demo.queue(concurrency_count=3).launch()
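The last line above queues requests and starts the Gradio service locally. When deploying from source as the README recommends, Gradio's standard launch options can bind the service to a LAN address or open a temporary public link; the values below are illustrative and not part of this commit.

```python
# Alternative launch calls (standard Gradio 3.x options; values are examples only):
demo.queue(concurrency_count=3).launch(server_name="0.0.0.0", server_port=7860)  # serve on the local network
# demo.queue(concurrency_count=3).launch(share=True)                             # temporary public share link
```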
examples/2022云栖大会_片段.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c4d171002592d4b87917349d75c9c4fdfd99e4186f8aa71420d6d98acbeb31ca
size 36077615
examples/2022云栖大会_片段2.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5e09b542f45373db284228cdf1507e28d926b4a2ab4697a20e1dca04d5ad0c88
size 6715481
examples/guide.png
ADDED
Git LFS Details
examples/为什么要多读书?这是我听过最好的答案-片段.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3901cea588badd3ebc502428d983c3d1413edf5159db134ec82e92101535989f
size 46912031
examples/使用chatgpt_片段.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3b7c448b49a96c239d19a0e7e5c14a7a8f558231f700e93722cb0b3da2991cfb
size 12033342
examples/鲁肃采访片段1.wav
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5b57d8e54fd1a6d56dd16c969ac89b6570e8f73eedc99b12a18a8112a13377d0
size 6086432
font/STHeitiMedium.ttc
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f8fa4a63e2cf500e98e64d4c73260daaba049306cf85dec9e3729bc285b7d645
size 55754164
subtitle_utils.py
ADDED
@@ -0,0 +1,105 @@
def time_convert(ms):
    ms = int(ms)
    tail = ms % 1000
    s = ms // 1000
    mi = s // 60
    s = s % 60
    h = mi // 60
    mi = mi % 60
    h = "00" if h == 0 else str(h)
    mi = "00" if mi == 0 else str(mi)
    s = "00" if s == 0 else str(s)
    tail = str(tail)
    if len(h) == 1: h = '0' + h
    if len(mi) == 1: mi = '0' + mi
    if len(s) == 1: s = '0' + s
    return "{}:{}:{},{}".format(h, mi, s, tail)


class Text2SRT():
    def __init__(self, text_seg, ts_list, offset=0):
        self.token_list = [i for i in text_seg.split() if len(i)]
        self.ts_list = ts_list
        start, end = ts_list[0][0] - offset, ts_list[-1][1] - offset
        self.start_sec, self.end_sec = start, end
        self.start_time = time_convert(start)
        self.end_time = time_convert(end)
    def text(self):
        res = ""
        for word in self.token_list:
            if '\u4e00' <= word <= '\u9fff':
                res += word
            else:
                res += " " + word
        return res
    def len(self):
        return len(self.token_list)
    def srt(self):
        return "{} --> {}\n{}\n".format(self.start_time, self.end_time, self.text())
    def time(self):
        return (self.start_sec/1000, self.end_sec/1000)


def generate_srt(sentence_list):
    srt_total = ''
    for i, d in enumerate(sentence_list):
        t2s = Text2SRT(d['text_seg'], d['ts_list'])
        srt_total += "{}\n{}".format(i, t2s.srt())
    return srt_total

def generate_srt_clip(sentence_list, start, end, begin_index=0):
    start, end = int(start * 1000), int(end * 1000)
    srt_total = ''
    cc = 1 + begin_index
    subs = []
    for i, d in enumerate(sentence_list):
        if d['ts_list'][-1][1] <= start:
            continue
        if d['ts_list'][0][0] >= end:
            break
        # parts in between
        if (d['ts_list'][-1][1] < end and d['ts_list'][0][0] > start) or (d['ts_list'][-1][1] == end and d['ts_list'][0][0] == start):
            t2s = Text2SRT(d['text_seg'], d['ts_list'], offset=start)
            srt_total += "{}\n{}".format(cc, t2s.srt())
            subs.append((t2s.time(), t2s.text()))
            cc += 1
            continue
        if d['ts_list'][0][0] <= start:
            if not d['ts_list'][-1][1] > end:
                for j, ts in enumerate(d['ts_list']):
                    if ts[1] > start:
                        break
                _text = " ".join(d['text_seg'].split()[j:])
                _ts = d['ts_list'][j:]
            else:
                for j, ts in enumerate(d['ts_list']):
                    if ts[1] > start:
                        _start = j
                        break
                for j, ts in enumerate(d['ts_list']):
                    if ts[1] > end:
                        _end = j
                        break
                _text = " ".join(d['text_seg'].split()[_start:_end])
                _ts = d['ts_list'][_start:_end]
            if len(ts):
                t2s = Text2SRT(_text, _ts, offset=start)
                srt_total += "{}\n{}".format(cc, t2s.srt())
                subs.append((t2s.time(), t2s.text()))
                cc += 1
            continue
        if d['ts_list'][-1][1] > end:
            for j, ts in enumerate(d['ts_list']):
                if ts[1] > end:
                    break
            _text = " ".join(d['text_seg'].split()[:j])
            _ts = d['ts_list'][:j]
            if len(_ts):
                t2s = Text2SRT(_text, _ts, offset=start)
                srt_total += "{}\n{}".format(cc, t2s.srt())
                subs.append(
                    (t2s.time(), t2s.text())
                )
                cc += 1
            continue
    return srt_total, subs, cc
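For readers unfamiliar with the expected input, here is a small illustrative sketch (not part of this commit) of the data format these helpers consume: `text_seg` is a space-separated token string and `ts_list` holds per-token `[start, end]` times in milliseconds, matching how videoclipper.recog passes `rec_result['sentences']` straight into `generate_srt`.

```python
# Illustrative only: hand-made input in the format generate_srt() expects.
from subtitle_utils import time_convert, generate_srt

print(time_convert(83123))   # -> "00:01:23,123"

sentences = [
    {'text_seg': '欢 迎 使 用 Clip Video',
     'ts_list': [[0, 250], [250, 500], [500, 750], [750, 1000], [1000, 1400], [1400, 1800]]},
]
print(generate_srt(sentences))
# 0
# 00:00:00,0 --> 00:00:01,800
# 欢迎使用 Clip Video
```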
trans_utils.py
ADDED
@@ -0,0 +1,57 @@
PUNC_LIST = [',', '。', '!', '?', '、']


def pre_proc(text):
    res = ''
    for i in range(len(text)):
        if text[i] in PUNC_LIST:
            continue
        if '\u4e00' <= text[i] <= '\u9fff':
            if len(res) and res[-1] != " ":
                res += ' ' + text[i]+' '
            else:
                res += text[i]+' '
        else:
            res += text[i]
    if res[-1] == ' ':
        res = res[:-1]
    return res

def proc(raw_text, timestamp, dest_text):
    # simple matching
    ld = len(dest_text.split())
    mi, ts = [], []
    offset = 0
    while True:
        fi = raw_text.find(dest_text, offset, len(raw_text))
        # import pdb; pdb.set_trace()
        ti = raw_text[:fi].count(' ')
        if fi == -1:
            break
        offset = fi + ld
        mi.append(fi)
        ts.append([timestamp[ti][0]*16, timestamp[ti+ld-1][1]*16])
    # import pdb; pdb.set_trace()
    return ts


def write_state(output_dir, state):
    for key in ['/recog_res_raw', '/timestamp', '/sentences']:
        with open(output_dir+key, 'w') as fout:
            fout.write(str(state[key[1:]]))


def load_state(output_dir):
    state = {}
    with open(output_dir+'/recog_res_raw') as fin:
        line = fin.read()
        state['recog_res_raw'] = line
    with open(output_dir+'/timestamp') as fin:
        line = fin.read()
        state['timestamp'] = eval(line)
    with open(output_dir+'/sentences') as fin:
        line = fin.read()
        state['sentences'] = eval(line)
    return state
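A short illustrative sketch (not part of this commit) of how the clipper combines these helpers: pre_proc spaces out Chinese characters and strips punctuation so the target text aligns token-for-token with the ASR output, and proc returns the matched spans scaled from milliseconds to 16 kHz sample indices (hence the `*16`).

```python
# Illustrative only: matching a target phrase against a token-level recognition result.
from trans_utils import pre_proc, proc

raw_text = '我 们 一 起 学 习'                        # space-separated recognition result
timestamp = [[0, 200], [200, 400], [400, 600],
             [600, 800], [800, 1000], [1000, 1200]]   # per-token [start, end] in milliseconds
dest = pre_proc('一起')                                # -> '一 起'
print(proc(raw_text, timestamp, dest))                 # -> [[6400, 12800]]
```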
videoclipper.py
ADDED
@@ -0,0 +1,146 @@
import sys
import copy
import librosa
import logging
import argparse
import numpy as np
import soundfile as sf
import moviepy.editor as mpy
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from subtitle_utils import generate_srt, generate_srt_clip
from trans_utils import pre_proc, proc, write_state, load_state

from moviepy.editor import *
from moviepy.video.tools.subtitles import SubtitlesClip


class VideoClipper():
    def __init__(self, asr_pipeline):
        logging.warning("Initializing VideoClipper.")
        self.asr_pipeline = asr_pipeline

    def recog(self, audio_input, state=None):
        if state is None:
            state = {}
        state['audio_input'] = audio_input
        _, data = audio_input
        data = data.astype(np.float64)
        rec_result = self.asr_pipeline(audio_in=data)
        state['recog_res_raw'] = rec_result['text_postprocessed']
        state['timestamp'] = rec_result['time_stamp']
        state['sentences'] = rec_result['sentences']
        res_text = rec_result['text']
        res_srt = generate_srt(rec_result['sentences'])
        return res_text, res_srt, state

    def clip(self, dest_text, start_ost, end_ost, state):
        # get from state
        audio_input = state['audio_input']
        recog_res_raw = state['recog_res_raw']
        timestamp = state['timestamp']
        sentences = state['sentences']
        sr, data = audio_input
        data = data.astype(np.float64)

        all_ts = []
        for _dest_text in dest_text.split('#'):
            _dest_text = pre_proc(_dest_text)
            ts = proc(recog_res_raw, timestamp, _dest_text)
            for _ts in ts: all_ts.append(_ts)
        ts = all_ts
        srt_index = 0
        clip_srt = ""
        if len(ts):
            start, end = ts[0]
            start = min(max(0, start+start_ost*16), len(data))
            end = min(max(0, end+end_ost*16), len(data))
            res_audio = data[start:end]
            start_end_info = "from {} to {}".format(start/16000, end/16000)
            srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index)
            clip_srt += srt_clip
            for _ts in ts[1:]:  # multiple sentence input or multiple output matched
                start, end = _ts
                start = min(max(0, start+start_ost*16), len(data))
                end = min(max(0, end+end_ost*16), len(data))
                start_end_info += ", from {} to {}".format(start, end)
                res_audio = np.concatenate([res_audio, data[start+start_ost*16:end+end_ost*16]], -1)
                srt_clip, _, srt_index = generate_srt_clip(sentences, start/16000.0, end/16000.0, begin_index=srt_index-1)
                clip_srt += srt_clip
        if len(ts):
            message = "{} periods found in the speech: ".format(len(ts)) + start_end_info
        else:
            message = "No period found in the speech, return raw speech. You may check the recognition result and try other destination text."
        return (sr, res_audio), message, clip_srt

    def video_recog(self, vedio_filename):
        vedio_filename = vedio_filename
        clip_video_file = vedio_filename[:-4] + '_clip.mp4'
        video = mpy.VideoFileClip(vedio_filename)
        audio_file = vedio_filename[:-3] + 'wav'
        video.audio.write_audiofile(audio_file)
        wav = librosa.load(audio_file, 16000)[0]
        state = {
            'vedio_filename': vedio_filename,
            'clip_video_file': clip_video_file,
            'video': video,
        }
        # res_text, res_srt = self.recog((16000, wav), state)
        return self.recog((16000, wav), state)

    def video_clip(self, dest_text, start_ost, end_ost, state, font_size=32, font_color='white', add_sub=False):
        # get from state
        recog_res_raw = state['recog_res_raw']
        timestamp = state['timestamp']
        sentences = state['sentences']
        video = state['video']
        clip_video_file = state['clip_video_file']
        vedio_filename = state['vedio_filename']

        all_ts = []
        srt_index = 0
        for _dest_text in dest_text.split('#'):
            _dest_text = pre_proc(_dest_text)
            ts = proc(recog_res_raw, timestamp, _dest_text)
            for _ts in ts: all_ts.append(_ts)
        ts = all_ts
        clip_srt = ""
        if len(ts):
            start, end = ts[0][0] / 16000, ts[0][1] / 16000
            start, end = start+start_ost/1000.0, end+end_ost/1000.0
            video_clip = video.subclip(start, end)
            clip_video_file = clip_video_file
            start_end_info = "from {} to {}".format(start, end)
            # message = "{} periods found in the audio: from {} to {}.".format(len(ts), start, end)
            srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index)
            clip_srt += srt_clip
            if add_sub:
                generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
                subtitles = SubtitlesClip(subs, generator)
                video_clip = CompositeVideoClip([video_clip, subtitles.set_pos(('center','bottom'))])
            concate_clip = [video_clip]
            for _ts in ts[1:]:
                start, end = _ts[0] / 16000, _ts[1] / 16000
                start, end = start+start_ost/1000.0, end+end_ost/1000.0
                _video_clip = video.subclip(start, end)
                clip_video_file = clip_video_file
                start_end_info += ", from {} to {}".format(start, end)
                srt_clip, subs, srt_index = generate_srt_clip(sentences, start, end, begin_index=srt_index-1)
                clip_srt += srt_clip
                if add_sub:
                    generator = lambda txt: TextClip(txt, font='./font/STHeitiMedium.ttc', fontsize=font_size, color=font_color)
                    subtitles = SubtitlesClip(subs, generator)
                    _video_clip = CompositeVideoClip([_video_clip, subtitles.set_pos(('center','bottom'))])
                concate_clip.append(copy.copy(_video_clip))
            message = "{} periods found in the audio: ".format(len(ts)) + start_end_info
            logging.warning("Concating...")
            if len(concate_clip) > 1:
                video_clip = concatenate_videoclips(concate_clip)
            video_clip.write_videofile(clip_video_file)
        else:
            clip_video_file = vedio_filename
            message = "No period found in the audio, return raw speech. You may check the recognition result and try other destination text."
            srt_clip = ''
        return clip_video_file, message, clip_srt
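Putting the pieces together, here is a minimal local usage sketch (not part of this commit) that mirrors what app.py wires into the Gradio buttons; the video path and target text are placeholders, while the model IDs are the ones used above.

```python
# Minimal sketch: recognize a video, then cut out the passage containing the target text.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from videoclipper import VideoClipper

asr = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
    punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
)
clipper = VideoClipper(asr)

text, srt, state = clipper.video_recog('my_video.mp4')   # placeholder path: extracts audio, runs ASR, builds SRT
print(text)                                              # copy the passage you want to keep from this output
clip_file, message, clip_srt = clipper.video_clip('要保留的那句话', 0, 100, state, add_sub=True)  # placeholder target text
print(message, clip_file)
```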