Amy Roberts committed on
Commit
f2b92aa
·
1 Parent(s): 424ae15
Files changed (7) hide show
  1. .gitattributes +4 -0
  2. README.md +2 -2
  3. app.py +151 -0
  4. examples/bed.mp4 +3 -0
  5. examples/book.mp4 +3 -0
  6. examples/food.mp4 +3 -0
  7. requirements.txt +8 -0
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/ filter=lfs diff=lfs merge=lfs -text
37
+ examples/bed.mp4 filter=lfs diff=lfs merge=lfs -text
38
+ examples/book.mp4 filter=lfs diff=lfs merge=lfs -text
39
+ examples/food.mp4 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,6 +1,6 @@
1
  ---
2
- title: Tvp
3
- emoji: 🏢
4
  colorFrom: purple
5
  colorTo: pink
6
  sdk: gradio
 
1
  ---
2
+ title: TVP
3
+ emoji: ❓📽️
4
  colorFrom: purple
5
  colorTo: pink
6
  sdk: gradio
app.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import av
4
+ import cv2
5
+ import numpy as np
6
+ import torch
7
+ import gradio as gr
8
+ from transformers import AutoProcessor, TvpForVideoGrounding
9
+
10
+
11
+
12
def pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
    """
    Decode a window of frames from the first video stream of a PyAV container.

    Args:
        container (container): pyav container.
        sampling_rate (int): frame sampling rate (interval between two sampled frames).
        num_frames (int): number of frames to sample.
        clip_idx (int): index of the clip to select; it scales the start offset
            of the decoded window (``delta * clip_idx / num_clips``).
        num_clips (int): overall number of clips the video is split into.
        target_fps (int): the input video may have a different fps; the clip size
            is rescaled from the target fps to the stream's own fps.
    Returns:
        frames (list): decoded frames ordered by presentation timestamp (pts).
        fps (float): the average frame rate reported by the video stream.

    NOTE(review): there is no explicit handling for a container without a video
    stream; indexing ``streams.video[0]`` is expected to fail in that case.
    """
    stream = container.streams.video[0]
    fps = float(stream.average_rate)

    # Size of the clip expressed in frames of the source video.
    clip_size = sampling_rate * num_frames / target_fps * fps
    delta = max(num_frames - clip_size, 0)
    start_idx = delta * clip_idx / num_clips
    end_idx = start_idx + clip_size - 1

    # Convert frame indices into pts values for seeking and filtering.
    timebase = stream.duration / num_frames
    first_pts = int(start_idx * timebase)
    last_pts = int(end_idx * timebase)

    # Seek slightly before the window start (never below 0) so decoding can
    # begin from an earlier position in the stream.
    container.seek(max(first_pts - 1024, 0), any_frame=False, backward=True, stream=stream)

    by_pts = {}
    for frame in container.decode(video=0):
        pts = frame.pts
        if pts >= first_pts:
            by_pts[pts] = frame
            # The first frame past the window end is kept, then decoding stops.
            if pts > last_pts:
                break

    ordered = [by_pts[key] for key in sorted(by_pts)]
    return ordered, fps
50
+
51
+
52
def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
    """
    Decode the video and perform temporal sampling.

    Args:
        container (container): pyav container.
        sampling_rate (int): frame sampling rate (interval between two sampled frames).
        num_frames (int): number of frames to sample.
        clip_idx (int): index of the clip to select; must be >= -2.
        num_clips (int): overall number of clips to uniformly sample from the given video.
        target_fps (int): the input video may have different fps, convert it to
            the target video fps before frame sampling.
    Returns:
        frames (np.ndarray): ``num_frames`` sampled RGB frames stacked along a new
            leading axis and transposed with ``(0, 3, 1, 2)``.
            Presumably (num_frames, channels, height, width), assuming PyAV
            returns height x width x channel arrays - TODO confirm.
    Raises:
        ValueError: if ``clip_idx`` is smaller than -2, or if no frame could be
            decoded from the container.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if clip_idx < -2:
        raise ValueError("Not a valid clip_idx {}".format(clip_idx))

    frames, fps = pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps)
    if not frames:
        # Without this guard the index clamp below yields -1 and fails with an
        # unhelpful IndexError on an empty list.
        raise ValueError("No frames could be decoded from the video")

    clip_size = sampling_rate * num_frames / target_fps * fps
    # Evenly spaced positions across the clip, clamped to the frames that were
    # actually decoded (short videos repeat their last frame).
    index = np.linspace(0, clip_size - 1, num_frames)
    index = np.clip(index, 0, len(frames) - 1).astype(np.int64)
    frames = np.array([frames[idx].to_rgb().to_ndarray() for idx in index])
    return frames.transpose(0, 3, 1, 2)
76
+
77
+
78
def get_video_duration(filename):
    """
    Return the duration of a video file in seconds.

    The duration is computed as frame count divided by frame rate, both read
    from OpenCV's ``VideoCapture`` properties.

    Args:
        filename (str): path of the video file.
    Returns:
        duration (float): duration in seconds, or -1 if the file could not be
            opened or reports a non-positive frame rate.
    """
    cap = cv2.VideoCapture(filename)
    try:
        if not cap.isOpened():
            return -1
        rate = cap.get(cv2.CAP_PROP_FPS)
        frame_num = cap.get(cv2.CAP_PROP_FRAME_COUNT)
        if rate <= 0:
            # Avoid ZeroDivisionError; report failure with the same sentinel.
            return -1
        return frame_num / rate
    finally:
        # Always free the underlying capture handle.
        cap.release()
86
+
87
+
88
def predict_durations(model_checkpoint, text, video_filename, device="cpu"):
    """
    Predict the start and end time of the event described by ``text`` in a video.

    Args:
        model_checkpoint (str): checkpoint name passed to ``from_pretrained`` for
            both the model and the processor.
        text (str): description of the event to localize.
        video_filename (str): path of the video file.
        device (str or torch.device): device on which the model and its inputs
            are placed. Defaults to "cpu".
    Returns:
        str: a message of the form ``"start: {start}s, end: {end}s"``.
    Raises:
        ValueError: if the video duration cannot be determined.
    """
    print(f"Loading model: {model_checkpoint}")
    # The model must live on the same device as its inputs.
    model = TvpForVideoGrounding.from_pretrained(model_checkpoint).to(device)
    processor = AutoProcessor.from_pretrained(model_checkpoint)

    print(f"Loading video: {video_filename}")
    # Frames are converted to arrays inside `decode`, so the container can be
    # closed as soon as it returns.
    with av.open(video_filename, metadata_errors="ignore") as container:
        raw_sampled_frames = decode(
            container=container,
            sampling_rate=1,
            num_frames=model.config.num_frames,
            clip_idx=0,
            num_clips=1,
            target_fps=3,
        )

    print("Processing video and text")
    model_inputs = processor(
        text=[text], videos=list(raw_sampled_frames), return_tensors="pt", max_text_length=100
    ).to(device)

    print("Running inference")
    # Inference only: no gradients are needed.
    with torch.no_grad():
        output = model(**model_inputs)

    duration = get_video_duration(video_filename)
    if duration < 0:
        # get_video_duration signals failure with -1; scaling the prediction by
        # it would yield meaningless timestamps.
        raise ValueError(f"Could not determine the duration of video: {video_filename}")

    start, end = processor.post_process_video_grounding(output.logits, duration)
    return f"start: {start}s, end: {end}s"
111
+
112
+
113
# --- Configuration -----------------------------------------------------------
HF_TOKEN = os.environ.get("HF_TOKEN")
# NOTE(review): DEVICE is computed here, but the click handler below does not
# pass it, so predict_durations runs with its default device.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
MODELS = ["Intel/tvp-base", "Intel/tvp-base-ANet"]
# Each example pairs a video path with a text query.
EXAMPLES = [
    ["./examples/bed.mp4", "a person is sitting on a bed."],
    ["./examples/food.mp4", "a person eats some food."],
    ["./examples/book.mp4", "a person reads a book."],
]

# --- Components --------------------------------------------------------------
# Created up front so gr.Examples can reference them before they are rendered
# into the layout.
model_checkpoint = gr.Dropdown(MODELS, label="Model", value=MODELS[0], type="value")
video_in = gr.Video(label="Video File", elem_id="video_in")
text_in = gr.Textbox(label="Text", placeholder="Description of event in the video", interactive=True)
text_out = gr.Textbox(label="Prediction", placeholder="Predicted start and end time")


# --- Layout ------------------------------------------------------------------
title = "Video Grounding with TVP"
DESCRIPTION = """# Video Grounding with TVP"""
# NOTE(review): this stylesheet is defined but not passed to gr.Blocks below.
css = """.toast-wrap { display: none !important } """

with gr.Blocks(title=title) as demo:
    gr.Markdown(DESCRIPTION)

    # Model selection.
    with gr.Row():
        model_checkpoint.render()

    # Clickable examples that fill in the video and text inputs.
    with gr.Row():
        examples = gr.Examples(examples=EXAMPLES, inputs=[video_in, text_in])

    # Video on the left; query, trigger button and prediction on the right.
    with gr.Row():
        with gr.Column():
            video_in.render()

        with gr.Column():
            text_in.render()
            time_button = gr.Button("Get start and end time")
            time_button.click(
                predict_durations,
                inputs=[model_checkpoint, text_in, video_in],
                outputs=[text_out],
            )
            text_out.render()


if __name__ == "__main__":
    demo.launch(debug=True)
examples/bed.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43c45bc34c842328323fa568ea65c1ff6df9686e3a9d93f804bb4ab2b241d5c7
3
+ size 1156824
examples/book.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:72bb9a4e2843233591252da02790f57f4194bfe5f13278c4e74d00e77590d984
3
+ size 740098
examples/food.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:271940f652b2d51ce19f130731079e1a899eae1df15b1ecab0323c917962d539
3
+ size 4077619
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ av
2
+ huggingface_hub
3
+ gradio
4
+ numpy
5
+ transformers @ git+https://github.com/huggingface/transformers.git
6
+ torch
7
+ opencv-python
8
+ Pillow